tensor_stream 0.7.0 → 0.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.rubocop.yml +6 -1
- data/CHANGELOG.md +10 -0
- data/README.md +35 -0
- data/lib/tensor_stream.rb +2 -2
- data/lib/tensor_stream/debugging/debugging.rb +2 -1
- data/lib/tensor_stream/dynamic_stitch.rb +23 -24
- data/lib/tensor_stream/evaluator/base_evaluator.rb +27 -18
- data/lib/tensor_stream/evaluator/opencl/kernels/apply_momentum.cl +16 -0
- data/lib/tensor_stream/evaluator/opencl/kernels/pack.cl +24 -0
- data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross.cl +6 -1
- data/lib/tensor_stream/evaluator/opencl/opencl_buffer.rb +6 -6
- data/lib/tensor_stream/evaluator/opencl/opencl_evaluator.rb +237 -107
- data/lib/tensor_stream/evaluator/operation_helpers/array_ops_helper.rb +97 -7
- data/lib/tensor_stream/evaluator/ruby_evaluator.rb +230 -123
- data/lib/tensor_stream/exceptions.rb +1 -0
- data/lib/tensor_stream/graph_builder.rb +2 -3
- data/lib/tensor_stream/graph_deserializers/protobuf.rb +22 -23
- data/lib/tensor_stream/graph_serializers/graphml.rb +26 -29
- data/lib/tensor_stream/graph_serializers/pbtext.rb +22 -19
- data/lib/tensor_stream/helpers/string_helper.rb +4 -5
- data/lib/tensor_stream/math_gradients.rb +141 -77
- data/lib/tensor_stream/nn/nn_ops.rb +4 -6
- data/lib/tensor_stream/operation.rb +139 -120
- data/lib/tensor_stream/ops.rb +36 -3
- data/lib/tensor_stream/session.rb +7 -11
- data/lib/tensor_stream/tensor.rb +3 -3
- data/lib/tensor_stream/tensor_shape.rb +5 -0
- data/lib/tensor_stream/train/gradient_descent_optimizer.rb +4 -37
- data/lib/tensor_stream/train/momentum_optimizer.rb +48 -0
- data/lib/tensor_stream/train/optimizer.rb +129 -0
- data/lib/tensor_stream/train/saver.rb +0 -1
- data/lib/tensor_stream/train/slot_creator.rb +62 -0
- data/lib/tensor_stream/train/utils.rb +11 -12
- data/lib/tensor_stream/trainer.rb +3 -0
- data/lib/tensor_stream/utils.rb +18 -11
- data/lib/tensor_stream/variable.rb +19 -12
- data/lib/tensor_stream/variable_scope.rb +1 -1
- data/lib/tensor_stream/version.rb +1 -1
- data/samples/iris.rb +2 -1
- data/samples/linear_regression.rb +3 -1
- data/samples/nearest_neighbor.rb +2 -0
- data/test_samples/neural_network_raw.py +101 -0
- data/test_samples/raw_neural_net_sample.rb +6 -4
- data/test_samples/test2.py +73 -27
- metadata +9 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: f84c2b9852fcf4931c47c0130b67497a50a87b0f
|
4
|
+
data.tar.gz: 524e1105da4e06e3472cbcfa0e6f764ae4512d37
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 420e2675ab67d4c8462534bdf8c703671656f7852d984579e22ee57f1425dd5740fcb64a1e52363bf337cd7c691d87a75c76bd868b13c8a7f06d78e0eb00aa73
|
7
|
+
data.tar.gz: 24fe1022741883d46cdd5af51309da33d421d72874f0cc84bf2e0ed14a62602f1830c6060bd86e42359b7962b4a57727c9a48ce13d5950d5ba02f6a9cdfd719f
|
data/.rubocop.yml
CHANGED
@@ -6,6 +6,10 @@ AllCops:
|
|
6
6
|
- tensor_stream.gemspec
|
7
7
|
- Rakefile
|
8
8
|
|
9
|
+
Naming/AccessorMethodName:
|
10
|
+
Exclude:
|
11
|
+
- lib/tensor_stream/utils.rb
|
12
|
+
|
9
13
|
Style/StringLiterals:
|
10
14
|
Enabled: false
|
11
15
|
|
@@ -81,4 +85,5 @@ Style/TrailingCommaInHashLiteral:
|
|
81
85
|
|
82
86
|
Naming/UncommunicativeMethodParamName:
|
83
87
|
Exclude:
|
84
|
-
- lib/tensor_stream/evaluator/ruby_evaluator.rb
|
88
|
+
- lib/tensor_stream/evaluator/ruby_evaluator.rb
|
89
|
+
- lib/tensor_stream/ops.rb
|
data/CHANGELOG.md
CHANGED
@@ -4,6 +4,16 @@ All notable changes to this project will be documented in this file.
|
|
4
4
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
|
5
5
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
6
6
|
|
7
|
+
## [0.8.0] - 2018-08-29
|
8
|
+
### Added
|
9
|
+
- [TRAINING] Added new supported optimizer, MomentumOptimizer loosely based on tensorflow's implementation (with nesterov support)
|
10
|
+
- [NEW OP] fill, stack, atan, cumprod, gather, invert_permutation, setdiff1d
|
11
|
+
|
12
|
+
### Fixes
|
13
|
+
- Fixed device delegator where it does not pick the correct evaluator to use in some cases
|
14
|
+
- [GRADIENTS] Properly implement gradient computation for prod, tile, transpose
|
15
|
+
- Fixed gradient computation for softmax_cross_entropy_with_logits_v2 (now based on tensorflow's implementation)
|
16
|
+
|
7
17
|
## [0.7.0] - 2018-08-08
|
8
18
|
### Added
|
9
19
|
- [NEW OP] expand_dims, min, acos, asin, add_n
|
data/README.md
CHANGED
@@ -324,6 +324,41 @@ result = a + b
|
|
324
324
|
File.write("model.pbtext", result.graph.as_graph_def)
|
325
325
|
```
|
326
326
|
|
327
|
+
## Performance notes
|
328
|
+
|
329
|
+
Comparative benchmarks against other ruby libraries have not yet been performed. However, it is
|
330
|
+
notable that TruffleRuby and ruby-2.6.0-preview2 with the --jit flag perform considerably better with respect
|
331
|
+
to previous versions of ruby (< 2.6).
|
332
|
+
|
333
|
+
Benchmarks running samples/linear_regression.rb on an Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz
|
334
|
+
|
335
|
+
ruby 2.4
|
336
|
+
|
337
|
+
```
|
338
|
+
$ ruby -v
|
339
|
+
ruby 2.4.0p0 (2016-12-24 revision 57164) [x86_64-linux]
|
340
|
+
$ ruby samples/linear_regression.rb
|
341
|
+
495 seconds 1000 epochs
|
342
|
+
```
|
343
|
+
|
344
|
+
ruby 2.6.0-preview2
|
345
|
+
|
346
|
+
```
|
347
|
+
$ ruby -v
|
348
|
+
ruby 2.6.0preview2 (2018-05-31 trunk 63539) [x86_64-linux]
|
349
|
+
$ ruby --jit samples/linear_regression.rb
|
350
|
+
394 seconds 10000 epochs
|
351
|
+
```
|
352
|
+
|
353
|
+
truffleruby
|
354
|
+
```
|
355
|
+
$ ruby -v
|
356
|
+
truffleruby 1.0.0-rc5, like ruby 2.4.4, GraalVM CE Native [x86_64-linux]
|
357
|
+
219 seconds 10000 epochs
|
358
|
+
```
|
359
|
+
|
360
|
+
For training large networks that work on images, the opencl evaluator is the only way to go.
|
361
|
+
|
327
362
|
## Roadmap
|
328
363
|
|
329
364
|
- Docs
|
data/lib/tensor_stream.rb
CHANGED
@@ -20,8 +20,6 @@ require 'tensor_stream/operation'
|
|
20
20
|
require 'tensor_stream/placeholder'
|
21
21
|
require 'tensor_stream/control_flow'
|
22
22
|
require 'tensor_stream/dynamic_stitch'
|
23
|
-
require 'tensor_stream/train/utils'
|
24
|
-
require 'tensor_stream/trainer'
|
25
23
|
require 'tensor_stream/nn/nn_ops'
|
26
24
|
require 'tensor_stream/evaluator/evaluator'
|
27
25
|
require 'tensor_stream/graph_serializers/serializer'
|
@@ -31,6 +29,8 @@ require 'tensor_stream/graph_serializers/graphml'
|
|
31
29
|
require 'tensor_stream/math_gradients'
|
32
30
|
require "tensor_stream/debugging/debugging"
|
33
31
|
require 'tensor_stream/utils'
|
32
|
+
require 'tensor_stream/train/utils'
|
33
|
+
require 'tensor_stream/trainer'
|
34
34
|
|
35
35
|
# require 'tensor_stream/libraries/layers'
|
36
36
|
require 'tensor_stream/monkey_patches/integer'
|
@@ -9,8 +9,9 @@ module TensorStream
|
|
9
9
|
nodes_to_process.each do |node|
|
10
10
|
node.inputs = node.inputs.collect do |input|
|
11
11
|
next if input.nil?
|
12
|
+
next input if input.is_a?(Variable)
|
12
13
|
|
13
|
-
if TensorStream::Ops::FLOATING_POINT_TYPES.include?(input.data_type)
|
14
|
+
if input.is_a?(Tensor) && TensorStream::Ops::FLOATING_POINT_TYPES.include?(input.data_type)
|
14
15
|
TensorStream.check_numerics(input, "#{node.name}/#{input.name}", name: "check/#{node.name}/#{input.name}" )
|
15
16
|
else
|
16
17
|
input
|
@@ -1,28 +1,27 @@
|
|
1
1
|
module TensorStream
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
def initialize(flow_type, inputs, ops = nil, options = {})
|
7
|
-
setup_initial_state(options)
|
8
|
-
|
9
|
-
@operation = :"flow_#{flow_type}"
|
10
|
-
@inputs = inputs
|
2
|
+
# Defines a TensorStream controlflow op
|
3
|
+
class DynamicStitch < Operation
|
4
|
+
attr_accessor :ops
|
11
5
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
6
|
+
def initialize(flow_type, inputs, ops = nil, options = {})
|
7
|
+
setup_initial_state(options)
|
8
|
+
|
9
|
+
@operation = :"flow_#{flow_type}"
|
10
|
+
@inputs = inputs
|
11
|
+
|
12
|
+
@data_type = Tensor.detect_type(inputs[1])
|
13
|
+
@name = [@graph.get_name_scope, options[:name] || set_name].compact.join('/')
|
14
|
+
@ops = ops
|
15
|
+
@shape = TensorShape.new([inputs.size])
|
16
|
+
@graph.add_node(self)
|
17
|
+
end
|
18
|
+
|
19
|
+
def set_data_type(_passed_data_type)
|
20
|
+
:unknown
|
21
|
+
end
|
22
|
+
|
23
|
+
def run
|
24
|
+
eval
|
26
25
|
end
|
27
26
|
end
|
28
|
-
|
27
|
+
end
|
@@ -2,13 +2,14 @@ module TensorStream
|
|
2
2
|
# Evaluator base module
|
3
3
|
module Evaluator
|
4
4
|
class OutputGroup
|
5
|
-
attr_accessor :outputs
|
6
|
-
def initialize(outputs = [])
|
5
|
+
attr_accessor :outputs, :data_types
|
6
|
+
def initialize(outputs = [], data_types = [])
|
7
7
|
@outputs = outputs
|
8
|
+
@data_types = data_types
|
8
9
|
end
|
9
10
|
end
|
10
11
|
|
11
|
-
class UnsupportedOp <
|
12
|
+
class UnsupportedOp < RuntimeError
|
12
13
|
def initialize(tensor)
|
13
14
|
@tensor = tensor
|
14
15
|
end
|
@@ -111,22 +112,13 @@ module TensorStream
|
|
111
112
|
|
112
113
|
resolved_inputs = tensor.inputs.map do |i|
|
113
114
|
next if i.nil?
|
115
|
+
next i if op_options[:noop]
|
114
116
|
|
115
117
|
if i.is_a?(Array)
|
116
|
-
next i.collect { |sub_item| sub_item.is_a?(Tensor) ?
|
118
|
+
next i.collect { |sub_item| sub_item.is_a?(Tensor) ? global_eval(tensor, sub_item, execution_context) : sub_item }
|
117
119
|
end
|
118
120
|
|
119
|
-
|
120
|
-
cache_key = "#{tensor.graph.object_id}_#{i.name}:#{object_id}"
|
121
|
-
next @context[:_cache][cache_key] if @context[:_cache].key?(cache_key)
|
122
|
-
|
123
|
-
result = @session.delegate_to_evaluator(i, @context, execution_context)
|
124
|
-
convert_from_buffer(i, result).tap do |buffer|
|
125
|
-
@context[:_cache][cache_key] = buffer if i.is_const
|
126
|
-
end
|
127
|
-
else
|
128
|
-
prepare_input(i, execution_context, op_options)
|
129
|
-
end
|
121
|
+
global_eval(tensor, i, execution_context, op_options)
|
130
122
|
end
|
131
123
|
|
132
124
|
instance_exec(execution_context, tensor, resolved_inputs, &op[:block])
|
@@ -134,6 +126,23 @@ module TensorStream
|
|
134
126
|
|
135
127
|
protected
|
136
128
|
|
129
|
+
def global_eval(tensor, input, execution_context, op_options = {})
|
130
|
+
return nil unless input
|
131
|
+
return input unless input.is_a?(Tensor)
|
132
|
+
|
133
|
+
if object_id != @context[:_cache][:placement][input.name][1].object_id # tensor is on another device or evaluator
|
134
|
+
cache_key = "#{tensor.graph.object_id}_#{input.name}:#{object_id}"
|
135
|
+
return @context[:_cache][cache_key] if @context[:_cache].key?(cache_key)
|
136
|
+
|
137
|
+
result = @session.delegate_to_evaluator(input, @context, execution_context)
|
138
|
+
convert_from_buffer(input, result).tap do |buffer|
|
139
|
+
@context[:_cache][cache_key] = buffer if input.is_const
|
140
|
+
end
|
141
|
+
else
|
142
|
+
prepare_input(input, execution_context, op_options)
|
143
|
+
end
|
144
|
+
end
|
145
|
+
|
137
146
|
def get_broadcast_gradient_args(input_a, input_b)
|
138
147
|
return [[], []] if input_a == input_b
|
139
148
|
|
@@ -153,16 +162,16 @@ module TensorStream
|
|
153
162
|
end
|
154
163
|
end
|
155
164
|
|
156
|
-
|
165
|
+
[input_a_args.reverse, input_b_args.reverse]
|
157
166
|
end
|
158
167
|
|
159
168
|
##
|
160
169
|
# converts from a ruby Buffer object to the evaluator's native buffer format
|
161
|
-
def convert_from_buffer(
|
170
|
+
def convert_from_buffer(_tensor, _result)
|
162
171
|
raise "need implementation"
|
163
172
|
end
|
164
173
|
|
165
|
-
def prepare_input(
|
174
|
+
def prepare_input(_tensor, _context, _options = {})
|
166
175
|
raise "need implementation"
|
167
176
|
end
|
168
177
|
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
|
2
|
+
// same dimension add floating point op
|
3
|
+
__kernel void apply_momentum_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *grad, __global const <%= c_dtype %> *learning_rate,
|
4
|
+
__global const <%= c_dtype %> *momentum, __global <%= c_dtype %> *output, __global <%= c_dtype %> *acc) {
|
5
|
+
// Get the index of the current element to be processed
|
6
|
+
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
7
|
+
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
8
|
+
const int index = globalRow * N + globalCol;
|
9
|
+
<%= c_dtype %> acc_m = acc[index];
|
10
|
+
acc[index] = acc_m * momentum[0] + grad[index];
|
11
|
+
<% if nesterov %>
|
12
|
+
output[index] -= grad[index] * learning_rate[0] + acc_m * momentum[0] * learning_rate[0];
|
13
|
+
<% else %>
|
14
|
+
output[index] -= acc_m * learning_rate[0];
|
15
|
+
<% end %>
|
16
|
+
}
|
@@ -0,0 +1,24 @@
|
|
1
|
+
% ctype = dtype_to_c_type(data_type)
|
2
|
+
|
3
|
+
__kernel void pack(const int N, const int index, __global const <%= ctype %> *A, __global <%= ctype %> *C) {
|
4
|
+
// Get the index of the current element to be processed
|
5
|
+
const int globalCol = get_global_id(0); // Col ID of C (0..N)
|
6
|
+
|
7
|
+
int start = index * <%= divisors[0] %>;
|
8
|
+
int ptr = start + globalCol;
|
9
|
+
int index_map[<%= divisors.size %>] = { <%= Array.new(divisors.size) { 0 }.join(', ') %> };
|
10
|
+
|
11
|
+
// compute effective coordinates
|
12
|
+
<% divisors.each_with_index do |div, index| %>
|
13
|
+
index_map[<%= index %>] = (int)floor(ptr / (float)<%= div %>);<% if index < divisors.size - 1%>ptr = ptr % <%= div %>;<% end %><% end %>
|
14
|
+
|
15
|
+
// Apply axis translation if needed
|
16
|
+
<% if axis > 0 %>
|
17
|
+
int first = index_map[0];
|
18
|
+
<% axis.times do |i| %>
|
19
|
+
index_map[<%= i %>] = index_map[<%= (i + 1) %>];<% end %>
|
20
|
+
index_map[<%= axis %>] = first;
|
21
|
+
<% end%>
|
22
|
+
|
23
|
+
C[<%= multipliers.each_with_index.map { |m, idx| "#{m}*index_map[#{idx}]" }.join(' + ') %>] = A[globalCol];
|
24
|
+
}
|
@@ -1,9 +1,10 @@
|
|
1
|
+
|
1
2
|
// First naive implementation
|
2
3
|
% c_dtype = dtype_to_c_type(dtype)
|
3
4
|
__kernel void softmax_cross_<%= dtype %>(const int N,
|
4
5
|
const __global <%= c_dtype %>* A,
|
5
6
|
const __global <%= c_dtype %>* L,
|
6
|
-
__global <%= c_dtype %>* C) {
|
7
|
+
__global <%= c_dtype %>* C, __global <%= c_dtype %>* P) {
|
7
8
|
|
8
9
|
// Get the index of the current element to be processed
|
9
10
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
@@ -24,4 +25,8 @@ __kernel void softmax_cross_<%= dtype %>(const int N,
|
|
24
25
|
for (int k=0; k < N; k++) {
|
25
26
|
C[globalRow*N + k] = (log(acc) - (A[globalRow*N + k] - max)) * L[globalRow*N + k];
|
26
27
|
}
|
28
|
+
|
29
|
+
for (int k=0; k < N; k++) {
|
30
|
+
P[globalRow*N + k] = (exp(A[globalRow*N + k] - max) / acc) - L[globalRow*N + k];
|
31
|
+
}
|
27
32
|
}
|
@@ -16,10 +16,6 @@ module TensorStream
|
|
16
16
|
|
17
17
|
def to_ruby
|
18
18
|
return [] if buffer.empty?
|
19
|
-
if shape.empty?
|
20
|
-
return buffer[0] != 0 if data_type == :boolean
|
21
|
-
return buffer[0]
|
22
|
-
end
|
23
19
|
|
24
20
|
if dirty
|
25
21
|
op.command_queue.enqueue_read_buffer(cl_buffer, buffer, event_wait_list: [op].compact)
|
@@ -27,9 +23,13 @@ module TensorStream
|
|
27
23
|
self.dirty = false
|
28
24
|
end
|
29
25
|
|
26
|
+
if shape.empty?
|
27
|
+
return buffer[0] != 0 if data_type == :boolean
|
28
|
+
return buffer[0]
|
29
|
+
end
|
30
|
+
|
30
31
|
result = buffer.reshape(*shape.map(&:to_i).reverse).to_a
|
31
|
-
|
32
|
-
result
|
32
|
+
data_type == :boolean ? process_function_op(result, ->(a, _b) { a != 0 }) : result
|
33
33
|
end
|
34
34
|
end
|
35
35
|
end
|
@@ -30,6 +30,7 @@ module TensorStream
|
|
30
30
|
## PURE ruby evaluator used for testing and development
|
31
31
|
class OpenclEvaluator < BaseEvaluator
|
32
32
|
attr_accessor :retain
|
33
|
+
attr_reader :opencl_device
|
33
34
|
|
34
35
|
include TensorStream::OpHelper
|
35
36
|
include TensorStream::ArrayOpsHelper
|
@@ -51,20 +52,20 @@ module TensorStream
|
|
51
52
|
|
52
53
|
def self.fetch_device(query = [])
|
53
54
|
devices = query_devices_with_score
|
54
|
-
platform_devices = devices.select { |d| d[0].platform.to_s.
|
55
|
+
platform_devices = devices.select { |d| d[0].platform.to_s.tr(' ', '_').downcase =~ /#{query[0].downcase}/ }
|
55
56
|
opencl_to_device(platform_devices[[query[1].to_i, platform_devices.size - 1].min])
|
56
57
|
end
|
57
58
|
|
58
59
|
def self.opencl_to_device(d)
|
59
60
|
device = d[0]
|
60
61
|
index = d[3]
|
61
|
-
platform_name = device.platform.name.
|
62
|
+
platform_name = device.platform.name.tr(' ', '_').downcase
|
62
63
|
uri = [platform_name, index].join(':')
|
63
64
|
|
64
65
|
device_type = device.type.to_s == 'GPU' ? :gpu : :cpu
|
65
66
|
|
66
|
-
OpenclDevice.new(uri, device_type, self).tap do |
|
67
|
-
|
67
|
+
OpenclDevice.new(uri, device_type, self).tap do |devide|
|
68
|
+
devide.native_device = device
|
68
69
|
end
|
69
70
|
end
|
70
71
|
|
@@ -96,8 +97,14 @@ module TensorStream
|
|
96
97
|
end
|
97
98
|
end
|
98
99
|
|
100
|
+
# buffer comes from non-opencl evaluator
|
99
101
|
def convert_from_buffer(tensor, result)
|
100
|
-
|
102
|
+
if result.buffer.is_a?(TensorStream::Evaluator::OutputGroup)
|
103
|
+
converted_outputs = result.buffer.outputs.zip(result.buffer.data_types).map { |output, data_type| convert_to_opencl([output].flatten, shape_eval(output), data_type: data_type, name: tensor.name) }
|
104
|
+
TensorStream::Evaluator::OutputGroup.new(converted_outputs, result.buffer.data_types)
|
105
|
+
else
|
106
|
+
convert_to_opencl([result.buffer].flatten, shape_eval(result.buffer), data_type: result.data_type, name: tensor.name)
|
107
|
+
end
|
101
108
|
end
|
102
109
|
|
103
110
|
def complete_eval(tensor, context)
|
@@ -106,7 +113,7 @@ module TensorStream
|
|
106
113
|
if buffer.is_a?(Array)
|
107
114
|
buffer = buffer.collect do |b|
|
108
115
|
next b if b.buffer.size.zero?
|
109
|
-
_opencl_queue.enqueue_read_buffer(b.cl_buffer, b.buffer, event_wait_list: [b
|
116
|
+
_opencl_queue.enqueue_read_buffer(b.cl_buffer, b.buffer, event_wait_list: build_event_wait_list([b]))
|
110
117
|
b
|
111
118
|
end
|
112
119
|
else
|
@@ -114,14 +121,30 @@ module TensorStream
|
|
114
121
|
return buffer if buffer.nil?
|
115
122
|
return [] if buffer.buffer.nil?
|
116
123
|
return buffer if buffer.buffer.size.zero?
|
117
|
-
_opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: [buffer
|
124
|
+
_opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: build_event_wait_list([buffer]))
|
118
125
|
end
|
119
126
|
_opencl_queue.finish
|
120
127
|
buffer
|
121
128
|
end
|
122
129
|
|
123
|
-
def
|
124
|
-
|
130
|
+
def self.query_devices_with_score
|
131
|
+
OpenCL.platforms.flat_map do |p|
|
132
|
+
p.devices.select { |d| d.available > 0 }.each_with_index.collect do |d, index|
|
133
|
+
score = 0
|
134
|
+
if d.type.to_s == 'CPU'
|
135
|
+
score += 1
|
136
|
+
elsif d.type.to_s == 'GPU'
|
137
|
+
score += 4
|
138
|
+
end
|
139
|
+
|
140
|
+
score += 1000 if d.platform.name == 'NVIDIA CUDA'
|
141
|
+
|
142
|
+
score += d.max_compute_units
|
143
|
+
score += d.max_clock_frequency
|
144
|
+
|
145
|
+
[d, score, p.name, index]
|
146
|
+
end
|
147
|
+
end
|
125
148
|
end
|
126
149
|
|
127
150
|
protected
|
@@ -152,31 +175,9 @@ module TensorStream
|
|
152
175
|
@opencl_context = OpenCL.create_context(opencl_device)
|
153
176
|
end
|
154
177
|
|
155
|
-
def self.query_devices_with_score
|
156
|
-
OpenCL.platforms.flat_map do |p|
|
157
|
-
|
158
|
-
p.devices.select { |d| d.available > 0 }.each_with_index.collect do |d, index|
|
159
|
-
score = 0
|
160
|
-
if d.type.to_s == 'CPU'
|
161
|
-
score += 1
|
162
|
-
elsif d.type.to_s == 'GPU'
|
163
|
-
score += 4
|
164
|
-
end
|
165
|
-
|
166
|
-
if d.platform.name == 'NVIDIA CUDA'
|
167
|
-
score += 1000
|
168
|
-
end
|
169
|
-
|
170
|
-
score += d.max_compute_units
|
171
|
-
score += d.max_clock_frequency
|
172
|
-
|
173
|
-
[d, score, p.name, index]
|
174
|
-
end
|
175
|
-
end
|
176
|
-
end
|
177
|
-
|
178
178
|
def create_command_queue
|
179
179
|
supported_proprties = opencl_device.queue_properties.names
|
180
|
+
|
180
181
|
properties = []
|
181
182
|
properties << OpenCL::CommandQueue::PROFILING_ENABLE if supported_proprties.include?('PROFILING_ENABLE')
|
182
183
|
properties << OpenCL::CommandQueue::OUT_OF_ORDER_EXEC_MODE_ENABLE if supported_proprties.include?('OUT_OF_ORDER_EXEC_MODE_ENABLE')
|
@@ -196,7 +197,7 @@ module TensorStream
|
|
196
197
|
end
|
197
198
|
|
198
199
|
def _cl_program(kernel, args = {})
|
199
|
-
suffix = args.collect { |k,v| "#{k}.#{v}"}.join('.')
|
200
|
+
suffix = args.collect { |k, v| "#{k}.#{escape_arg_content(v)}" }.join('.')
|
200
201
|
@context[:_cache]["_opencl_kernel_#{kernel}.#{suffix}:#{object_id}"] ||= begin
|
201
202
|
filename = %w[cl.erb cl].map { |ext| cl_template_path(kernel, ext) }.find { |n| File.exist?(n) }
|
202
203
|
raise "opencl kernel template for #{kernel} has not yet been defined" if filename.nil?
|
@@ -211,6 +212,13 @@ module TensorStream
|
|
211
212
|
end
|
212
213
|
end
|
213
214
|
|
215
|
+
def escape_arg_content(value)
|
216
|
+
return value.tr(' ','_') if value.is_a?(String)
|
217
|
+
return value.join('-') if value.is_a?(Array)
|
218
|
+
|
219
|
+
value
|
220
|
+
end
|
221
|
+
|
214
222
|
def _run(tensor, execution_context)
|
215
223
|
return tensor if tensor.is_a?(OpenCLBuffer)
|
216
224
|
return tensor.map { |t| _run(t, execution_context) } if tensor.is_a?(Array) && !tensor.size.empty? && tensor[0].is_a?(Tensor)
|
@@ -236,7 +244,7 @@ module TensorStream
|
|
236
244
|
res
|
237
245
|
end
|
238
246
|
|
239
|
-
def eval_variable(tensor,
|
247
|
+
def eval_variable(tensor, _child_context)
|
240
248
|
raise "variable #{tensor.name} not initalized" if tensor.value.nil? && (tensor.buffer.nil? || !tensor.buffer.dirty)
|
241
249
|
tensor.buffer = wrap_opencl(tensor, name: tensor.name) if tensor.buffer.nil?
|
242
250
|
tensor.buffer
|
@@ -259,7 +267,10 @@ module TensorStream
|
|
259
267
|
end
|
260
268
|
end
|
261
269
|
|
262
|
-
register_op :identity do |
|
270
|
+
register_op :identity do |context, tensor, inputs|
|
271
|
+
if tensor.inputs.size > 1
|
272
|
+
tensor.inputs[1..inputs.size].each { |input| complete_eval(input, context) }
|
273
|
+
end
|
263
274
|
inputs[0]
|
264
275
|
end
|
265
276
|
|
@@ -277,18 +288,19 @@ module TensorStream
|
|
277
288
|
assign_var(tensor, value, context)
|
278
289
|
end
|
279
290
|
|
291
|
+
register_op :variable, noop: true do |context, tensor, inputs|
|
292
|
+
variable = tensor.inputs[0]
|
293
|
+
raise "variable #{tensor.name} not initalized" if variable.value.nil? && (variable.buffer.nil? || !variable.buffer.dirty)
|
294
|
+
variable.buffer = wrap_opencl(variable, name: variable.name) if variable.buffer.nil?
|
295
|
+
variable.buffer
|
296
|
+
end
|
297
|
+
|
280
298
|
# Fast in place multiply subtract assign
|
281
299
|
register_op :apply_gradient_descent do |_context, tensor, inputs|
|
282
300
|
_target_var, learning_rate, delta = inputs
|
283
301
|
|
284
302
|
assign = tensor.inputs[0] || tensor
|
285
303
|
|
286
|
-
unless assign.buffer
|
287
|
-
value = read_final_result(buffer)
|
288
|
-
assign.buffer = convert_to_opencl(value, buffer.shape, data_type: tensor.data_type, name: assign.name)
|
289
|
-
assign.value = value
|
290
|
-
end
|
291
|
-
|
292
304
|
assign.buffer.dirty = true # force buffer copy when variable is read externally
|
293
305
|
output_buffer = assign.buffer
|
294
306
|
|
@@ -297,13 +309,39 @@ module TensorStream
|
|
297
309
|
cl_m = OpenCL::Int1.new(m || 1)
|
298
310
|
cl_n = OpenCL::Int1.new(n || 1)
|
299
311
|
|
300
|
-
event_wait_list = [assign.buffer
|
312
|
+
event_wait_list = build_event_wait_list([assign.buffer, learning_rate, delta])
|
301
313
|
method_call = :"apply_gradient_#{output_buffer.data_type}"
|
302
314
|
event = _cl_program("apply_gradient", dtype: output_buffer.data_type).send(method_call, _opencl_queue, work_group, cl_m, cl_n, delta.cl_buffer, learning_rate.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
303
315
|
output_buffer.op = event
|
304
316
|
output_buffer
|
305
317
|
end
|
306
318
|
|
319
|
+
# Fast in place multiply subtract assign
|
320
|
+
register_op :apply_momentum do |_context, tensor, inputs|
|
321
|
+
target_var, momentum_var, learning_rate, grad, momentum = inputs
|
322
|
+
|
323
|
+
assign = tensor.inputs[0] || tensor
|
324
|
+
assign_acc = tensor.inputs[1]
|
325
|
+
assign.buffer.dirty = true # force buffer copy when variable is read externally
|
326
|
+
assign_acc.buffer.dirty = true # force buffer copy when variable is read externally
|
327
|
+
|
328
|
+
output_buffer = assign.buffer
|
329
|
+
|
330
|
+
m, n = output_buffer.shape
|
331
|
+
work_group = [m || 1, n || 1]
|
332
|
+
cl_m = OpenCL::Int1.new(m || 1)
|
333
|
+
cl_n = OpenCL::Int1.new(n || 1)
|
334
|
+
|
335
|
+
event_wait_list = build_event_wait_list([assign.buffer, assign_acc.buffer, learning_rate, grad, momentum])
|
336
|
+
method_call = :"apply_momentum_#{output_buffer.data_type}"
|
337
|
+
event = _cl_program("apply_momentum", nesterov: tensor.options[:use_nesterov], dtype: output_buffer.data_type).
|
338
|
+
send(method_call, _opencl_queue, work_group, cl_m, cl_n, grad.cl_buffer,
|
339
|
+
learning_rate.cl_buffer, momentum.cl_buffer, output_buffer.cl_buffer,
|
340
|
+
assign_acc.buffer.cl_buffer, event_wait_list: event_wait_list)
|
341
|
+
output_buffer.op = event
|
342
|
+
output_buffer
|
343
|
+
end
|
344
|
+
|
307
345
|
%i[less less_equal greater greater_equal equal not_equal logical_and].each do |op|
|
308
346
|
register_op op, noop: true do |context, tensor, inputs|
|
309
347
|
execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1], context, 'cond')
|
@@ -332,7 +370,7 @@ module TensorStream
|
|
332
370
|
a = inputs_queue.pop
|
333
371
|
until inputs_queue.empty?
|
334
372
|
b = inputs_queue.pop
|
335
|
-
event_wait_list = [a
|
373
|
+
event_wait_list = build_event_wait_list([a, b])
|
336
374
|
method_call = :"add_#{a.data_type}_#{b.data_type}"
|
337
375
|
event = _cl_program('add', a: a.data_type, b: b.data_type, dtype: dtype).send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
338
376
|
a = output_buffer
|
@@ -353,6 +391,23 @@ module TensorStream
|
|
353
391
|
convert_to_opencl(new_buf, new_shape, data_type: inputs[0].data_type, name: tensor.name)
|
354
392
|
end
|
355
393
|
|
394
|
+
register_op :fill, buffer: true do |_context, tensor, inputs|
|
395
|
+
shape = inputs[0]
|
396
|
+
value = inputs[1]
|
397
|
+
|
398
|
+
narray_size = shape.buffer.to_a.reduce(:*) || 1
|
399
|
+
cl_buffer = get_cached_buffer(tensor.name, shape.buffer.to_a)
|
400
|
+
|
401
|
+
buffer = if cl_buffer
|
402
|
+
cl_buffer.buffer
|
403
|
+
else
|
404
|
+
allocate_narray_for_type(tensor.data_type, narray_size)
|
405
|
+
end
|
406
|
+
|
407
|
+
buffer.fill!(value.buffer[0])
|
408
|
+
convert_to_opencl(buffer, shape.buffer.to_a, data_type: tensor.data_type, name: tensor.name)
|
409
|
+
end
|
410
|
+
|
356
411
|
register_op :floor_div, noop: true do |context, tensor, inputs|
|
357
412
|
if fp_type?(tensor.data_type)
|
358
413
|
execute_2_operand_func('floor_div', tensor, inputs[0], inputs[1], context)
|
@@ -374,8 +429,15 @@ module TensorStream
|
|
374
429
|
v = b.shape[0]
|
375
430
|
k = a.shape[1]
|
376
431
|
|
377
|
-
|
378
|
-
|
432
|
+
if tensor.options[:transpose_a]
|
433
|
+
m = a.shape[1]
|
434
|
+
k = a.shape[0]
|
435
|
+
end
|
436
|
+
|
437
|
+
if tensor.options[:transpose_b]
|
438
|
+
n = b.shape[0]
|
439
|
+
v = b.shape[1]
|
440
|
+
end
|
379
441
|
|
380
442
|
result_shape = [m, n]
|
381
443
|
|
@@ -393,8 +455,8 @@ module TensorStream
|
|
393
455
|
|
394
456
|
transpose_a = OpenCL::Int1.new(tensor.options[:transpose_a] ? 1 : 0)
|
395
457
|
transpose_b = OpenCL::Int1.new(tensor.options[:transpose_b] ? 1 : 0)
|
396
|
-
|
397
|
-
output_buffer.op = _cl_program('gemm', dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer)
|
458
|
+
event_wait_list = build_event_wait_list(inputs)
|
459
|
+
output_buffer.op = _cl_program('gemm', dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
398
460
|
output_buffer
|
399
461
|
end
|
400
462
|
|
@@ -406,14 +468,47 @@ module TensorStream
|
|
406
468
|
cl_m = OpenCL::Int1.new(m || 1)
|
407
469
|
cl_n = OpenCL::Int1.new(n || 1)
|
408
470
|
work_group = [m || 1, n || 1]
|
409
|
-
|
410
|
-
buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: tensor.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer)
|
471
|
+
event_wait_list = build_event_wait_list(inputs)
|
472
|
+
buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: tensor.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer, event_wait_list: event_wait_list)
|
411
473
|
buffer
|
412
474
|
else
|
413
475
|
a
|
414
476
|
end
|
415
477
|
end
|
416
478
|
|
479
|
+
register_op :stack do |_context, tensor, inputs|
|
480
|
+
axis = tensor.options[:axis] || 0
|
481
|
+
shape = inputs[0].shape
|
482
|
+
rank = shape.size + 1
|
483
|
+
elem_size = shape.empty? ? 1 : shape.reduce(:*)
|
484
|
+
|
485
|
+
new_shape = [inputs.size]
|
486
|
+
shape.inject(new_shape) { |ns, s| ns << s }
|
487
|
+
|
488
|
+
divisors = new_shape.dup.drop(1).reverse.inject([1]) do |a, s|
|
489
|
+
a << s * a.last
|
490
|
+
end.reverse
|
491
|
+
|
492
|
+
axis = rank + axis if axis < 0
|
493
|
+
rotated_shape = Array.new(axis + 1) { new_shape.shift }
|
494
|
+
new_shape = rotated_shape.rotate! + new_shape
|
495
|
+
|
496
|
+
output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name)
|
497
|
+
multipliers = new_shape.dup.drop(1).reverse.inject([1]) do |a, s|
|
498
|
+
a << s * a.last
|
499
|
+
end.reverse
|
500
|
+
|
501
|
+
cl_n = OpenCL::Int1.new(elem_size)
|
502
|
+
work_group = [elem_size]
|
503
|
+
event_wait_list = build_event_wait_list(inputs)
|
504
|
+
ops = inputs.each_with_index.map do |input, index|
|
505
|
+
cl_index = OpenCL::Int1.new(index)
|
506
|
+
_cl_program("pack", data_type: tensor.data_type, divisors: divisors, multipliers: multipliers, axis: axis).pack(_opencl_queue, work_group, cl_n, cl_index, input.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
507
|
+
end
|
508
|
+
output_buffer.op = ops
|
509
|
+
output_buffer
|
510
|
+
end
|
511
|
+
|
417
512
|
%i[sign exp tan acos asin sin cos abs sqrt negate square reciprocal tanh tanh_grad sigmoid log1p round floor ceil].each do |op|
|
418
513
|
register_op op, noop: true do |context, tensor, inputs|
|
419
514
|
execute_func(op.to_s, tensor, inputs[0], context)
|
@@ -422,7 +517,7 @@ module TensorStream
|
|
422
517
|
|
423
518
|
register_op :softmax do |_context, tensor, inputs|
|
424
519
|
a = inputs[0]
|
425
|
-
event_wait_list =
|
520
|
+
event_wait_list = build_event_wait_list(inputs)
|
426
521
|
dtype = tensor.data_type
|
427
522
|
output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
|
428
523
|
|
@@ -438,7 +533,7 @@ module TensorStream
|
|
438
533
|
|
439
534
|
register_op :log_softmax do |_context, tensor, inputs|
|
440
535
|
a = inputs[0] # logits
|
441
|
-
event_wait_list =
|
536
|
+
event_wait_list = build_event_wait_list(inputs)
|
442
537
|
dtype = tensor.data_type
|
443
538
|
output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
|
444
539
|
|
@@ -452,28 +547,33 @@ module TensorStream
|
|
452
547
|
output_buffer
|
453
548
|
end
|
454
549
|
|
455
|
-
register_op :softmax_cross_entropy_with_logits_v2 do |
|
550
|
+
register_op :softmax_cross_entropy_with_logits_v2 do |context, tensor, inputs|
|
456
551
|
a = inputs[0] # logits
|
457
552
|
b = inputs[1] # labels
|
458
|
-
event_wait_list =
|
553
|
+
event_wait_list = build_event_wait_list(inputs)
|
459
554
|
dtype = tensor.data_type
|
460
555
|
output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
|
461
|
-
|
556
|
+
output_buffer_backprop = _create_result_buffer(tensor.data_type, a.shape, "#{tensor.name}_2")
|
557
|
+
rank = a.shape.size - 1
|
462
558
|
m, n = a.shape
|
463
559
|
work_group = [m]
|
464
560
|
n = m if n.nil?
|
465
561
|
cl_n = OpenCL::Int1.new(n || 1)
|
466
562
|
|
467
|
-
event = _cl_program("softmax_cross", dtype: dtype).send(:"softmax_cross_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, b.cl_buffer,
|
563
|
+
event = _cl_program("softmax_cross", dtype: dtype).send(:"softmax_cross_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, b.cl_buffer,
|
564
|
+
output_buffer.cl_buffer, output_buffer_backprop.cl_buffer, event_wait_list: event_wait_list)
|
468
565
|
output_buffer.op = event
|
469
|
-
|
566
|
+
output_buffer_backprop.op = event
|
567
|
+
|
568
|
+
loss = reduction(context, tensor, output_buffer, rank, :sum)
|
569
|
+
OutputGroup.new([loss, output_buffer_backprop], [tensor.inputs[0].data_type, tensor.inputs[0].data_type])
|
470
570
|
end
|
471
571
|
|
472
572
|
register_op :softmax_cross_entropy_with_logits_v2_grad do |_context, tensor, inputs|
|
473
573
|
a = inputs[0] # logits
|
474
574
|
b = inputs[1] # labels
|
475
575
|
c = inputs[2] # grads
|
476
|
-
event_wait_list =
|
576
|
+
event_wait_list = build_event_wait_list(inputs)
|
477
577
|
dtype = tensor.data_type
|
478
578
|
output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
|
479
579
|
|
@@ -490,7 +590,7 @@ module TensorStream
|
|
490
590
|
register_op :softmax_grad do |_context, tensor, inputs|
|
491
591
|
a, grad = inputs
|
492
592
|
|
493
|
-
event_wait_list =
|
593
|
+
event_wait_list = build_event_wait_list(inputs)
|
494
594
|
dtype = tensor.data_type
|
495
595
|
output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
|
496
596
|
|
@@ -508,7 +608,7 @@ module TensorStream
|
|
508
608
|
name = tensor.options[:name]
|
509
609
|
|
510
610
|
a.buffer.each do |input|
|
511
|
-
raise "#{name} Invalid Argument" if input.nan? || input.infinite?
|
611
|
+
raise TensorStream::InvalidArgumentError, "#{name} Invalid Argument" if input.nan? || input.infinite?
|
512
612
|
end
|
513
613
|
a
|
514
614
|
end
|
@@ -522,8 +622,8 @@ module TensorStream
|
|
522
622
|
input_a = read_final_result(complete_eval(a, context))
|
523
623
|
input_b = read_final_result(complete_eval(b, context))
|
524
624
|
b_a, b_b = broadcast(input_a, input_b)
|
525
|
-
[
|
526
|
-
|
625
|
+
[wrap_opencl(b_a, data_type: a.data_type, name: "#{tensor.name}_a"),
|
626
|
+
wrap_opencl(b_b, data_type: a.data_type, name: "#{tensor.name}_b")]
|
527
627
|
end
|
528
628
|
end
|
529
629
|
|
@@ -557,8 +657,22 @@ module TensorStream
|
|
557
657
|
|
558
658
|
register_op :transpose, buffer: true do |_context, tensor, inputs|
|
559
659
|
t_param = Array.new(inputs[0].shape.size) { |index| index }.reverse
|
560
|
-
|
561
|
-
|
660
|
+
|
661
|
+
if inputs[0].shape.size == 2 && inputs[1].nil?
|
662
|
+
transposed = inputs[0].buffer.reshape(*inputs[0].shape.reverse).transpose(*t_param)
|
663
|
+
res = convert_to_opencl(transposed.flatten, transposed.shape.reverse, data_type: inputs[0].data_type, name: tensor.name)
|
664
|
+
res
|
665
|
+
else
|
666
|
+
rank = inputs[0].shape.size
|
667
|
+
perm = inputs[1].nil? ? (0...rank).to_a.reverse : inputs[1].buffer
|
668
|
+
new_shape = perm.map { |p| inputs[0].shape[p] }.to_a
|
669
|
+
output_buffer = _create_result_buffer(tensor.data_type, new_shape, tensor.name)
|
670
|
+
transpose_with_perm(inputs[0].buffer, output_buffer.buffer, inputs[0].shape, new_shape, perm)
|
671
|
+
|
672
|
+
write_op = _opencl_queue.enqueue_write_buffer(output_buffer.cl_buffer, output_buffer.buffer)
|
673
|
+
output_buffer.op = write_op
|
674
|
+
output_buffer
|
675
|
+
end
|
562
676
|
end
|
563
677
|
|
564
678
|
register_op :index, noop: true do |context, tensor, inputs|
|
@@ -567,39 +681,36 @@ module TensorStream
|
|
567
681
|
|
568
682
|
if a.is_a?(OutputGroup)
|
569
683
|
a.outputs[index]
|
684
|
+
elsif a.is_a?(Array)
|
685
|
+
a[index]
|
570
686
|
else
|
571
|
-
|
572
|
-
|
573
|
-
|
574
|
-
|
575
|
-
new_shape.shift
|
576
|
-
input_a = read_final_result(a)
|
577
|
-
convert_to_opencl(input_a[index], new_shape, data_type: a.data_type, name: tensor.name)
|
578
|
-
end
|
687
|
+
new_shape = a.shape.dup
|
688
|
+
new_shape.shift
|
689
|
+
input_a = read_final_result(a)
|
690
|
+
convert_to_opencl(input_a[index], new_shape, data_type: a.data_type, name: tensor.name)
|
579
691
|
end
|
580
692
|
end
|
581
693
|
|
582
694
|
register_op :broadcast_gradient_args, buffer: true do |_context, tensor, inputs|
|
583
695
|
rx, ry = get_broadcast_gradient_args(inputs[0].buffer.to_a, inputs[1].buffer.to_a)
|
584
|
-
OutputGroup.new([wrap_opencl(rx, data_type: :int32, name:
|
696
|
+
OutputGroup.new([wrap_opencl(rx, data_type: :int32, name: tensor.name), wrap_opencl(ry, data_type: :int32, name: "#{tensor.name}:1")], tensor.inputs.map(&:data_type))
|
585
697
|
end
|
586
698
|
|
587
699
|
# :shape op — wraps the shape of the first input as an OpenCL-backed value,
# tagged with this tensor's name and data type.
register_op :shape do |_context, tensor, inputs|
  source_shape = inputs[0].shape
  wrap_opencl(source_shape, name: tensor.name, data_type: tensor.data_type)
end
|
590
702
|
|
591
|
-
register_op :reshape, buffer: true do |_context,
|
703
|
+
register_op :reshape, buffer: true do |_context, tensor, inputs|
|
592
704
|
arr = inputs[0]
|
593
705
|
new_shape = read_final_result(inputs[1])
|
594
706
|
|
595
|
-
if new_shape.size.zero? && arr.buffer.size == 1
|
596
|
-
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
end
|
707
|
+
shape = if new_shape.size.zero? && arr.buffer.size == 1
|
708
|
+
new_shape
|
709
|
+
else
|
710
|
+
TensorShape.fix_inferred_elements(new_shape, arr.buffer.size)
|
711
|
+
end
|
712
|
+
|
713
|
+
convert_to_opencl(arr.buffer, shape, data_type: arr.data_type, name: tensor.name)
|
603
714
|
end
|
604
715
|
|
605
716
|
register_op :flow_group do |_context, _tensor, inputs|
|
@@ -618,6 +729,7 @@ module TensorStream
|
|
618
729
|
|
619
730
|
register_op :prod, noop: true do |context, tensor, inputs|
|
620
731
|
input_a = complete_eval(inputs[0], context)
|
732
|
+
|
621
733
|
if input_a.buffer.empty?
|
622
734
|
convert_to_opencl([1.0], [], data_type: inputs[0].data_type, name: tensor.name)
|
623
735
|
else
|
@@ -646,13 +758,11 @@ module TensorStream
|
|
646
758
|
end
|
647
759
|
|
648
760
|
def eval_operation(tensor, child_context)
|
649
|
-
|
650
761
|
cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}:#{object_id}"
|
651
762
|
return @context[:_cache][cache_key] if @context[:_cache].key?(cache_key)
|
652
763
|
return @context[cache_key] if @context.key?(cache_key)
|
653
|
-
|
764
|
+
# puts "opencl: #{tensor.name}"
|
654
765
|
invoke(tensor, child_context).tap do |result|
|
655
|
-
# puts "#{tensor.to_math(true,1)} = #{read_final_result(complete_eval(result, child_context))}"
|
656
766
|
if tensor.breakpoint
|
657
767
|
a = resolve_placeholder(tensor.inputs[0], child_context) if tensor.inputs && tensor.inputs[0]
|
658
768
|
b = resolve_placeholder(tensor.inputs[1], child_context) if tensor.inputs && tensor.inputs[1]
|
@@ -676,9 +786,11 @@ module TensorStream
|
|
676
786
|
@context[:_cache][cache_key] = result if tensor.is_const
|
677
787
|
end
|
678
788
|
rescue EvaluatorExcecutionException => e
|
679
|
-
|
789
|
+
_opencl_queue.finish # dump queue
|
790
|
+
raise e, "error #{e.message} while evaluating #{tensor.name} : #{tensor.to_math(true, 1)} defined at #{tensor.source}"
|
680
791
|
rescue TensorStreamError => e
|
681
|
-
|
792
|
+
_opencl_queue.finish # dump queue
|
793
|
+
raise e, "error #{e.message} while evaluating #{tensor.name} : #{tensor.to_math(true, 1)} defined at #{tensor.source}"
|
682
794
|
rescue StandardError => e
|
683
795
|
_opencl_queue.finish # dump queue
|
684
796
|
puts e.message
|
@@ -698,7 +810,7 @@ module TensorStream
|
|
698
810
|
# File.write('/home/jedld/workspace/tensor_stream/samples/error.graphml', TensorStream::Graphml.new.get_string(tensor, @session))
|
699
811
|
|
700
812
|
# File.write('/Users/josephemmanueldayo/workspace/gradients.graphml', TensorStream::Graphml.new.get_string(tensor, @session))
|
701
|
-
raise EvaluatorExcecutionException.new(e, tensor), "error #{e.message} while evaluating #{tensor.name} : #{tensor.to_math(true,1)} defined at #{tensor.source}"
|
813
|
+
raise EvaluatorExcecutionException.new(e, tensor), "error #{e.message} while evaluating #{tensor.name} : #{tensor.to_math(true, 1)} defined at #{tensor.source}"
|
702
814
|
end
|
703
815
|
|
704
816
|
def eval_tensor(tensor, child_context)
|
@@ -724,8 +836,9 @@ module TensorStream
|
|
724
836
|
|
725
837
|
if assign.buffer
|
726
838
|
# buffer = type_cast(buffer, assign.data_type, name: "#{tensor.name}/cast_#{tensor.name}_#{tensor.data_type}")
|
839
|
+
event_wait_list = build_event_wait_list([buffer, assign.buffer])
|
727
840
|
assign.buffer.op = if assign.buffer.cl_buffer != buffer.cl_buffer
|
728
|
-
_opencl_queue.enqueue_copy_buffer(buffer.cl_buffer, assign.buffer.cl_buffer, event_wait_list:
|
841
|
+
_opencl_queue.enqueue_copy_buffer(buffer.cl_buffer, assign.buffer.cl_buffer, event_wait_list: event_wait_list)
|
729
842
|
else
|
730
843
|
buffer.op
|
731
844
|
end
|
@@ -745,7 +858,6 @@ module TensorStream
|
|
745
858
|
dtype = tensor.data_type
|
746
859
|
result_shape = TensorShape.infer_shape(a.shape, b.shape)
|
747
860
|
return _create_result_buffer(dtype, [0], "out_#{tensor.name}") if result_shape == [0]
|
748
|
-
|
749
861
|
output_buffer = _create_result_buffer(tensor.data_type, result_shape, "out_#{tensor.name}")
|
750
862
|
a, b, prog, switch_operands = select_program(a, b, op_name)
|
751
863
|
m, n = result_shape
|
@@ -754,21 +866,26 @@ module TensorStream
|
|
754
866
|
cl_n = OpenCL::Int1.new(n || 1)
|
755
867
|
cl_switch = OpenCL::Int1.new(switch_operands) # no need to switch for addition
|
756
868
|
|
757
|
-
event_wait_list = [a
|
869
|
+
event_wait_list = build_event_wait_list([a, b]) # add dependency wait list
|
758
870
|
|
759
871
|
method_call = :"#{prog}_#{a.data_type}_#{b.data_type}"
|
872
|
+
prog_name ||= op_name
|
760
873
|
event = if prog == "#{op_name}_b"
|
761
|
-
|
762
|
-
|
763
|
-
|
764
|
-
|
765
|
-
|
766
|
-
|
767
|
-
|
768
|
-
|
769
|
-
|
770
|
-
|
771
|
-
|
874
|
+
cl_m_b, cl_n_b = if b.shape.size == 2
|
875
|
+
[OpenCL::Int1.new(b.shape[0]), OpenCL::Int1.new(b.shape[1])]
|
876
|
+
elsif b.shape.size == 1
|
877
|
+
[OpenCL::Int1.new(1), OpenCL::Int1.new(b.shape[0])]
|
878
|
+
else
|
879
|
+
raise "rank > 2 not supported!"
|
880
|
+
end
|
881
|
+
_cl_program(prog_name, a: a.data_type, b: b.data_type, dtype: dtype).
|
882
|
+
send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_m_b, cl_n_b,
|
883
|
+
cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
884
|
+
else
|
885
|
+
_cl_program(prog_name, a: a.data_type, b: b.data_type, dtype: dtype).
|
886
|
+
send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_switch,
|
887
|
+
a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
888
|
+
end
|
772
889
|
|
773
890
|
output_buffer.op = event
|
774
891
|
output_buffer
|
@@ -789,14 +906,14 @@ module TensorStream
|
|
789
906
|
cl_m = OpenCL::Int1.new(m || 1)
|
790
907
|
cl_n = OpenCL::Int1.new(n || 1)
|
791
908
|
|
792
|
-
event_wait_list = [a
|
909
|
+
event_wait_list = build_event_wait_list([a, b, p]) # add dependency wait list
|
793
910
|
output_buffer.op = _cl_program(op_name.to_s, dtype: dtype).send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, p.cl_buffer, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
794
911
|
output_buffer
|
795
912
|
end
|
796
913
|
|
797
914
|
def execute_func(op_name, tensor, a, child_context)
|
798
915
|
a = _run(a, child_context)
|
799
|
-
event_wait_list = [a
|
916
|
+
event_wait_list = build_event_wait_list([a])
|
800
917
|
dtype = tensor.data_type
|
801
918
|
output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
|
802
919
|
|
@@ -814,7 +931,7 @@ module TensorStream
|
|
814
931
|
return [a, b] if a.data_type == b.data_type
|
815
932
|
m, n = b.shape
|
816
933
|
work_group = [m || 1, n || 1]
|
817
|
-
event_wait_list = [b
|
934
|
+
event_wait_list = build_event_wait_list([b])
|
818
935
|
buffer = _create_result_buffer(b.data_type, b.shape, name)
|
819
936
|
|
820
937
|
cl_m = OpenCL::Int1.new(m || 1)
|
@@ -848,6 +965,11 @@ module TensorStream
|
|
848
965
|
convert_to_opencl(value, shape, data_type: data_type || tensor.data_type, name: name)
|
849
966
|
end
|
850
967
|
|
968
|
+
# Looks up an entry in this evaluator's context cache.
#
# The key is built from the given name, the shape dimensions joined with "_",
# and this evaluator's object_id, so entries are scoped per evaluator instance.
#
# @param name  value interpolated into the cache key
# @param shape [Array] dimensions; joined with "_" inside the key
# @return the cached entry, or nil when nothing is stored under that key
def get_cached_buffer(name, shape)
  shape_tag = shape.join('_')
  @context[:_cache]["_cl_object_#{name}:#{shape_tag}:#{object_id}"]
end
|
972
|
+
|
851
973
|
def convert_to_opencl(value, shape, data_type: nil, name: nil)
|
852
974
|
value = [value] if !value.is_a?(Array) && !value.is_a?(NArray)
|
853
975
|
|
@@ -863,6 +985,8 @@ module TensorStream
|
|
863
985
|
allocate_narray_for_type(data_type, narray_size)
|
864
986
|
end
|
865
987
|
|
988
|
+
return nil if buffer.nil?
|
989
|
+
|
866
990
|
cl_buffer_size = shape.empty? ? 1 : shape.reduce(:*)
|
867
991
|
|
868
992
|
cl_buffer = unless value.flatten.empty?
|
@@ -908,18 +1032,20 @@ module TensorStream
|
|
908
1032
|
NArray.sint(narray_size)
|
909
1033
|
when :boolean
|
910
1034
|
NArray.sint(narray_size)
|
1035
|
+
when :unknown
|
1036
|
+
nil
|
911
1037
|
else
|
912
1038
|
raise "unsupported type #{data_type}"
|
913
1039
|
end
|
914
1040
|
end
|
915
1041
|
|
916
1042
|
def _create_result_buffer(data_type, shape, name)
|
917
|
-
return OpenCLBuffer.new(data_type: data_type, shape: [0], buffer: nil, cl_buffer: nil) if shape == [0]
|
1043
|
+
return OpenCLBuffer.new(name: name, data_type: data_type, shape: [0], buffer: nil, cl_buffer: nil) if shape == [0]
|
918
1044
|
@context[:_cache][:_cl_buffers]["_result_#{name}_#{shape.join('_')}:#{object_id}"] ||= begin
|
919
1045
|
size = shape.empty? || shape == [0] ? 1 : shape.reduce(:*)
|
920
1046
|
buffer = allocate_narray_for_type(data_type, size)
|
921
1047
|
cl_buffer = _opencl_context.create_buffer(buffer.size * buffer.element_size)
|
922
|
-
OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
|
1048
|
+
OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer, name: name)
|
923
1049
|
end
|
924
1050
|
end
|
925
1051
|
|
@@ -969,7 +1095,7 @@ module TensorStream
|
|
969
1095
|
|
970
1096
|
def reduction(child_context, tensor, a, b, func)
|
971
1097
|
input = complete_eval(a, child_context)
|
972
|
-
axis = read_final_result(complete_eval(b, child_context))
|
1098
|
+
axis = b.is_a?(Tensor) ? read_final_result(complete_eval(b, child_context)) : b
|
973
1099
|
if axis.nil?
|
974
1100
|
red = input.buffer.send(func)
|
975
1101
|
convert_to_opencl(red, [], data_type: tensor.data_type, name: tensor.name)
|
@@ -1021,6 +1147,10 @@ module TensorStream
|
|
1021
1147
|
shape.is_a?(Array) ? shape.size : 0
|
1022
1148
|
end
|
1023
1149
|
|
1150
|
+
# Builds the list of pending events that a new command must wait on.
#
# Collects the `op` of every non-nil input. An input's `op` may itself be an
# array of events (the pack op assigns an array to `output_buffer.op`), so the
# result is flattened. Inputs whose `op` is nil contribute nothing: a nil
# entry must not end up in the list handed to the queue as `event_wait_list:`.
#
# @param inputs [Array] buffers responding to `op`; nil entries are ignored
# @return [Array] flat list of events with no nil entries
def build_event_wait_list(inputs)
  inputs.compact.map(&:op).flatten.compact
end
|
1153
|
+
|
1024
1154
|
def resolve_placeholder(placeholder, _execution_context = {})
|
1025
1155
|
return nil if placeholder.nil?
|
1026
1156
|
|