tensor_stream 0.4.1 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/README.md +38 -17
- data/benchmark/benchmark.rb +16 -20
- data/lib/tensor_stream/control_flow.rb +3 -3
- data/lib/tensor_stream/debugging/debugging.rb +4 -4
- data/lib/tensor_stream/device.rb +5 -2
- data/lib/tensor_stream/evaluator/base_evaluator.rb +138 -0
- data/lib/tensor_stream/evaluator/buffer.rb +7 -2
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/_bool_operand.cl +3 -3
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/_operand.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/abs.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/add.cl +1 -1
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/argmax.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/argmin.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/cast.cl +0 -0
- data/lib/tensor_stream/evaluator/opencl/kernels/cond.cl.erb +6 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/cos.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/div.cl.erb +1 -1
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/exp.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/gemm.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/log.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/log1p.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/max.cl +3 -3
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/mul.cl +1 -1
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/negate.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/pow.cl +3 -3
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/reciprocal.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/round.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/sigmoid.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/sigmoid_grad.cl +3 -3
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/sign.cl +1 -1
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/sin.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/softmax.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/softmax_grad.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/sqrt.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/square.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/sub.cl +1 -1
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/tan.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/tanh.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/tanh_grad.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/where.cl +1 -1
- data/lib/tensor_stream/evaluator/{opencl_buffer.rb → opencl/opencl_buffer.rb} +1 -1
- data/lib/tensor_stream/evaluator/opencl/opencl_device.rb +5 -0
- data/lib/tensor_stream/evaluator/{opencl_evaluator.rb → opencl/opencl_evaluator.rb} +404 -452
- data/lib/tensor_stream/evaluator/{opencl_template_helper.rb → opencl/opencl_template_helper.rb} +6 -6
- data/lib/tensor_stream/evaluator/operation_helpers/array_ops_helper.rb +21 -21
- data/lib/tensor_stream/evaluator/ruby_evaluator.rb +492 -398
- data/lib/tensor_stream/graph.rb +21 -1
- data/lib/tensor_stream/graph_serializers/graphml.rb +59 -59
- data/lib/tensor_stream/graph_serializers/pbtext.rb +1 -1
- data/lib/tensor_stream/helpers/op_helper.rb +6 -2
- data/lib/tensor_stream/math_gradients.rb +7 -7
- data/lib/tensor_stream/operation.rb +100 -100
- data/lib/tensor_stream/session.rb +81 -8
- data/lib/tensor_stream/tensor.rb +7 -5
- data/lib/tensor_stream/utils.rb +32 -19
- data/lib/tensor_stream/version.rb +1 -1
- data/tensor_stream.gemspec +0 -1
- data/test_samples/raw_neural_net_sample.rb +7 -7
- metadata +41 -53
- data/lib/tensor_stream/evaluator/kernels/cond.cl.erb +0 -5
@@ -1,3 +1,3 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
2
|
% op = operator_to_c('add')
|
3
|
-
<%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'add', dtype:
|
3
|
+
<%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'add', dtype: "#{a}_#{b}", result_t: c_dtype %>
|
File without changes
|
File without changes
|
File without changes
|
@@ -0,0 +1,6 @@
|
|
1
|
+
% ["#{dtype}"].product(%w[less less_equal equal not_equal greater greater_equal logical_and]).each do |dtype, fname|
|
2
|
+
% a_dtype = dtype_to_c_type(a)
|
3
|
+
% b_dtype = dtype_to_c_type(b)
|
4
|
+
% op = operator_to_c(fname)
|
5
|
+
<%= render 'bool_operand.cl', a_dtype: a_dtype, b_dtype: b_dtype, op: op, fname: fname, dtype: "#{a}_#{b}", result_t: 'short' %>
|
6
|
+
% end
|
File without changes
|
@@ -1,3 +1,3 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
2
|
% op = operator_to_c('div')
|
3
|
-
<%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'div', dtype:
|
3
|
+
<%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'div', dtype: "#{a}_#{b}", result_t: c_dtype %>
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
@@ -1,6 +1,6 @@
|
|
1
1
|
// same dimension add floating point op
|
2
2
|
% c_dtype = dtype_to_c_type(dtype)
|
3
|
-
__kernel void max_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
3
|
+
__kernel void max_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
4
4
|
// Get the index of the current element to be processed
|
5
5
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
6
6
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
@@ -9,7 +9,7 @@
|
|
9
9
|
}
|
10
10
|
|
11
11
|
// 1D + Scalar floating point add op
|
12
|
-
__kernel void max_c_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
12
|
+
__kernel void max_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
13
13
|
// Get the index of the current element to be processed
|
14
14
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
15
15
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
@@ -22,7 +22,7 @@
|
|
22
22
|
}
|
23
23
|
|
24
24
|
// 1D + Scalar floating point add op broadcast
|
25
|
-
__kernel void max_b_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
25
|
+
__kernel void max_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
26
26
|
// Get the index of the current element to be processed
|
27
27
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
28
28
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
@@ -1,3 +1,3 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
2
|
% op = operator_to_c('mul')
|
3
|
-
<%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'mul', dtype:
|
3
|
+
<%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'mul', dtype: "#{a}_#{b}", result_t: c_dtype %>
|
File without changes
|
@@ -1,6 +1,6 @@
|
|
1
1
|
// same dimension add floating point op
|
2
2
|
% c_dtype = dtype_to_c_type(dtype)
|
3
|
-
__kernel void pow_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
3
|
+
__kernel void pow_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
4
4
|
// Get the index of the current element to be processed
|
5
5
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
6
6
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
@@ -9,7 +9,7 @@
|
|
9
9
|
}
|
10
10
|
|
11
11
|
// 1D + Scalar floating point add op
|
12
|
-
__kernel void pow_c_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
12
|
+
__kernel void pow_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
13
13
|
// Get the index of the current element to be processed
|
14
14
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
15
15
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
@@ -22,7 +22,7 @@
|
|
22
22
|
}
|
23
23
|
|
24
24
|
// 1D + Scalar floating point add op broadcast
|
25
|
-
__kernel void pow_b_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
25
|
+
__kernel void pow_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
26
26
|
// Get the index of the current element to be processed
|
27
27
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
28
28
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
File without changes
|
File without changes
|
File without changes
|
@@ -9,7 +9,7 @@ float sigmoid_grad(<%= c_dtype %> x, <%= c_dtype %> g) {
|
|
9
9
|
}
|
10
10
|
|
11
11
|
// same dimension add floating point op
|
12
|
-
__kernel void sigmoid_grad_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
12
|
+
__kernel void sigmoid_grad_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
13
13
|
// Get the index of the current element to be processed
|
14
14
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
15
15
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
@@ -18,7 +18,7 @@ float sigmoid_grad(<%= c_dtype %> x, <%= c_dtype %> g) {
|
|
18
18
|
}
|
19
19
|
|
20
20
|
// 1D + Scalar floating point add op
|
21
|
-
__kernel void sigmoid_grad_c_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
21
|
+
__kernel void sigmoid_grad_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
22
22
|
// Get the index of the current element to be processed
|
23
23
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
24
24
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
@@ -31,7 +31,7 @@ float sigmoid_grad(<%= c_dtype %> x, <%= c_dtype %> g) {
|
|
31
31
|
}
|
32
32
|
|
33
33
|
// 1D + Scalar floating point add op broadcast
|
34
|
-
__kernel void sigmoid_grad_b_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
34
|
+
__kernel void sigmoid_grad_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
35
35
|
// Get the index of the current element to be processed
|
36
36
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
37
37
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
@@ -5,7 +5,7 @@ __kernel void sign_<%= dtype %>(const int M, const int N, __global const <%= c_d
|
|
5
5
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
6
6
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
7
7
|
<%= c_dtype %> value = A[globalRow * N + globalCol];
|
8
|
-
% if
|
8
|
+
% if floating_point?(dtype)
|
9
9
|
if (isnan(value) || value == 0.0f) {
|
10
10
|
C[globalRow * N + globalCol] = 0.0;
|
11
11
|
} else {
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
@@ -1,3 +1,3 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
2
|
% op = operator_to_c('sub')
|
3
|
-
<%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'sub', dtype:
|
3
|
+
<%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'sub', dtype: "#{a}_#{b}", result_t: c_dtype %>
|
File without changes
|
File without changes
|
File without changes
|
@@ -1,5 +1,5 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
|
-
__kernel void where_<%= dtype %>(const int M, const int N, __global const
|
2
|
+
__kernel void where_<%= dtype %>(const int M, const int N, __global const short *PRED, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
3
3
|
// Get the index of the current element to be processed
|
4
4
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
5
5
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
@@ -2,7 +2,7 @@ module TensorStream
|
|
2
2
|
class OpenCLBuffer < Buffer
|
3
3
|
include ArrayOpsHelper
|
4
4
|
|
5
|
-
attr_accessor :
|
5
|
+
attr_accessor :shape, :buffer, :cl_buffer, :op
|
6
6
|
|
7
7
|
def initialize(data_type: , shape:, buffer:, cl_buffer:, op: nil, name: nil)
|
8
8
|
@data_type = data_type
|
@@ -1,11 +1,12 @@
|
|
1
1
|
require 'tensor_stream/evaluator/operation_helpers/random_gaussian'
|
2
2
|
require 'tensor_stream/evaluator/operation_helpers/array_ops_helper'
|
3
3
|
require 'tensor_stream/evaluator/operation_helpers/math_helper'
|
4
|
-
require 'tensor_stream/evaluator/opencl_buffer'
|
5
|
-
require 'tensor_stream/evaluator/opencl_template_helper'
|
6
|
-
require '
|
4
|
+
require 'tensor_stream/evaluator/opencl/opencl_buffer'
|
5
|
+
require 'tensor_stream/evaluator/opencl/opencl_template_helper'
|
6
|
+
require 'tensor_stream/evaluator/opencl/opencl_device'
|
7
7
|
require 'opencl_ruby_ffi'
|
8
8
|
require 'narray_ffi'
|
9
|
+
require 'tensor_stream/evaluator/base_evaluator'
|
9
10
|
|
10
11
|
module TensorStream
|
11
12
|
module Evaluator
|
@@ -27,31 +28,78 @@ module TensorStream
|
|
27
28
|
end
|
28
29
|
|
29
30
|
## PURE ruby evaluator used for testing and development
|
30
|
-
class OpenclEvaluator
|
31
|
+
class OpenclEvaluator < BaseEvaluator
|
31
32
|
attr_accessor :retain
|
32
33
|
|
33
34
|
include TensorStream::OpHelper
|
34
35
|
include TensorStream::ArrayOpsHelper
|
35
36
|
include TensorStream::MathHelper
|
36
37
|
|
37
|
-
def initialize(session,
|
38
|
-
|
39
|
-
|
40
|
-
@
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
38
|
+
def initialize(session, device, thread_pool: nil, log_intermediates: false)
|
39
|
+
super
|
40
|
+
_create_opencl_context(device.native_device)
|
41
|
+
@opencl_device = device.native_device
|
42
|
+
create_command_queue
|
43
|
+
end
|
44
|
+
|
45
|
+
def self.query_supported_devices
|
46
|
+
devices = query_devices_with_score
|
47
|
+
devices.sort { |a| a[1] }.reverse.map do |d|
|
48
|
+
opencl_to_device(d)
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
def self.fetch_device(query = [])
|
53
|
+
devices = query_devices_with_score
|
54
|
+
platform_devices = devices.select { |d| d[0].platform.to_s.downcase =~ /#{query[0].downcase}/ }
|
55
|
+
opencl_to_device(platform_devices[[query[1].to_i, platform_devices.size - 1].min])
|
56
|
+
end
|
57
|
+
|
58
|
+
def self.opencl_to_device(d)
|
59
|
+
device = d[0]
|
60
|
+
index = d[3]
|
61
|
+
platform_name = device.platform.name.gsub(' ', '_').downcase
|
62
|
+
uri = [platform_name, index].join(':')
|
63
|
+
|
64
|
+
device_type = device.type.to_s == 'GPU' ? :gpu : :cpu
|
65
|
+
|
66
|
+
OpenclDevice.new(uri, device_type, self).tap do |d|
|
67
|
+
d.native_device = device
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
##
|
72
|
+
# Select the best device available in the system for this evaluator
|
73
|
+
def self.default_device
|
74
|
+
devices = OpenclEvaluator.query_devices_with_score
|
75
|
+
device = devices.sort { |a| a[1] }.reverse.first
|
76
|
+
opencl_to_device(device)
|
46
77
|
end
|
47
78
|
|
48
79
|
# opencl evaluator main entrypoint
|
49
80
|
def run(tensor, execution_context)
|
50
|
-
_create_opencl_context
|
51
|
-
create_command_queue
|
52
81
|
read_final_result(complete_eval(tensor, execution_context))
|
53
82
|
end
|
54
83
|
|
84
|
+
def run_with_buffer(tensor, context, execution_context)
|
85
|
+
@context = context
|
86
|
+
@context[:_cache][:_cl_buffers] ||= {} if context[:_cache]
|
87
|
+
|
88
|
+
if tensor.is_a?(Array)
|
89
|
+
tensor.collect do |t|
|
90
|
+
value = run(t, execution_context)
|
91
|
+
Buffer.new(data_type: t.data_type, buffer: value)
|
92
|
+
end
|
93
|
+
else
|
94
|
+
value = run(tensor, execution_context)
|
95
|
+
Buffer.new(data_type: tensor.data_type, buffer: value)
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
def convert_from_buffer(tensor, result)
|
100
|
+
convert_to_opencl([result.buffer].flatten, shape_eval(result.buffer), data_type: result.data_type, name: tensor.name)
|
101
|
+
end
|
102
|
+
|
55
103
|
def complete_eval(tensor, context)
|
56
104
|
buffer = _run(tensor, context)
|
57
105
|
if buffer.is_a?(Array)
|
@@ -69,11 +117,25 @@ module TensorStream
|
|
69
117
|
end
|
70
118
|
|
71
119
|
def opencl_device
|
72
|
-
@
|
120
|
+
@opencl_device
|
73
121
|
end
|
74
122
|
|
75
123
|
protected
|
76
124
|
|
125
|
+
def prepare_input(tensor, context, options = {})
|
126
|
+
return nil unless tensor
|
127
|
+
tensor = resolve_placeholder(tensor)
|
128
|
+
if options[:noop]
|
129
|
+
tensor
|
130
|
+
elsif options[:buffer]
|
131
|
+
complete_eval(tensor, context)
|
132
|
+
elsif options[:complete]
|
133
|
+
read_final_result(complete_eval(tensor, context))
|
134
|
+
else
|
135
|
+
_run(tensor, context)
|
136
|
+
end
|
137
|
+
end
|
138
|
+
|
77
139
|
# read result from opencl and convert to ruby
|
78
140
|
def read_final_result(buffer)
|
79
141
|
return buffer.map { |b| read_final_result(b) } if buffer.is_a?(Array)
|
@@ -82,43 +144,37 @@ module TensorStream
|
|
82
144
|
buffer.to_ruby
|
83
145
|
end
|
84
146
|
|
85
|
-
def _create_opencl_context
|
86
|
-
@
|
87
|
-
if @preferred_device
|
88
|
-
@preferred_device
|
89
|
-
else
|
90
|
-
device, _score, _platform, _index = choose_best_device
|
91
|
-
# puts "using #{device.name}"
|
92
|
-
device
|
93
|
-
end
|
94
|
-
end
|
95
|
-
@context[:cl_device] = opencl_device
|
96
|
-
@context[:_cache][:_opencl_context] ||= OpenCL.create_context(opencl_device)
|
147
|
+
def _create_opencl_context(opencl_device)
|
148
|
+
@opencl_context = OpenCL.create_context(opencl_device)
|
97
149
|
end
|
98
150
|
|
99
151
|
def choose_best_device
|
100
152
|
@best_device ||= begin
|
101
|
-
devices =
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
if d.type.to_s == 'CPU'
|
106
|
-
score += 1
|
107
|
-
elsif d.type.to_s == 'GPU'
|
108
|
-
score += 4
|
109
|
-
end
|
153
|
+
devices = OpenclEvaluator.query_devices_with_score
|
154
|
+
devices.sort { |a| a[1] }.reverse.first
|
155
|
+
end
|
156
|
+
end
|
110
157
|
|
111
|
-
|
112
|
-
|
113
|
-
end
|
158
|
+
def self.query_devices_with_score
|
159
|
+
OpenCL.platforms.flat_map do |p|
|
114
160
|
|
115
|
-
|
116
|
-
|
161
|
+
p.devices.select { |d| d.available > 0 }.each_with_index.collect do |d, index|
|
162
|
+
score = 0
|
163
|
+
if d.type.to_s == 'CPU'
|
164
|
+
score += 1
|
165
|
+
elsif d.type.to_s == 'GPU'
|
166
|
+
score += 4
|
167
|
+
end
|
117
168
|
|
118
|
-
|
169
|
+
if d.platform.name == 'NVIDIA CUDA'
|
170
|
+
score += 1000
|
119
171
|
end
|
172
|
+
|
173
|
+
score += d.max_compute_units
|
174
|
+
score += d.max_clock_frequency
|
175
|
+
|
176
|
+
[d, score, p.name, index]
|
120
177
|
end
|
121
|
-
devices.sort { |a| a[1] }.reverse.first
|
122
178
|
end
|
123
179
|
end
|
124
180
|
|
@@ -127,15 +183,15 @@ module TensorStream
|
|
127
183
|
properties = []
|
128
184
|
properties << OpenCL::CommandQueue::PROFILING_ENABLE if supported_proprties.include?('PROFILING_ENABLE')
|
129
185
|
properties << OpenCL::CommandQueue::OUT_OF_ORDER_EXEC_MODE_ENABLE if supported_proprties.include?('OUT_OF_ORDER_EXEC_MODE_ENABLE')
|
130
|
-
@
|
186
|
+
@command_queue = _opencl_context.create_command_queue(opencl_device, properties: properties)
|
131
187
|
end
|
132
188
|
|
133
189
|
def _opencl_context
|
134
|
-
@
|
190
|
+
@opencl_context
|
135
191
|
end
|
136
192
|
|
137
193
|
def _opencl_queue
|
138
|
-
@
|
194
|
+
@command_queue
|
139
195
|
end
|
140
196
|
|
141
197
|
def cl_template_path(kernel, extension)
|
@@ -144,7 +200,7 @@ module TensorStream
|
|
144
200
|
|
145
201
|
def _cl_program(kernel, args = {})
|
146
202
|
suffix = args.collect { |k,v| "#{k}.#{v}"}.join('.')
|
147
|
-
@context[:_cache]["_opencl_kernel_#{kernel}.#{suffix}"] ||= begin
|
203
|
+
@context[:_cache]["_opencl_kernel_#{kernel}.#{suffix}:#{object_id}"] ||= begin
|
148
204
|
filename = %w[cl.erb cl].map { |ext| cl_template_path(kernel, ext) }.find { |n| File.exist?(n) }
|
149
205
|
source = File.read(filename)
|
150
206
|
source = OpenclTemplateHelper.new(source).generate(args)
|
@@ -163,13 +219,16 @@ module TensorStream
|
|
163
219
|
return tensor.map { |t| _run(t, execution_context) }
|
164
220
|
end
|
165
221
|
|
166
|
-
return tensor if retain.include?(tensor) # if var is in retain don't eval to value
|
167
|
-
|
168
222
|
tensor = tensor.call if tensor.is_a?(Proc)
|
169
223
|
|
170
224
|
child_context = execution_context.dup
|
171
225
|
res = if tensor.is_a?(Operation)
|
172
|
-
|
226
|
+
if !self.class.ops.include?(tensor.operation.to_sym)
|
227
|
+
result = @session.delegate_to_evaluator(tensor, @context, execution_context)
|
228
|
+
convert_from_buffer(tensor, result)
|
229
|
+
else
|
230
|
+
eval_operation(tensor, child_context)
|
231
|
+
end
|
173
232
|
elsif tensor.is_a?(Variable)
|
174
233
|
eval_variable(tensor, child_context)
|
175
234
|
elsif tensor.is_a?(Placeholder)
|
@@ -187,415 +246,306 @@ module TensorStream
|
|
187
246
|
tensor.buffer
|
188
247
|
end
|
189
248
|
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
return @context[cache_key] if @context.key?(cache_key)
|
194
|
-
a = resolve_placeholder(tensor.items[0], child_context) if tensor.items && tensor.items[0]
|
195
|
-
b = resolve_placeholder(tensor.items[1], child_context) if tensor.items && tensor.items[1]
|
196
|
-
# puts tensor.name
|
197
|
-
case tensor.operation
|
198
|
-
when :concat
|
199
|
-
input_a = read_final_result(complete_eval(a, child_context))
|
200
|
-
arr = concat_array(input_a, tensor.options[:axis])
|
201
|
-
convert_to_opencl(arr.flatten, shape_eval(arr), data_type: tensor.data_type, name: tensor.name)
|
202
|
-
when :cond
|
203
|
-
pred = complete_eval(tensor.options[:pred], child_context)
|
204
|
-
a = _run(a, child_context)
|
205
|
-
b = _run(b, child_context)
|
206
|
-
|
207
|
-
if all_true?(pred.buffer)
|
208
|
-
a
|
209
|
-
else
|
210
|
-
b
|
211
|
-
end
|
212
|
-
when :identity
|
213
|
-
_run(a, child_context)
|
214
|
-
when :eye
|
215
|
-
rows = complete_eval(a, child_context)
|
216
|
-
columns = complete_eval(b, child_context)
|
217
|
-
shape = [rows.buffer[0], columns.buffer[0]]
|
218
|
-
eye_arr = Array.new(rows.buffer[0]) do |i|
|
219
|
-
Array.new(columns.buffer[0]) do |col|
|
220
|
-
if fp_type?(tensor.data_type)
|
221
|
-
i == col ? 1.0 : 0.0
|
222
|
-
else
|
223
|
-
i == col ? 1 : 0
|
224
|
-
end
|
225
|
-
end
|
226
|
-
end
|
249
|
+
register_op :log do |context, tensor, inputs|
|
250
|
+
execute_func('log', tensor, inputs[0], context)
|
251
|
+
end
|
227
252
|
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
p = read_final_result(complete_eval(tensor.options[:paddings], child_context))
|
232
|
-
|
233
|
-
padding = arr_pad(a, p, tensor.data_type)
|
234
|
-
convert_to_opencl(padding.flatten, shape_eval(padding), data_type: tensor.data_type, name: tensor.name)
|
235
|
-
when :tile
|
236
|
-
input = read_final_result(complete_eval(a, child_context))
|
237
|
-
multiples = read_final_result(complete_eval(b, child_context))
|
238
|
-
|
239
|
-
rank = get_rank(input)
|
240
|
-
raise '1D or higher tensor required' if rank.zero?
|
241
|
-
raise "invalid multiple size passed #{rank} != #{multiples.size}" if rank != multiples.size
|
242
|
-
|
243
|
-
tile = tile_arr(input, 0, multiples)
|
244
|
-
arr = tile.nil? ? [] : tile
|
245
|
-
convert_to_opencl(arr.flatten, shape_eval(arr), data_type: tensor.data_type, name: tensor.name)
|
246
|
-
when :assign
|
247
|
-
assign_var(tensor, b, child_context)
|
248
|
-
when :assign_add
|
249
|
-
a = _run(a, child_context)
|
250
|
-
b = _run(b, child_context)
|
251
|
-
value = execute_2_operand_func('add', tensor, a, b, child_context)
|
252
|
-
assign_var(tensor, value, child_context)
|
253
|
-
when :assign_sub
|
254
|
-
a = _run(a, child_context)
|
255
|
-
b = _run(b, child_context)
|
256
|
-
|
257
|
-
value = execute_2_operand_func('sub', tensor, a, b, child_context)
|
258
|
-
assign_var(tensor, value, child_context)
|
259
|
-
when :less
|
260
|
-
execute_2_operand_func('less', tensor, a, b, child_context, 'cond')
|
261
|
-
when :less_equal
|
262
|
-
execute_2_operand_func('less_equal', tensor, a, b, child_context, 'cond')
|
263
|
-
when :greater
|
264
|
-
execute_2_operand_func('greater', tensor, a, b, child_context, 'cond')
|
265
|
-
when :greater_equal
|
266
|
-
execute_2_operand_func('greater_equal', tensor, a, b, child_context, 'cond')
|
267
|
-
when :equal
|
268
|
-
execute_2_operand_func('equal', tensor, a, b, child_context, 'cond')
|
269
|
-
when :not_equal
|
270
|
-
execute_2_operand_func('not_equal', tensor, a, b, child_context, 'cond')
|
271
|
-
when :logical_and
|
272
|
-
execute_2_operand_func('logical_and', tensor, a, b, child_context, 'cond')
|
273
|
-
when :where
|
274
|
-
pred = tensor.options[:pred]
|
275
|
-
execute_cond_func('where', tensor, pred, a, b, child_context)
|
276
|
-
when :max
|
277
|
-
execute_2_operand_func('max', tensor, a, b, child_context)
|
278
|
-
when :add
|
279
|
-
execute_2_operand_func('add', tensor, a, b, child_context)
|
280
|
-
when :div
|
281
|
-
execute_2_operand_func('div', tensor, a, b, child_context)
|
282
|
-
when :sub
|
283
|
-
execute_2_operand_func('sub', tensor, a, b, child_context)
|
284
|
-
when :matmul
|
285
|
-
a = _run(a, child_context)
|
286
|
-
b = _run(b, child_context)
|
287
|
-
|
288
|
-
m = a.shape[0]
|
289
|
-
n = b.shape[1]
|
290
|
-
v = b.shape[0]
|
291
|
-
k = a.shape[1]
|
292
|
-
|
293
|
-
m, k = [a.shape[1], a.shape[0]] if tensor.options[:transpose_a]
|
294
|
-
n, v = [b.shape[0], b.shape[1]] if tensor.options[:transpose_b]
|
295
|
-
|
296
|
-
result_shape = [m, n]
|
297
|
-
|
298
|
-
raise "#{tensor.items[0].name} rank must be greater than 1" if a.shape.size < 2
|
299
|
-
raise "#{tensor.items[1].name} rank must be greater than 1" if b.shape.size < 2
|
300
|
-
raise "incompatible shape sizes for matrix multiplication (#{a.shape[1]} != #{b.shape[0]}) #{a.shape} vs #{b.shape}" if k != v
|
301
|
-
|
302
|
-
dtype = tensor.data_type
|
303
|
-
a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
|
304
|
-
output_buffer = _create_result_buffer(a.data_type, result_shape, tensor.name)
|
305
|
-
|
306
|
-
cl_m = OpenCL::Int1.new(m)
|
307
|
-
cl_n = OpenCL::Int1.new(n)
|
308
|
-
cl_k = OpenCL::Int1.new(k)
|
309
|
-
|
310
|
-
transpose_a = OpenCL::Int1.new(tensor.options[:transpose_a] ? 1 : 0)
|
311
|
-
transpose_b = OpenCL::Int1.new(tensor.options[:transpose_b] ? 1 : 0)
|
312
|
-
|
313
|
-
output_buffer.op = _cl_program('gemm', dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer)
|
314
|
-
output_buffer
|
315
|
-
when :mul
|
316
|
-
execute_2_operand_func('mul', tensor, a, b, child_context)
|
317
|
-
when :pow
|
318
|
-
execute_2_operand_func('pow', tensor, a, b, child_context)
|
319
|
-
when :cast
|
320
|
-
a = _run(a, child_context)
|
321
|
-
if a.data_type != tensor.data_type
|
322
|
-
buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
|
323
|
-
m, n = a.shape
|
324
|
-
cl_m = OpenCL::Int1.new(m || 1)
|
325
|
-
cl_n = OpenCL::Int1.new(n || 1)
|
326
|
-
work_group = [m || 1, n || 1]
|
327
|
-
|
328
|
-
buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: tensor.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer)
|
329
|
-
buffer
|
330
|
-
else
|
331
|
-
a
|
332
|
-
end
|
333
|
-
when :sign
|
334
|
-
execute_func('sign', tensor, a, child_context)
|
335
|
-
when :exp
|
336
|
-
execute_func('exp', tensor, a, child_context)
|
337
|
-
when :log
|
338
|
-
execute_func('log', tensor, a, child_context)
|
339
|
-
when :sin
|
340
|
-
execute_func('sin', tensor, a, child_context)
|
341
|
-
when :tan
|
342
|
-
execute_func('tan', tensor, a, child_context)
|
343
|
-
when :cos
|
344
|
-
execute_func('cos', tensor, a, child_context)
|
345
|
-
when :abs
|
346
|
-
execute_func('abs', tensor, a, child_context)
|
347
|
-
when :sqrt
|
348
|
-
execute_func('sqrt', tensor, a, child_context)
|
349
|
-
when :negate
|
350
|
-
execute_func('negate', tensor, a, child_context)
|
351
|
-
when :square
|
352
|
-
execute_func('square', tensor, a, child_context)
|
353
|
-
when :reciprocal
|
354
|
-
execute_func('reciprocal', tensor, a, child_context)
|
355
|
-
when :tanh
|
356
|
-
execute_func('tanh', tensor, a, child_context)
|
357
|
-
when :tanh_grad
|
358
|
-
execute_func('tanh_grad', tensor, a, child_context)
|
359
|
-
when :sigmoid
|
360
|
-
execute_func('sigmoid', tensor, a, child_context)
|
361
|
-
when :log1p
|
362
|
-
execute_func('log1p', tensor, a, child_context)
|
363
|
-
when :round
|
364
|
-
execute_func('round', tensor, a, child_context)
|
365
|
-
when :softmax
|
366
|
-
a = _run(a, child_context)
|
367
|
-
event_wait_list = [a.op].compact
|
368
|
-
dtype = tensor.data_type
|
369
|
-
output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
|
253
|
+
register_op :sin do |context, tensor, inputs|
|
254
|
+
execute_func('sin', tensor, inputs[0], context)
|
255
|
+
end
|
370
256
|
|
371
|
-
|
372
|
-
|
373
|
-
n = m if n.nil?
|
374
|
-
cl_n = OpenCL::Int1.new(n || 1)
|
257
|
+
register_op :cond do |context, tensor, inputs|
|
258
|
+
pred = complete_eval(tensor.options[:pred], context)
|
375
259
|
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
260
|
+
if all_true?(pred.buffer)
|
261
|
+
inputs[0]
|
262
|
+
else
|
263
|
+
inputs[1]
|
264
|
+
end
|
265
|
+
end
|
266
|
+
|
267
|
+
register_op :identity do |_context, _tensor, inputs|
|
268
|
+
inputs[0]
|
269
|
+
end
|
270
|
+
|
271
|
+
register_op :assign, noop: true do |context, tensor, inputs|
|
272
|
+
assign_var(tensor, inputs[1], context)
|
273
|
+
end
|
274
|
+
|
275
|
+
register_op :assign_add do |context, tensor, inputs|
|
276
|
+
value = execute_2_operand_func('add', tensor, inputs[0], inputs[1], context)
|
277
|
+
assign_var(tensor, value, context)
|
278
|
+
end
|
279
|
+
|
280
|
+
register_op :assign_sub do |context, tensor, inputs|
|
281
|
+
value = execute_2_operand_func('sub', tensor, inputs[0], inputs[1], context)
|
282
|
+
assign_var(tensor, value, context)
|
283
|
+
end
|
284
|
+
|
285
|
+
# Registers one handler per comparison / logical op. Each handler forwards
# both inputs to execute_2_operand_func, passing 'cond' as the trailing
# argument (presumably the name of the shared program template -- confirm
# against execute_2_operand_func).
%i[less less_equal greater greater_equal equal not_equal logical_and].each do |comparison|
  register_op comparison, noop: true do |context, tensor, inputs|
    lhs, rhs = inputs
    execute_2_operand_func(comparison.to_s, tensor, lhs, rhs, context, 'cond')
  end
end
|
385
290
|
|
291
|
+
# Registers one handler per two-operand arithmetic op; the op's own name is
# used as the function name handed to execute_2_operand_func.
%i[max add div sub mul pow sigmoid_grad].each do |binary_op|
  register_op binary_op, noop: true do |context, tensor, inputs|
    lhs, rhs = inputs
    execute_2_operand_func(binary_op.to_s, tensor, lhs, rhs, context)
  end
end
|
296
|
+
|
297
|
+
# Registers the :where op: the predicate from tensor.options[:pred] and both
# inputs are forwarded to execute_cond_func under the name 'where'.
register_op :where, noop: true do |context, tensor, inputs|
  when_true, when_false = inputs
  execute_cond_func('where', tensor, tensor.options[:pred], when_true, when_false, context)
end
|
301
|
+
|
302
|
+
# Registers the :matmul op, which multiplies two buffers of rank >= 2 through
# the 'gemm' OpenCL program.
#
# tensor.options[:transpose_a] / [:transpose_b] swap which dimension of the
# corresponding operand is treated as the inner (contracted) dimension.
#
# Raises RuntimeError when either operand has rank < 2 or when the inner
# dimensions disagree.
# Returns the output buffer, whose #op holds the enqueued kernel event.
register_op :matmul do |_context, tensor, inputs|
  a, b = inputs

  # Validate rank before indexing into the shapes.
  raise "#{tensor.inputs[0].name} rank must be greater than 1" if a.shape.size < 2
  raise "#{tensor.inputs[1].name} rank must be greater than 1" if b.shape.size < 2

  # m x k times v x n; k and v are the dimensions that must agree.
  m, k = tensor.options[:transpose_a] ? [a.shape[1], a.shape[0]] : [a.shape[0], a.shape[1]]
  n, v = tensor.options[:transpose_b] ? [b.shape[0], b.shape[1]] : [b.shape[1], b.shape[0]]

  # Report the dimensions actually compared, so the message stays accurate
  # when a transpose flag is set.
  raise "incompatible shape sizes for matrix multiplication (#{k} != #{v}) #{a.shape} vs #{b.shape}" if k != v

  result_shape = [m, n]
  dtype = tensor.data_type
  a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
  output_buffer = _create_result_buffer(a.data_type, result_shape, tensor.name)

  cl_m = OpenCL::Int1.new(m)
  cl_n = OpenCL::Int1.new(n)
  cl_k = OpenCL::Int1.new(k)

  transpose_a = OpenCL::Int1.new(tensor.options[:transpose_a] ? 1 : 0)
  transpose_b = OpenCL::Int1.new(tensor.options[:transpose_b] ? 1 : 0)

  # Wait on any pending events of the (possibly type-cast) operands, in line
  # with how the softmax and two-operand kernels in this file are enqueued.
  event_wait_list = [a.op, b.op].compact

  output_buffer.op = _cl_program('gemm', dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
  output_buffer
end
|
333
|
+
|
334
|
+
# Registers the :cast op. When the input already has the tensor's data type
# it is returned as-is; otherwise the 'cast' OpenCL program converts it into
# a freshly created result buffer, which is returned with #op set to the
# enqueued kernel event.
register_op :cast do |_context, tensor, inputs|
  a = inputs[0]
  if a.data_type == tensor.data_type
    a
  else
    buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
    m, n = a.shape
    # Missing dimensions (rank 0 or 1) are treated as size 1.
    cl_m = OpenCL::Int1.new(m || 1)
    cl_n = OpenCL::Int1.new(n || 1)
    work_group = [m || 1, n || 1]

    # Wait on the input's pending event, if any, in line with how the other
    # kernels in this file are enqueued.
    event_wait_list = [a.op].compact

    buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: tensor.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer, event_wait_list: event_wait_list)
    buffer
  end
end
|
349
|
+
|
350
|
+
# Registers one handler per single-operand op; the op's own name is used as
# the function name handed to execute_func along with the first input.
%i[sign exp tan cos abs sqrt negate square reciprocal tanh tanh_grad sigmoid log1p round].each do |unary_op|
  register_op unary_op, noop: true do |context, tensor, inputs|
    operand, = inputs
    execute_func(unary_op.to_s, tensor, operand, context)
  end
end
|
355
|
+
|
356
|
+
# Registers the :softmax op. Enqueues the "softmax" OpenCL program with one
# work item per entry of the first dimension and returns the result buffer,
# whose #op holds the enqueued kernel event.
register_op :softmax do |_context, tensor, inputs|
  source = inputs[0]
  pending_events = [source.op].compact
  dtype = tensor.data_type
  result = _create_result_buffer(tensor.data_type, source.shape, tensor.name)

  rows, cols = source.shape
  global_size = [rows]
  # A rank-1 input has no second dimension; reuse the first one.
  cols = rows if cols.nil?
  cl_n = OpenCL::Int1.new(cols || 1)

  result.op = _cl_program("softmax", dtype: dtype).send(:"softmax_#{dtype}", _opencl_queue, global_size, cl_n, source.cl_buffer, result.cl_buffer, event_wait_list: pending_events)
  result
end
|
371
|
+
|
372
|
+
# Registers the :softmax_grad op. Enqueues the 'softmax_grad' OpenCL program
# over the first input and the incoming gradient, and returns the result
# buffer whose #op holds the enqueued kernel event.
register_op :softmax_grad do |_context, tensor, inputs|
  a, grad = inputs

  # The kernel reads both buffers, so wait on the pending events of both.
  event_wait_list = [a.op, grad.op].compact
  dtype = tensor.data_type
  output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)

  m, n = a.shape
  work_group = [m]
  # A rank-1 input has no second dimension; reuse the first one.
  n = m if n.nil?
  cl_n = OpenCL::Int1.new(n || 1)

  event = _cl_program('softmax_grad', dtype: dtype, size: n).send(:"softmax_grad_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
  output_buffer.op = event
  output_buffer
end
|
387
|
+
|
388
|
+
# Registers the :truncate op. The second input supplies the requested shape.
# - A rank-0 input, or an input already of the requested shape, is returned
#   unchanged.
# - When the requested shape is [] the input is reduced to a scalar: a
#   single-element buffer just has its shape rewritten, otherwise the first
#   element is wrapped in a new buffer.
# - Any other shape goes through the truncate helper on the host values.
register_op :truncate do |context, tensor, inputs|
  source, shape_input = inputs
  if source.shape.size.zero?
    source
  else
    wanted = read_final_result(shape_input)
    if source.shape == wanted
      source
    else
      # Read back before branching, exactly as the nested form did.
      host_values = read_final_result(source)
      if wanted != []
        wrap_opencl(truncate(host_values, wanted), data_type: source.data_type, name: tensor.name)
      elsif source.buffer.size == 1
        source.shape = wanted
        source
      else
        wrap_opencl(source.buffer[0], data_type: source.data_type, name: tensor.name)
      end
    end
  end
end
|
422
411
|
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
a
|
427
|
-
when :zeros, :ones, :zeros_like, :ones_like
|
428
|
-
shape = if %i[zeros_like ones_like].include?(tensor.operation)
|
429
|
-
_run(a, child_context).shape
|
430
|
-
else
|
431
|
-
read_final_result(complete_eval(a, child_context)) || tensor.shape.shape
|
432
|
-
end
|
412
|
+
# Registers the :check_numerics op. Fully evaluates the first input and
# raises RuntimeError ("<name> Invalid Argument", with the name taken from
# tensor.options[:name]) if any element is NaN or infinite; otherwise the
# evaluated input is returned.
register_op :check_numerics, noop: true do |context, tensor, inputs|
  a = complete_eval(inputs[0], context)
  name = tensor.options[:name]

  a.buffer.each do |input|
    # Integer elements do not define nan? and can be neither NaN nor
    # infinite, so only values that respond to nan? are inspected.
    not_finite = input.respond_to?(:nan?) && (input.nan? || input.infinite?)
    raise "#{name} Invalid Argument" if not_finite
  end
  a
end
|
439
421
|
|
440
|
-
|
422
|
+
# Registers the :broadcast_transform op. Inputs with equal shapes are
# returned as a pair unchanged; otherwise both are read back, passed through
# broadcast, and re-wrapped as "<tensor name>_a" / "<tensor name>_b".
register_op :broadcast_transform do |context, tensor, inputs|
  lhs, rhs = inputs

  if lhs.shape != rhs.shape
    host_lhs = read_final_result(complete_eval(lhs, context))
    host_rhs = read_final_result(complete_eval(rhs, context))
    wide_lhs, wide_rhs = broadcast(host_lhs, host_rhs)
    # NOTE(review): both results take the first input's data type -- confirm
    # this is intended for the second result as well.
    [
      wrap_opencl(wide_lhs, data_type: lhs.data_type, name: "#{tensor.name}_a"),
      wrap_opencl(wide_rhs, data_type: lhs.data_type, name: "#{tensor.name}_b")
    ]
  else
    [lhs, rhs]
  end
end
|
457
435
|
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
|
487
|
-
|
488
|
-
|
489
|
-
|
490
|
-
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
when :index
|
500
|
-
a = complete_eval(a, child_context)
|
501
|
-
input_a = read_final_result(a)
|
502
|
-
index = read_final_result(complete_eval(b, child_context))
|
503
|
-
|
504
|
-
if a.is_a?(Array)
|
505
|
-
a[index]
|
506
|
-
else
|
507
|
-
new_shape = a.shape.dup
|
508
|
-
new_shape.shift
|
509
|
-
convert_to_opencl(input_a[index], new_shape, data_type: a.data_type, name: tensor.name)
|
510
|
-
end
|
511
|
-
when :broadcast_gradient_args
|
512
|
-
a = complete_eval(a, child_context)
|
513
|
-
b = complete_eval(b, child_context)
|
514
|
-
|
515
|
-
wrap_opencl(get_broadcast_gradient_args(a.buffer.to_a, b.buffer.to_a), data_type: a.data_type, name: tensor.name)
|
516
|
-
when :shape
|
517
|
-
a = _run(a, child_context)
|
518
|
-
|
519
|
-
wrap_opencl(a.shape, name: tensor.name, data_type: tensor.options[:out_type] || :float32)
|
520
|
-
when :reshape
|
521
|
-
arr = complete_eval(a, child_context)
|
522
|
-
new_shape = read_final_result(complete_eval(b, child_context))
|
523
|
-
|
524
|
-
if new_shape.size.zero? && arr.buffer.size == 1
|
525
|
-
arr.shape = new_shape
|
526
|
-
arr
|
527
|
-
else
|
528
|
-
new_shape = TensorShape.fix_inferred_elements(new_shape, arr.buffer.size)
|
529
|
-
arr.shape = new_shape
|
530
|
-
arr
|
531
|
-
end
|
532
|
-
when :random_uniform
|
533
|
-
maxval = tensor.options.fetch(:maxval, 1)
|
534
|
-
minval = tensor.options.fetch(:minval, 0)
|
535
|
-
seed = tensor.options[:seed]
|
536
|
-
|
537
|
-
random = _get_randomizer(tensor, seed)
|
538
|
-
generator = -> { random.rand * (maxval - minval) + minval }
|
539
|
-
shape = tensor.options[:shape] || tensor.shape.shape
|
540
|
-
|
541
|
-
convert_to_opencl(generate_vector(shape, generator: generator), shape, data_type: tensor.data_type, name: tensor.name)
|
542
|
-
when :random_normal
|
543
|
-
random = _get_randomizer(tensor, seed)
|
544
|
-
r = RandomGaussian.new(tensor.options.fetch(:mean), tensor.options.fetch(:stddev), -> { random.rand })
|
545
|
-
random = _get_randomizer(tensor, seed)
|
546
|
-
generator = -> { r.rand }
|
547
|
-
shape = tensor.options[:shape] || tensor.shape.shape
|
548
|
-
|
549
|
-
convert_to_opencl(generate_vector(shape, generator: generator), shape, data_type: tensor.data_type, name: tensor.name)
|
550
|
-
when :glorot_uniform
|
551
|
-
random = _get_randomizer(tensor, seed)
|
552
|
-
|
553
|
-
shape = tensor.options[:shape] || tensor.shape.shape
|
554
|
-
fan_in, fan_out = if shape.size.zero?
|
555
|
-
[1, 1]
|
556
|
-
elsif shape.size == 1
|
557
|
-
[1, shape[0]]
|
558
|
-
else
|
559
|
-
[shape[0], shape.last]
|
560
|
-
end
|
561
|
-
|
562
|
-
limit = Math.sqrt(6.0 / (fan_in + fan_out))
|
563
|
-
|
564
|
-
minval = -limit
|
565
|
-
maxval = limit
|
566
|
-
|
567
|
-
generator = -> { random.rand * (maxval - minval) + minval }
|
568
|
-
convert_to_opencl(generate_vector(shape, generator: generator), shape, data_type: tensor.data_type, name: tensor.name)
|
569
|
-
when :flow_group
|
570
|
-
tensor.items.collect { |item| _run(item, child_context) }
|
571
|
-
when :sum
|
572
|
-
reduction(child_context, tensor, a, b, :sum)
|
573
|
-
when :mean
|
574
|
-
reduction(child_context, tensor, a, b, :mean)
|
575
|
-
when :prod
|
576
|
-
input_a = complete_eval(a, child_context)
|
577
|
-
if input_a.buffer.empty?
|
578
|
-
convert_to_opencl([1.0], [], data_type: a.data_type, name: tensor.name)
|
579
|
-
else
|
580
|
-
reduction(child_context, tensor, a, b, :prod)
|
581
|
-
end
|
582
|
-
when :argmin
|
583
|
-
a = complete_eval(a, child_context)
|
584
|
-
axis = tensor.options[:axis] || 0
|
585
|
-
arr = a.buffer.reshape(*a.shape.reverse).to_a
|
586
|
-
op = get_op_with_axis(arr, axis, 0, a.data_type, ->(a, b) { a < b })
|
587
|
-
convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
|
588
|
-
when :argmax
|
589
|
-
a = complete_eval(a, child_context)
|
590
|
-
axis = tensor.options[:axis] || 0
|
591
|
-
arr = a.buffer.reshape(*a.shape.reverse).to_a
|
592
|
-
op = get_op_with_axis(arr, axis, 0, a.data_type, ->(a, b) { a > b })
|
593
|
-
convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
|
436
|
+
# Registers the :print op: writes the optional :message followed by the
# evaluated second input to stdout, then returns the first input.
register_op :print do |context, tensor, inputs|
  passthrough, data = inputs
  printable = read_final_result(complete_eval(data, context))
  puts "#{tensor.options.fetch(:message, '')} #{printable}"
  passthrough
end
|
443
|
+
|
444
|
+
# Registers the :rank op: wraps the number of dimensions of the first input.
register_op :rank do |_context, tensor, inputs|
  dimensions = inputs[0].shape.size
  wrap_opencl(dimensions, data_type: tensor.data_type, name: tensor.name)
end
|
447
|
+
|
448
|
+
# Registers the :stop_gradient op, which returns its first input unchanged.
register_op :stop_gradient do |_context, _tensor, inputs|
  passthrough, = inputs
  passthrough
end
|
451
|
+
|
452
|
+
# Registers the :slice op. The second input gives the start offsets and
# tensor.options[:size] the extent along each dimension; the selected region
# is copied into a new buffer.
# Shapes and index ranges are reversed around the NArray calls (presumably
# because NArray orders dimensions opposite to the tensor shape -- confirm).
register_op :slice, noop: true do |context, tensor, inputs|
  source = complete_eval(inputs[0], context)
  offsets = read_final_result(complete_eval(inputs[1], context))
  size = tensor.options[:size]

  ranges = offsets.zip(size).map { |start, length| start..(start + length - 1) }.reverse

  shaped = source.buffer.reshape(*source.shape.reverse)
  sliced = shaped.slice[*ranges]
  convert_to_opencl(sliced.flatten, sliced.shape.reverse, data_type: inputs[0].data_type, name: tensor.name)
end
|
463
|
+
|
464
|
+
# Registers the :transpose op: reshapes the first input's buffer using its
# reversed shape, transposes with the axis order fully reversed, and converts
# the flattened result into a new buffer.
register_op :transpose, buffer: true do |_context, tensor, inputs|
  source = inputs[0]
  reversed_axes = (0...source.shape.size).to_a.reverse
  transposed = source.buffer.reshape(*source.shape.reverse).transpose(*reversed_axes)
  convert_to_opencl(transposed.flatten, transposed.shape.reverse, data_type: source.data_type, name: tensor.name)
end
|
469
|
+
|
470
|
+
# Registers the :index op: selects the entry at the position given by the
# second input. A plain Array input is indexed directly; otherwise the
# selected host value is converted into a buffer whose shape is the input's
# shape without its first dimension.
register_op :index, buffer: true do |_context, tensor, inputs|
  source = inputs[0]
  host_values = read_final_result(source)
  position = read_final_result(inputs[1])

  if source.is_a?(Array)
    source[position]
  else
    remaining_shape = source.shape.drop(1)
    convert_to_opencl(host_values[position], remaining_shape, data_type: source.data_type, name: tensor.name)
  end
end
|
483
|
+
|
484
|
+
# Registers the :broadcast_gradient_args op: both input buffers are turned
# into arrays, passed to get_broadcast_gradient_args, and the result wrapped
# using the first input's data type.
register_op :broadcast_gradient_args, buffer: true do |_context, tensor, inputs|
  lhs_values = inputs[0].buffer.to_a
  rhs_values = inputs[1].buffer.to_a
  wrap_opencl(get_broadcast_gradient_args(lhs_values, rhs_values), data_type: inputs[0].data_type, name: tensor.name)
end
|
487
|
+
|
488
|
+
# Registers the :shape op: wraps the first input's shape, typed as
# tensor.options[:out_type] or :float32 when that option is unset.
register_op :shape do |_context, tensor, inputs|
  out_type = tensor.options[:out_type] || :float32
  wrap_opencl(inputs[0].shape, name: tensor.name, data_type: out_type)
end
|
491
|
+
|
492
|
+
# Registers the :reshape op. The first input's shape is rewritten in place
# (no data is copied) and the same object is returned. A request for shape []
# on a single-element buffer is applied verbatim; every other request first
# goes through TensorShape.fix_inferred_elements.
register_op :reshape, buffer: true do |_context, _tensor, inputs|
  target = inputs[0]
  requested = read_final_result(inputs[1])

  to_scalar = requested.size.zero? && target.buffer.size == 1
  target.shape = to_scalar ? requested : TensorShape.fix_inferred_elements(requested, target.buffer.size)
  target
end
|
505
|
+
|
506
|
+
# Registers the :flow_group op, whose result is simply its list of inputs.
register_op :flow_group do |_context, _tensor, grouped|
  grouped
end
|
509
|
+
|
510
|
+
# Registers the :sum and :mean ops; both delegate to reduction with the op's
# own name as the reduction kind.
%i[sum mean].each do |reducer|
  register_op reducer, noop: true do |context, tensor, inputs|
    values, axes = inputs
    reduction(context, tensor, values, axes, reducer.to_sym)
  end
end
|
515
|
+
|
516
|
+
# Registers the :prod op. An input whose evaluated buffer is empty yields a
# scalar 1.0; anything else is delegated to reduction with :prod.
register_op :prod, noop: true do |context, tensor, inputs|
  values, axes = inputs
  evaluated = complete_eval(values, context)
  if evaluated.buffer.empty?
    convert_to_opencl([1.0], [], data_type: values.data_type, name: tensor.name)
  else
    reduction(context, tensor, values, axes, :prod)
  end
end
|
524
|
+
|
525
|
+
# Registers the :argmin op: the first input's buffer is reshaped into a
# nested array and passed to get_op_with_axis with a "less than" comparator
# along tensor.options[:axis] (0 when unset).
register_op :argmin, buffer: true do |_context, tensor, inputs|
  source = inputs[0]
  axis = tensor.options[:axis] || 0
  nested = source.buffer.reshape(*source.shape.reverse).to_a
  smaller = ->(x, y) { x < y }
  picked = get_op_with_axis(nested, axis, 0, source.data_type, smaller)
  convert_to_opencl(picked, shape_eval(picked), data_type: tensor.data_type, name: tensor.name)
end
|
531
|
+
|
532
|
+
# Registers the :argmax op: the first input's buffer is reshaped into a
# nested array and passed to get_op_with_axis with a "greater than"
# comparator along tensor.options[:axis] (0 when unset).
register_op :argmax, buffer: true do |_context, tensor, inputs|
  source = inputs[0]
  axis = tensor.options[:axis] || 0
  nested = source.buffer.reshape(*source.shape.reverse).to_a
  larger = ->(x, y) { x > y }
  picked = get_op_with_axis(nested, axis, 0, source.data_type, larger)
  convert_to_opencl(picked, shape_eval(picked), data_type: tensor.data_type, name: tensor.name)
end
|
538
|
+
|
539
|
+
def eval_operation(tensor, child_context)
|
540
|
+
return @context[tensor.name] if @context.key?(tensor.name)
|
541
|
+
cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}:#{object_id}"
|
542
|
+
return @context[cache_key] if @context.key?(cache_key)
|
543
|
+
# puts tensor.name
|
544
|
+
invoke(tensor, child_context).tap do |result|
|
597
545
|
# puts "#{tensor.to_math(true,1)} = #{read_final_result(complete_eval(result, child_context))}"
|
598
546
|
if tensor.breakpoint
|
547
|
+
a = resolve_placeholder(tensor.inputs[0], child_context) if tensor.inputs && tensor.inputs[0]
|
548
|
+
b = resolve_placeholder(tensor.inputs[1], child_context) if tensor.inputs && tensor.inputs[1]
|
599
549
|
a = read_final_result(complete_eval(a, child_context))
|
600
550
|
b = read_final_result(complete_eval(b, child_context))
|
601
551
|
result = read_final_result(complete_eval(result, child_context))
|
@@ -642,7 +592,7 @@ module TensorStream
|
|
642
592
|
def eval_tensor(tensor, child_context)
|
643
593
|
return tensor unless tensor.is_a?(Tensor)
|
644
594
|
|
645
|
-
cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}"
|
595
|
+
cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}:#{object_id}"
|
646
596
|
return @context[cache_key] if @context.key?(cache_key)
|
647
597
|
return @context[:_cache][cache_key] if tensor.is_const && @context[:_cache][cache_key]
|
648
598
|
@context[cache_key] = if tensor.value.is_a?(Tensor)
|
@@ -656,7 +606,7 @@ module TensorStream
|
|
656
606
|
private
|
657
607
|
|
658
608
|
def assign_var(tensor, b, child_context)
|
659
|
-
assign = tensor.
|
609
|
+
assign = tensor.inputs[0] || tensor
|
660
610
|
buffer = complete_eval(b, child_context)
|
661
611
|
|
662
612
|
if assign.buffer
|
@@ -678,7 +628,7 @@ module TensorStream
|
|
678
628
|
dtype = tensor.data_type
|
679
629
|
result_shape = TensorShape.infer_shape(a.shape, b.shape)
|
680
630
|
|
681
|
-
output_buffer = _create_result_buffer(tensor.data_type, result_shape, tensor.name)
|
631
|
+
output_buffer = _create_result_buffer(tensor.data_type, result_shape, "out_#{tensor.name}")
|
682
632
|
a, b, prog, switch_operands = select_program(a, b, op_name)
|
683
633
|
m, n = result_shape
|
684
634
|
work_group = [m || 1, n || 1]
|
@@ -688,6 +638,7 @@ module TensorStream
|
|
688
638
|
|
689
639
|
event_wait_list = [a.op, b.op].compact # add dependency wait list
|
690
640
|
|
641
|
+
method_call = :"#{prog}_#{a.data_type}_#{b.data_type}"
|
691
642
|
event = if prog == "#{op_name}_b"
|
692
643
|
cl_m_b, cl_n_b = if b.shape.size == 2
|
693
644
|
[ OpenCL::Int1.new(b.shape[0]), OpenCL::Int1.new(b.shape[1]) ]
|
@@ -696,9 +647,9 @@ module TensorStream
|
|
696
647
|
else
|
697
648
|
raise "rank > 2 not supported!"
|
698
649
|
end
|
699
|
-
_cl_program("#{prog_name || op_name}", dtype: dtype).send(
|
650
|
+
_cl_program("#{prog_name || op_name}", a: a.data_type, b: b.data_type, dtype: dtype).send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_m_b, cl_n_b, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
700
651
|
else
|
701
|
-
_cl_program("#{prog_name || op_name}", dtype: dtype).send(
|
652
|
+
_cl_program("#{prog_name || op_name}", a: a.data_type, b: b.data_type, dtype: dtype).send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
702
653
|
end
|
703
654
|
|
704
655
|
output_buffer.op = event
|
@@ -784,7 +735,7 @@ module TensorStream
|
|
784
735
|
value = [value]
|
785
736
|
end
|
786
737
|
|
787
|
-
cache_key = "_cl_object_#{name}
|
738
|
+
cache_key = "_cl_object_#{name}:#{shape.join('_')}:#{object_id}"
|
788
739
|
cl_object = if name && @context[:_cache][cache_key]
|
789
740
|
@context[:_cache][cache_key]
|
790
741
|
else
|
@@ -813,13 +764,13 @@ module TensorStream
|
|
813
764
|
if element.is_a?(Tensor)
|
814
765
|
cl_object.buffer[index] = read_final_result(complete_eval(element, {}))
|
815
766
|
else
|
816
|
-
cl_object.buffer[index] = Tensor.cast_dtype(element, data_type)
|
767
|
+
cl_object.buffer[index] = ( data_type == :boolean ? ( element ? 1 : 0 ) : Tensor.cast_dtype(element, data_type))
|
817
768
|
end
|
818
769
|
end
|
819
770
|
elsif value.is_a?(NArray)
|
820
771
|
cl_object.buffer = value
|
821
772
|
else
|
822
|
-
cl_object.buffer[0] = Tensor.cast_dtype(value, data_type)
|
773
|
+
# Scalar case: a boolean is stored as 1/0, anything else is cast to data_type.
# The scalar here is `value`; `element` is not defined in this branch.
cl_object.buffer[0] = (data_type == :boolean ? (value ? 1 : 0) : Tensor.cast_dtype(value, data_type))
|
823
774
|
end
|
824
775
|
|
825
776
|
write_op = if cl_object.cl_buffer && !value.nil? && (!value.is_a?(Array) || !value.empty?)
|
@@ -840,14 +791,14 @@ module TensorStream
|
|
840
791
|
when :int16
|
841
792
|
NArray.sint(narray_size)
|
842
793
|
when :boolean
|
843
|
-
NArray.
|
794
|
+
NArray.sint(narray_size)
|
844
795
|
else
|
845
796
|
raise "unsupported type #{data_type}"
|
846
797
|
end
|
847
798
|
end
|
848
799
|
|
849
800
|
def _create_result_buffer(data_type, shape, name)
|
850
|
-
@context[:_cache][:_cl_buffers]["_result_#{name}_#{shape.join('_')}"] ||= begin
|
801
|
+
@context[:_cache][:_cl_buffers]["_result_#{name}_#{shape.join('_')}:#{object_id}"] ||= begin
|
851
802
|
size = shape.empty? ? 1 : shape.reduce(:*)
|
852
803
|
buffer = allocate_narray_for_type(data_type, size)
|
853
804
|
cl_buffer = _opencl_context.create_buffer(buffer.size * buffer.element_size)
|
@@ -1029,7 +980,6 @@ module TensorStream
|
|
1029
980
|
|
1030
981
|
def resolve_placeholder(placeholder, _execution_context = {})
|
1031
982
|
return nil if placeholder.nil?
|
1032
|
-
return placeholder if retain.include?(placeholder)
|
1033
983
|
|
1034
984
|
var = if placeholder.is_a?(Placeholder)
|
1035
985
|
@context[placeholder.name.to_sym].tap do |c|
|
@@ -1056,7 +1006,7 @@ module TensorStream
|
|
1056
1006
|
reduced_val = r[0]
|
1057
1007
|
if r.size > 1
|
1058
1008
|
reduced_val = f.call(r[0..val.size])
|
1059
|
-
elsif r.size
|
1009
|
+
elsif r.size.zero?
|
1060
1010
|
reduced_val = f.call(nil)
|
1061
1011
|
end
|
1062
1012
|
keep_dims ? [ reduced_val ] : reduced_val
|
@@ -1143,3 +1093,5 @@ module TensorStream
|
|
1143
1093
|
end
|
1144
1094
|
end
|
1145
1095
|
end
|
1096
|
+
|
1097
|
+
TensorStream::Evaluator.register_evaluator(TensorStream::Evaluator::OpenclEvaluator, 'opencl', 1)
|