tensor_stream 0.4.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/README.md +38 -17
- data/benchmark/benchmark.rb +16 -20
- data/lib/tensor_stream/control_flow.rb +3 -3
- data/lib/tensor_stream/debugging/debugging.rb +4 -4
- data/lib/tensor_stream/device.rb +5 -2
- data/lib/tensor_stream/evaluator/base_evaluator.rb +138 -0
- data/lib/tensor_stream/evaluator/buffer.rb +7 -2
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/_bool_operand.cl +3 -3
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/_operand.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/abs.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/add.cl +1 -1
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/argmax.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/argmin.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/cast.cl +0 -0
- data/lib/tensor_stream/evaluator/opencl/kernels/cond.cl.erb +6 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/cos.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/div.cl.erb +1 -1
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/exp.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/gemm.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/log.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/log1p.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/max.cl +3 -3
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/mul.cl +1 -1
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/negate.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/pow.cl +3 -3
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/reciprocal.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/round.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/sigmoid.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/sigmoid_grad.cl +3 -3
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/sign.cl +1 -1
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/sin.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/softmax.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/softmax_grad.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/sqrt.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/square.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/sub.cl +1 -1
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/tan.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/tanh.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/tanh_grad.cl +0 -0
- data/lib/tensor_stream/evaluator/{kernels → opencl/kernels}/where.cl +1 -1
- data/lib/tensor_stream/evaluator/{opencl_buffer.rb → opencl/opencl_buffer.rb} +1 -1
- data/lib/tensor_stream/evaluator/opencl/opencl_device.rb +5 -0
- data/lib/tensor_stream/evaluator/{opencl_evaluator.rb → opencl/opencl_evaluator.rb} +404 -452
- data/lib/tensor_stream/evaluator/{opencl_template_helper.rb → opencl/opencl_template_helper.rb} +6 -6
- data/lib/tensor_stream/evaluator/operation_helpers/array_ops_helper.rb +21 -21
- data/lib/tensor_stream/evaluator/ruby_evaluator.rb +492 -398
- data/lib/tensor_stream/graph.rb +21 -1
- data/lib/tensor_stream/graph_serializers/graphml.rb +59 -59
- data/lib/tensor_stream/graph_serializers/pbtext.rb +1 -1
- data/lib/tensor_stream/helpers/op_helper.rb +6 -2
- data/lib/tensor_stream/math_gradients.rb +7 -7
- data/lib/tensor_stream/operation.rb +100 -100
- data/lib/tensor_stream/session.rb +81 -8
- data/lib/tensor_stream/tensor.rb +7 -5
- data/lib/tensor_stream/utils.rb +32 -19
- data/lib/tensor_stream/version.rb +1 -1
- data/tensor_stream.gemspec +0 -1
- data/test_samples/raw_neural_net_sample.rb +7 -7
- metadata +41 -53
- data/lib/tensor_stream/evaluator/kernels/cond.cl.erb +0 -5
@@ -1,3 +1,3 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
2
|
% op = operator_to_c('add')
|
3
|
-
<%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'add', dtype:
|
3
|
+
<%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'add', dtype: "#{a}_#{b}", result_t: c_dtype %>
|
File without changes
|
File without changes
|
File without changes
|
@@ -0,0 +1,6 @@
|
|
1
|
+
% ["#{dtype}"].product(%w[less less_equal equal not_equal greater greater_equal logical_and]).each do |dtype, fname|
|
2
|
+
% a_dtype = dtype_to_c_type(a)
|
3
|
+
% b_dtype = dtype_to_c_type(b)
|
4
|
+
% op = operator_to_c(fname)
|
5
|
+
<%= render 'bool_operand.cl', a_dtype: a_dtype, b_dtype: b_dtype, op: op, fname: fname, dtype: "#{a}_#{b}", result_t: 'short' %>
|
6
|
+
% end
|
File without changes
|
@@ -1,3 +1,3 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
2
|
% op = operator_to_c('div')
|
3
|
-
<%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'div', dtype:
|
3
|
+
<%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'div', dtype: "#{a}_#{b}", result_t: c_dtype %>
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
@@ -1,6 +1,6 @@
|
|
1
1
|
// same dimension add floating point op
|
2
2
|
% c_dtype = dtype_to_c_type(dtype)
|
3
|
-
__kernel void max_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
3
|
+
__kernel void max_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
4
4
|
// Get the index of the current element to be processed
|
5
5
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
6
6
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
@@ -9,7 +9,7 @@
|
|
9
9
|
}
|
10
10
|
|
11
11
|
// 1D + Scalar floating point add op
|
12
|
-
__kernel void max_c_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
12
|
+
__kernel void max_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
13
13
|
// Get the index of the current element to be processed
|
14
14
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
15
15
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
@@ -22,7 +22,7 @@
|
|
22
22
|
}
|
23
23
|
|
24
24
|
// 1D + Scalar floating point add op broadcast
|
25
|
-
__kernel void max_b_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
25
|
+
__kernel void max_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
26
26
|
// Get the index of the current element to be processed
|
27
27
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
28
28
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
@@ -1,3 +1,3 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
2
|
% op = operator_to_c('mul')
|
3
|
-
<%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'mul', dtype:
|
3
|
+
<%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'mul', dtype: "#{a}_#{b}", result_t: c_dtype %>
|
File without changes
|
@@ -1,6 +1,6 @@
|
|
1
1
|
// same dimension add floating point op
|
2
2
|
% c_dtype = dtype_to_c_type(dtype)
|
3
|
-
__kernel void pow_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
3
|
+
__kernel void pow_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
4
4
|
// Get the index of the current element to be processed
|
5
5
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
6
6
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
@@ -9,7 +9,7 @@
|
|
9
9
|
}
|
10
10
|
|
11
11
|
// 1D + Scalar floating point add op
|
12
|
-
__kernel void pow_c_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
12
|
+
__kernel void pow_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
13
13
|
// Get the index of the current element to be processed
|
14
14
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
15
15
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
@@ -22,7 +22,7 @@
|
|
22
22
|
}
|
23
23
|
|
24
24
|
// 1D + Scalar floating point add op broadcast
|
25
|
-
__kernel void pow_b_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
25
|
+
__kernel void pow_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
26
26
|
// Get the index of the current element to be processed
|
27
27
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
28
28
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
File without changes
|
File without changes
|
File without changes
|
@@ -9,7 +9,7 @@ float sigmoid_grad(<%= c_dtype %> x, <%= c_dtype %> g) {
|
|
9
9
|
}
|
10
10
|
|
11
11
|
// same dimension add floating point op
|
12
|
-
__kernel void sigmoid_grad_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
12
|
+
__kernel void sigmoid_grad_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
13
13
|
// Get the index of the current element to be processed
|
14
14
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
15
15
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
@@ -18,7 +18,7 @@ float sigmoid_grad(<%= c_dtype %> x, <%= c_dtype %> g) {
|
|
18
18
|
}
|
19
19
|
|
20
20
|
// 1D + Scalar floating point add op
|
21
|
-
__kernel void sigmoid_grad_c_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
21
|
+
__kernel void sigmoid_grad_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
22
22
|
// Get the index of the current element to be processed
|
23
23
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
24
24
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
@@ -31,7 +31,7 @@ float sigmoid_grad(<%= c_dtype %> x, <%= c_dtype %> g) {
|
|
31
31
|
}
|
32
32
|
|
33
33
|
// 1D + Scalar floating point add op broadcast
|
34
|
-
__kernel void sigmoid_grad_b_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
34
|
+
__kernel void sigmoid_grad_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
35
35
|
// Get the index of the current element to be processed
|
36
36
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
37
37
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
@@ -5,7 +5,7 @@ __kernel void sign_<%= dtype %>(const int M, const int N, __global const <%= c_d
|
|
5
5
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
6
6
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
7
7
|
<%= c_dtype %> value = A[globalRow * N + globalCol];
|
8
|
-
% if
|
8
|
+
% if floating_point?(dtype)
|
9
9
|
if (isnan(value) || value == 0.0f) {
|
10
10
|
C[globalRow * N + globalCol] = 0.0;
|
11
11
|
} else {
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
@@ -1,3 +1,3 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
2
|
% op = operator_to_c('sub')
|
3
|
-
<%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'sub', dtype:
|
3
|
+
<%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'sub', dtype: "#{a}_#{b}", result_t: c_dtype %>
|
File without changes
|
File without changes
|
File without changes
|
@@ -1,5 +1,5 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
|
-
__kernel void where_<%= dtype %>(const int M, const int N, __global const
|
2
|
+
__kernel void where_<%= dtype %>(const int M, const int N, __global const short *PRED, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
3
3
|
// Get the index of the current element to be processed
|
4
4
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
5
5
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
@@ -2,7 +2,7 @@ module TensorStream
|
|
2
2
|
class OpenCLBuffer < Buffer
|
3
3
|
include ArrayOpsHelper
|
4
4
|
|
5
|
-
attr_accessor :
|
5
|
+
attr_accessor :shape, :buffer, :cl_buffer, :op
|
6
6
|
|
7
7
|
def initialize(data_type: , shape:, buffer:, cl_buffer:, op: nil, name: nil)
|
8
8
|
@data_type = data_type
|
@@ -1,11 +1,12 @@
|
|
1
1
|
require 'tensor_stream/evaluator/operation_helpers/random_gaussian'
|
2
2
|
require 'tensor_stream/evaluator/operation_helpers/array_ops_helper'
|
3
3
|
require 'tensor_stream/evaluator/operation_helpers/math_helper'
|
4
|
-
require 'tensor_stream/evaluator/opencl_buffer'
|
5
|
-
require 'tensor_stream/evaluator/opencl_template_helper'
|
6
|
-
require '
|
4
|
+
require 'tensor_stream/evaluator/opencl/opencl_buffer'
|
5
|
+
require 'tensor_stream/evaluator/opencl/opencl_template_helper'
|
6
|
+
require 'tensor_stream/evaluator/opencl/opencl_device'
|
7
7
|
require 'opencl_ruby_ffi'
|
8
8
|
require 'narray_ffi'
|
9
|
+
require 'tensor_stream/evaluator/base_evaluator'
|
9
10
|
|
10
11
|
module TensorStream
|
11
12
|
module Evaluator
|
@@ -27,31 +28,78 @@ module TensorStream
|
|
27
28
|
end
|
28
29
|
|
29
30
|
## PURE ruby evaluator used for testing and development
|
30
|
-
class OpenclEvaluator
|
31
|
+
class OpenclEvaluator < BaseEvaluator
|
31
32
|
attr_accessor :retain
|
32
33
|
|
33
34
|
include TensorStream::OpHelper
|
34
35
|
include TensorStream::ArrayOpsHelper
|
35
36
|
include TensorStream::MathHelper
|
36
37
|
|
37
|
-
def initialize(session,
|
38
|
-
|
39
|
-
|
40
|
-
@
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
38
|
+
def initialize(session, device, thread_pool: nil, log_intermediates: false)
|
39
|
+
super
|
40
|
+
_create_opencl_context(device.native_device)
|
41
|
+
@opencl_device = device.native_device
|
42
|
+
create_command_queue
|
43
|
+
end
|
44
|
+
|
45
|
+
def self.query_supported_devices
|
46
|
+
devices = query_devices_with_score
|
47
|
+
devices.sort { |a| a[1] }.reverse.map do |d|
|
48
|
+
opencl_to_device(d)
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
def self.fetch_device(query = [])
|
53
|
+
devices = query_devices_with_score
|
54
|
+
platform_devices = devices.select { |d| d[0].platform.to_s.downcase =~ /#{query[0].downcase}/ }
|
55
|
+
opencl_to_device(platform_devices[[query[1].to_i, platform_devices.size - 1].min])
|
56
|
+
end
|
57
|
+
|
58
|
+
def self.opencl_to_device(d)
|
59
|
+
device = d[0]
|
60
|
+
index = d[3]
|
61
|
+
platform_name = device.platform.name.gsub(' ', '_').downcase
|
62
|
+
uri = [platform_name, index].join(':')
|
63
|
+
|
64
|
+
device_type = device.type.to_s == 'GPU' ? :gpu : :cpu
|
65
|
+
|
66
|
+
OpenclDevice.new(uri, device_type, self).tap do |d|
|
67
|
+
d.native_device = device
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
##
|
72
|
+
# Select the best device available in the system for this evaluator
|
73
|
+
def self.default_device
|
74
|
+
devices = OpenclEvaluator.query_devices_with_score
|
75
|
+
device = devices.sort { |a| a[1] }.reverse.first
|
76
|
+
opencl_to_device(device)
|
46
77
|
end
|
47
78
|
|
48
79
|
# opencl evaluator main entrypoint
|
49
80
|
def run(tensor, execution_context)
|
50
|
-
_create_opencl_context
|
51
|
-
create_command_queue
|
52
81
|
read_final_result(complete_eval(tensor, execution_context))
|
53
82
|
end
|
54
83
|
|
84
|
+
def run_with_buffer(tensor, context, execution_context)
|
85
|
+
@context = context
|
86
|
+
@context[:_cache][:_cl_buffers] ||= {} if context[:_cache]
|
87
|
+
|
88
|
+
if tensor.is_a?(Array)
|
89
|
+
tensor.collect do |t|
|
90
|
+
value = run(t, execution_context)
|
91
|
+
Buffer.new(data_type: t.data_type, buffer: value)
|
92
|
+
end
|
93
|
+
else
|
94
|
+
value = run(tensor, execution_context)
|
95
|
+
Buffer.new(data_type: tensor.data_type, buffer: value)
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
def convert_from_buffer(tensor, result)
|
100
|
+
convert_to_opencl([result.buffer].flatten, shape_eval(result.buffer), data_type: result.data_type, name: tensor.name)
|
101
|
+
end
|
102
|
+
|
55
103
|
def complete_eval(tensor, context)
|
56
104
|
buffer = _run(tensor, context)
|
57
105
|
if buffer.is_a?(Array)
|
@@ -69,11 +117,25 @@ module TensorStream
|
|
69
117
|
end
|
70
118
|
|
71
119
|
def opencl_device
|
72
|
-
@
|
120
|
+
@opencl_device
|
73
121
|
end
|
74
122
|
|
75
123
|
protected
|
76
124
|
|
125
|
+
def prepare_input(tensor, context, options = {})
|
126
|
+
return nil unless tensor
|
127
|
+
tensor = resolve_placeholder(tensor)
|
128
|
+
if options[:noop]
|
129
|
+
tensor
|
130
|
+
elsif options[:buffer]
|
131
|
+
complete_eval(tensor, context)
|
132
|
+
elsif options[:complete]
|
133
|
+
read_final_result(complete_eval(tensor, context))
|
134
|
+
else
|
135
|
+
_run(tensor, context)
|
136
|
+
end
|
137
|
+
end
|
138
|
+
|
77
139
|
# read result from opencl and convert to ruby
|
78
140
|
def read_final_result(buffer)
|
79
141
|
return buffer.map { |b| read_final_result(b) } if buffer.is_a?(Array)
|
@@ -82,43 +144,37 @@ module TensorStream
|
|
82
144
|
buffer.to_ruby
|
83
145
|
end
|
84
146
|
|
85
|
-
def _create_opencl_context
|
86
|
-
@
|
87
|
-
if @preferred_device
|
88
|
-
@preferred_device
|
89
|
-
else
|
90
|
-
device, _score, _platform, _index = choose_best_device
|
91
|
-
# puts "using #{device.name}"
|
92
|
-
device
|
93
|
-
end
|
94
|
-
end
|
95
|
-
@context[:cl_device] = opencl_device
|
96
|
-
@context[:_cache][:_opencl_context] ||= OpenCL.create_context(opencl_device)
|
147
|
+
def _create_opencl_context(opencl_device)
|
148
|
+
@opencl_context = OpenCL.create_context(opencl_device)
|
97
149
|
end
|
98
150
|
|
99
151
|
def choose_best_device
|
100
152
|
@best_device ||= begin
|
101
|
-
devices =
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
if d.type.to_s == 'CPU'
|
106
|
-
score += 1
|
107
|
-
elsif d.type.to_s == 'GPU'
|
108
|
-
score += 4
|
109
|
-
end
|
153
|
+
devices = OpenclEvaluator.query_devices_with_score
|
154
|
+
devices.sort { |a| a[1] }.reverse.first
|
155
|
+
end
|
156
|
+
end
|
110
157
|
|
111
|
-
|
112
|
-
|
113
|
-
end
|
158
|
+
def self.query_devices_with_score
|
159
|
+
OpenCL.platforms.flat_map do |p|
|
114
160
|
|
115
|
-
|
116
|
-
|
161
|
+
p.devices.select { |d| d.available > 0 }.each_with_index.collect do |d, index|
|
162
|
+
score = 0
|
163
|
+
if d.type.to_s == 'CPU'
|
164
|
+
score += 1
|
165
|
+
elsif d.type.to_s == 'GPU'
|
166
|
+
score += 4
|
167
|
+
end
|
117
168
|
|
118
|
-
|
169
|
+
if d.platform.name == 'NVIDIA CUDA'
|
170
|
+
score += 1000
|
119
171
|
end
|
172
|
+
|
173
|
+
score += d.max_compute_units
|
174
|
+
score += d.max_clock_frequency
|
175
|
+
|
176
|
+
[d, score, p.name, index]
|
120
177
|
end
|
121
|
-
devices.sort { |a| a[1] }.reverse.first
|
122
178
|
end
|
123
179
|
end
|
124
180
|
|
@@ -127,15 +183,15 @@ module TensorStream
|
|
127
183
|
properties = []
|
128
184
|
properties << OpenCL::CommandQueue::PROFILING_ENABLE if supported_proprties.include?('PROFILING_ENABLE')
|
129
185
|
properties << OpenCL::CommandQueue::OUT_OF_ORDER_EXEC_MODE_ENABLE if supported_proprties.include?('OUT_OF_ORDER_EXEC_MODE_ENABLE')
|
130
|
-
@
|
186
|
+
@command_queue = _opencl_context.create_command_queue(opencl_device, properties: properties)
|
131
187
|
end
|
132
188
|
|
133
189
|
def _opencl_context
|
134
|
-
@
|
190
|
+
@opencl_context
|
135
191
|
end
|
136
192
|
|
137
193
|
def _opencl_queue
|
138
|
-
@
|
194
|
+
@command_queue
|
139
195
|
end
|
140
196
|
|
141
197
|
def cl_template_path(kernel, extension)
|
@@ -144,7 +200,7 @@ module TensorStream
|
|
144
200
|
|
145
201
|
def _cl_program(kernel, args = {})
|
146
202
|
suffix = args.collect { |k,v| "#{k}.#{v}"}.join('.')
|
147
|
-
@context[:_cache]["_opencl_kernel_#{kernel}.#{suffix}"] ||= begin
|
203
|
+
@context[:_cache]["_opencl_kernel_#{kernel}.#{suffix}:#{object_id}"] ||= begin
|
148
204
|
filename = %w[cl.erb cl].map { |ext| cl_template_path(kernel, ext) }.find { |n| File.exist?(n) }
|
149
205
|
source = File.read(filename)
|
150
206
|
source = OpenclTemplateHelper.new(source).generate(args)
|
@@ -163,13 +219,16 @@ module TensorStream
|
|
163
219
|
return tensor.map { |t| _run(t, execution_context) }
|
164
220
|
end
|
165
221
|
|
166
|
-
return tensor if retain.include?(tensor) # if var is in retain don't eval to value
|
167
|
-
|
168
222
|
tensor = tensor.call if tensor.is_a?(Proc)
|
169
223
|
|
170
224
|
child_context = execution_context.dup
|
171
225
|
res = if tensor.is_a?(Operation)
|
172
|
-
|
226
|
+
if !self.class.ops.include?(tensor.operation.to_sym)
|
227
|
+
result = @session.delegate_to_evaluator(tensor, @context, execution_context)
|
228
|
+
convert_from_buffer(tensor, result)
|
229
|
+
else
|
230
|
+
eval_operation(tensor, child_context)
|
231
|
+
end
|
173
232
|
elsif tensor.is_a?(Variable)
|
174
233
|
eval_variable(tensor, child_context)
|
175
234
|
elsif tensor.is_a?(Placeholder)
|
@@ -187,415 +246,306 @@ module TensorStream
|
|
187
246
|
tensor.buffer
|
188
247
|
end
|
189
248
|
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
return @context[cache_key] if @context.key?(cache_key)
|
194
|
-
a = resolve_placeholder(tensor.items[0], child_context) if tensor.items && tensor.items[0]
|
195
|
-
b = resolve_placeholder(tensor.items[1], child_context) if tensor.items && tensor.items[1]
|
196
|
-
# puts tensor.name
|
197
|
-
case tensor.operation
|
198
|
-
when :concat
|
199
|
-
input_a = read_final_result(complete_eval(a, child_context))
|
200
|
-
arr = concat_array(input_a, tensor.options[:axis])
|
201
|
-
convert_to_opencl(arr.flatten, shape_eval(arr), data_type: tensor.data_type, name: tensor.name)
|
202
|
-
when :cond
|
203
|
-
pred = complete_eval(tensor.options[:pred], child_context)
|
204
|
-
a = _run(a, child_context)
|
205
|
-
b = _run(b, child_context)
|
206
|
-
|
207
|
-
if all_true?(pred.buffer)
|
208
|
-
a
|
209
|
-
else
|
210
|
-
b
|
211
|
-
end
|
212
|
-
when :identity
|
213
|
-
_run(a, child_context)
|
214
|
-
when :eye
|
215
|
-
rows = complete_eval(a, child_context)
|
216
|
-
columns = complete_eval(b, child_context)
|
217
|
-
shape = [rows.buffer[0], columns.buffer[0]]
|
218
|
-
eye_arr = Array.new(rows.buffer[0]) do |i|
|
219
|
-
Array.new(columns.buffer[0]) do |col|
|
220
|
-
if fp_type?(tensor.data_type)
|
221
|
-
i == col ? 1.0 : 0.0
|
222
|
-
else
|
223
|
-
i == col ? 1 : 0
|
224
|
-
end
|
225
|
-
end
|
226
|
-
end
|
249
|
+
register_op :log do |context, tensor, inputs|
|
250
|
+
execute_func('log', tensor, inputs[0], context)
|
251
|
+
end
|
227
252
|
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
p = read_final_result(complete_eval(tensor.options[:paddings], child_context))
|
232
|
-
|
233
|
-
padding = arr_pad(a, p, tensor.data_type)
|
234
|
-
convert_to_opencl(padding.flatten, shape_eval(padding), data_type: tensor.data_type, name: tensor.name)
|
235
|
-
when :tile
|
236
|
-
input = read_final_result(complete_eval(a, child_context))
|
237
|
-
multiples = read_final_result(complete_eval(b, child_context))
|
238
|
-
|
239
|
-
rank = get_rank(input)
|
240
|
-
raise '1D or higher tensor required' if rank.zero?
|
241
|
-
raise "invalid multiple size passed #{rank} != #{multiples.size}" if rank != multiples.size
|
242
|
-
|
243
|
-
tile = tile_arr(input, 0, multiples)
|
244
|
-
arr = tile.nil? ? [] : tile
|
245
|
-
convert_to_opencl(arr.flatten, shape_eval(arr), data_type: tensor.data_type, name: tensor.name)
|
246
|
-
when :assign
|
247
|
-
assign_var(tensor, b, child_context)
|
248
|
-
when :assign_add
|
249
|
-
a = _run(a, child_context)
|
250
|
-
b = _run(b, child_context)
|
251
|
-
value = execute_2_operand_func('add', tensor, a, b, child_context)
|
252
|
-
assign_var(tensor, value, child_context)
|
253
|
-
when :assign_sub
|
254
|
-
a = _run(a, child_context)
|
255
|
-
b = _run(b, child_context)
|
256
|
-
|
257
|
-
value = execute_2_operand_func('sub', tensor, a, b, child_context)
|
258
|
-
assign_var(tensor, value, child_context)
|
259
|
-
when :less
|
260
|
-
execute_2_operand_func('less', tensor, a, b, child_context, 'cond')
|
261
|
-
when :less_equal
|
262
|
-
execute_2_operand_func('less_equal', tensor, a, b, child_context, 'cond')
|
263
|
-
when :greater
|
264
|
-
execute_2_operand_func('greater', tensor, a, b, child_context, 'cond')
|
265
|
-
when :greater_equal
|
266
|
-
execute_2_operand_func('greater_equal', tensor, a, b, child_context, 'cond')
|
267
|
-
when :equal
|
268
|
-
execute_2_operand_func('equal', tensor, a, b, child_context, 'cond')
|
269
|
-
when :not_equal
|
270
|
-
execute_2_operand_func('not_equal', tensor, a, b, child_context, 'cond')
|
271
|
-
when :logical_and
|
272
|
-
execute_2_operand_func('logical_and', tensor, a, b, child_context, 'cond')
|
273
|
-
when :where
|
274
|
-
pred = tensor.options[:pred]
|
275
|
-
execute_cond_func('where', tensor, pred, a, b, child_context)
|
276
|
-
when :max
|
277
|
-
execute_2_operand_func('max', tensor, a, b, child_context)
|
278
|
-
when :add
|
279
|
-
execute_2_operand_func('add', tensor, a, b, child_context)
|
280
|
-
when :div
|
281
|
-
execute_2_operand_func('div', tensor, a, b, child_context)
|
282
|
-
when :sub
|
283
|
-
execute_2_operand_func('sub', tensor, a, b, child_context)
|
284
|
-
when :matmul
|
285
|
-
a = _run(a, child_context)
|
286
|
-
b = _run(b, child_context)
|
287
|
-
|
288
|
-
m = a.shape[0]
|
289
|
-
n = b.shape[1]
|
290
|
-
v = b.shape[0]
|
291
|
-
k = a.shape[1]
|
292
|
-
|
293
|
-
m, k = [a.shape[1], a.shape[0]] if tensor.options[:transpose_a]
|
294
|
-
n, v = [b.shape[0], b.shape[1]] if tensor.options[:transpose_b]
|
295
|
-
|
296
|
-
result_shape = [m, n]
|
297
|
-
|
298
|
-
raise "#{tensor.items[0].name} rank must be greater than 1" if a.shape.size < 2
|
299
|
-
raise "#{tensor.items[1].name} rank must be greater than 1" if b.shape.size < 2
|
300
|
-
raise "incompatible shape sizes for matrix multiplication (#{a.shape[1]} != #{b.shape[0]}) #{a.shape} vs #{b.shape}" if k != v
|
301
|
-
|
302
|
-
dtype = tensor.data_type
|
303
|
-
a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
|
304
|
-
output_buffer = _create_result_buffer(a.data_type, result_shape, tensor.name)
|
305
|
-
|
306
|
-
cl_m = OpenCL::Int1.new(m)
|
307
|
-
cl_n = OpenCL::Int1.new(n)
|
308
|
-
cl_k = OpenCL::Int1.new(k)
|
309
|
-
|
310
|
-
transpose_a = OpenCL::Int1.new(tensor.options[:transpose_a] ? 1 : 0)
|
311
|
-
transpose_b = OpenCL::Int1.new(tensor.options[:transpose_b] ? 1 : 0)
|
312
|
-
|
313
|
-
output_buffer.op = _cl_program('gemm', dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer)
|
314
|
-
output_buffer
|
315
|
-
when :mul
|
316
|
-
execute_2_operand_func('mul', tensor, a, b, child_context)
|
317
|
-
when :pow
|
318
|
-
execute_2_operand_func('pow', tensor, a, b, child_context)
|
319
|
-
when :cast
|
320
|
-
a = _run(a, child_context)
|
321
|
-
if a.data_type != tensor.data_type
|
322
|
-
buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
|
323
|
-
m, n = a.shape
|
324
|
-
cl_m = OpenCL::Int1.new(m || 1)
|
325
|
-
cl_n = OpenCL::Int1.new(n || 1)
|
326
|
-
work_group = [m || 1, n || 1]
|
327
|
-
|
328
|
-
buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: tensor.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer)
|
329
|
-
buffer
|
330
|
-
else
|
331
|
-
a
|
332
|
-
end
|
333
|
-
when :sign
|
334
|
-
execute_func('sign', tensor, a, child_context)
|
335
|
-
when :exp
|
336
|
-
execute_func('exp', tensor, a, child_context)
|
337
|
-
when :log
|
338
|
-
execute_func('log', tensor, a, child_context)
|
339
|
-
when :sin
|
340
|
-
execute_func('sin', tensor, a, child_context)
|
341
|
-
when :tan
|
342
|
-
execute_func('tan', tensor, a, child_context)
|
343
|
-
when :cos
|
344
|
-
execute_func('cos', tensor, a, child_context)
|
345
|
-
when :abs
|
346
|
-
execute_func('abs', tensor, a, child_context)
|
347
|
-
when :sqrt
|
348
|
-
execute_func('sqrt', tensor, a, child_context)
|
349
|
-
when :negate
|
350
|
-
execute_func('negate', tensor, a, child_context)
|
351
|
-
when :square
|
352
|
-
execute_func('square', tensor, a, child_context)
|
353
|
-
when :reciprocal
|
354
|
-
execute_func('reciprocal', tensor, a, child_context)
|
355
|
-
when :tanh
|
356
|
-
execute_func('tanh', tensor, a, child_context)
|
357
|
-
when :tanh_grad
|
358
|
-
execute_func('tanh_grad', tensor, a, child_context)
|
359
|
-
when :sigmoid
|
360
|
-
execute_func('sigmoid', tensor, a, child_context)
|
361
|
-
when :log1p
|
362
|
-
execute_func('log1p', tensor, a, child_context)
|
363
|
-
when :round
|
364
|
-
execute_func('round', tensor, a, child_context)
|
365
|
-
when :softmax
|
366
|
-
a = _run(a, child_context)
|
367
|
-
event_wait_list = [a.op].compact
|
368
|
-
dtype = tensor.data_type
|
369
|
-
output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
|
253
|
+
register_op :sin do |context, tensor, inputs|
|
254
|
+
execute_func('sin', tensor, inputs[0], context)
|
255
|
+
end
|
370
256
|
|
371
|
-
|
372
|
-
|
373
|
-
n = m if n.nil?
|
374
|
-
cl_n = OpenCL::Int1.new(n || 1)
|
257
|
+
register_op :cond do |context, tensor, inputs|
|
258
|
+
pred = complete_eval(tensor.options[:pred], context)
|
375
259
|
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
260
|
+
if all_true?(pred.buffer)
|
261
|
+
inputs[0]
|
262
|
+
else
|
263
|
+
inputs[1]
|
264
|
+
end
|
265
|
+
end
|
266
|
+
|
267
|
+
register_op :identity do |_context, _tensor, inputs|
|
268
|
+
inputs[0]
|
269
|
+
end
|
270
|
+
|
271
|
+
register_op :assign, noop: true do |context, tensor, inputs|
|
272
|
+
assign_var(tensor, inputs[1], context)
|
273
|
+
end
|
274
|
+
|
275
|
+
register_op :assign_add do |context, tensor, inputs|
|
276
|
+
value = execute_2_operand_func('add', tensor, inputs[0], inputs[1], context)
|
277
|
+
assign_var(tensor, value, context)
|
278
|
+
end
|
279
|
+
|
280
|
+
# Handler for the :assign_sub operation: runs the two-operand 'sub' function
# on both inputs, then stores the result through assign_var.
register_op :assign_sub do |context, tensor, inputs|
  current, delta = inputs[0], inputs[1]
  difference = execute_2_operand_func('sub', tensor, current, delta, context)
  assign_var(tensor, difference, context)
end
|
284
|
+
|
285
|
+
# Registers one handler per comparison/logical operation. Each handler calls
# execute_2_operand_func with the operation's own name as the function name
# and 'cond' as the trailing program-name argument.
%i[less less_equal greater greater_equal equal not_equal logical_and].each do |comparison|
  register_op(comparison, noop: true) do |context, tensor, inputs|
    lhs = inputs[0]
    rhs = inputs[1]
    execute_2_operand_func(comparison.to_s, tensor, lhs, rhs, context, 'cond')
  end
end
|
385
290
|
|
291
|
+
# Registers one handler per two-operand arithmetic operation. Each handler
# calls execute_2_operand_func with the operation's own name.
%i[max add div sub mul pow sigmoid_grad].each do |binary_op|
  register_op(binary_op, noop: true) do |context, tensor, inputs|
    lhs = inputs[0]
    rhs = inputs[1]
    execute_2_operand_func(binary_op.to_s, tensor, lhs, rhs, context)
  end
end
|
296
|
+
|
297
|
+
# Handler for the :where operation: passes the predicate from
# tensor.options[:pred] and both inputs to execute_cond_func under the
# 'where' function name.
register_op :where, noop: true do |context, tensor, inputs|
  execute_cond_func('where', tensor, tensor.options[:pred], inputs[0], inputs[1], context)
end
|
301
|
+
|
302
|
+
# Handler for the :matmul operation: multiplies two buffers by launching the
# "gemm_<dtype>" kernel of the 'gemm' OpenCL program.
#
# inputs - [a, b]. When tensor.options[:transpose_a] / [:transpose_b] is set,
#          the corresponding dimensions are swapped here and a 0/1 flag is
#          handed to the kernel.
#
# Returns the result buffer (shape [m, n]) with the kernel event stored in
# its `op`.
# Raises RuntimeError when an operand has rank < 2 or when the effective
# inner dimensions differ.
register_op :matmul do |_context, tensor, inputs|
  a, b = inputs

  raise "#{tensor.inputs[0].name} rank must be greater than 1" if a.shape.size < 2
  raise "#{tensor.inputs[1].name} rank must be greater than 1" if b.shape.size < 2

  # m x k is the effective shape of a, v x n the effective shape of b.
  m, k = tensor.options[:transpose_a] ? [a.shape[1], a.shape[0]] : [a.shape[0], a.shape[1]]
  n, v = tensor.options[:transpose_b] ? [b.shape[0], b.shape[1]] : [b.shape[1], b.shape[0]]

  # Report the dimensions that were actually compared; with a transpose flag
  # set, a.shape[1] / b.shape[0] are not the inner dimensions.
  raise "incompatible shape sizes for matrix multiplication (#{k} != #{v}) #{a.shape} vs #{b.shape}" if k != v

  result_shape = [m, n]
  dtype = tensor.data_type
  a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
  output_buffer = _create_result_buffer(a.data_type, result_shape, tensor.name)

  cl_m = OpenCL::Int1.new(m)
  cl_n = OpenCL::Int1.new(n)
  cl_k = OpenCL::Int1.new(k)

  transpose_a = OpenCL::Int1.new(tensor.options[:transpose_a] ? 1 : 0)
  transpose_b = OpenCL::Int1.new(tensor.options[:transpose_b] ? 1 : 0)

  # Wait on the events that produced both operands, mirroring what the
  # softmax handlers in this file pass to their kernels.
  event_wait_list = [a.op, b.op].compact

  output_buffer.op = _cl_program('gemm', dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
  output_buffer
end
|
333
|
+
|
334
|
+
# Handler for the :cast operation: converts the first input to
# tensor.data_type by launching the `cast` kernel of the "cast" OpenCL
# program (built for the source/target type pair).
#
# Returns the input itself when it already has the target type; otherwise a
# new result buffer with the kernel event stored in its `op`.
register_op :cast do |_context, tensor, inputs|
  a = inputs[0]
  if a.data_type != tensor.data_type
    buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
    # Missing dimensions (rank 0 or 1 input) are treated as size 1.
    m, n = a.shape
    cl_m = OpenCL::Int1.new(m || 1)
    cl_n = OpenCL::Int1.new(n || 1)
    work_group = [m || 1, n || 1]

    # Wait on the event that produced the input, as the softmax handlers in
    # this file do, so the kernel does not depend on queue ordering alone.
    event_wait_list = [a.op].compact

    buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: tensor.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer, event_wait_list: event_wait_list)
    buffer
  else
    a
  end
end
|
349
|
+
|
350
|
+
# Registers one handler per single-operand operation. Each handler calls
# execute_func with the operation's own name and the first input.
%i[sign exp tan cos abs sqrt negate square reciprocal tanh tanh_grad sigmoid log1p round].each do |unary_op|
  register_op(unary_op, noop: true) do |context, tensor, inputs|
    operand = inputs[0]
    execute_func(unary_op.to_s, tensor, operand, context)
  end
end
|
355
|
+
|
356
|
+
# Handler for the :softmax operation: launches the "softmax_<dtype>" kernel
# over a one-dimensional work group sized by the input's first dimension and
# returns the result buffer with the kernel event stored in its `op`.
register_op :softmax do |_context, tensor, inputs|
  source = inputs[0]
  pending = [source.op].compact
  dtype = tensor.data_type
  result = _create_result_buffer(tensor.data_type, source.shape, tensor.name)

  rows, cols = source.shape
  global_size = [rows]
  # A rank-1 input has no second dimension; reuse the first one.
  cols = rows if cols.nil?
  cl_n = OpenCL::Int1.new(cols || 1)

  result.op = _cl_program("softmax", dtype: dtype).send(:"softmax_#{dtype}", _opencl_queue, global_size, cl_n, source.cl_buffer, result.cl_buffer, event_wait_list: pending)
  result
end
|
371
|
+
|
372
|
+
# Handler for the :softmax_grad operation: launches the
# "softmax_grad_<dtype>" kernel with the forward input `a` and the incoming
# gradient `grad`, and returns the result buffer with the kernel event
# stored in its `op`.
register_op :softmax_grad do |_context, tensor, inputs|
  a, grad = inputs

  # The kernel reads both buffers, so wait on the events of both producers
  # (previously only a.op was waited on).
  event_wait_list = [a.op, grad.op].compact
  dtype = tensor.data_type
  output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)

  m, n = a.shape
  work_group = [m]
  # A rank-1 input has no second dimension; reuse the first one.
  n = m if n.nil?
  cl_n = OpenCL::Int1.new(n || 1)
  event = _cl_program('softmax_grad', dtype: dtype, size: n).send(:"softmax_grad_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
  output_buffer.op = event
  output_buffer
end
|
387
|
+
|
388
|
+
# Handler for the :truncate operation. `a` is the data buffer and `b` holds
# the requested shape.
#
# * rank-0 `a`, or `a` already of the requested shape: `a` is returned as is.
# * requested shape [] and `a` holds a single element: `a` is re-labelled
#   with the empty shape and returned.
# * requested shape [] otherwise: the first buffer element is wrapped.
# * any other shape: the values are cut down with `truncate` and wrapped.
register_op :truncate do |context, tensor, inputs|
  a, b = inputs
  next a if a.shape.size.zero?

  target_shape = read_final_result(b)
  next a if a.shape == target_shape

  # Read before branching, exactly as the nested form did.
  values = read_final_result(a)
  unless target_shape == []
    next wrap_opencl(truncate(values, target_shape), data_type: a.data_type, name: tensor.name)
  end

  if a.buffer.size == 1
    a.shape = target_shape
    a
  else
    wrap_opencl(a.buffer[0], data_type: a.data_type, name: tensor.name)
  end
end
|
422
411
|
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
a
|
427
|
-
when :zeros, :ones, :zeros_like, :ones_like
|
428
|
-
shape = if %i[zeros_like ones_like].include?(tensor.operation)
|
429
|
-
_run(a, child_context).shape
|
430
|
-
else
|
431
|
-
read_final_result(complete_eval(a, child_context)) || tensor.shape.shape
|
432
|
-
end
|
412
|
+
# Handler for the :check_numerics operation: fully evaluates the first input
# and raises "<name> Invalid Argument" on the first buffer element that is
# NaN or infinite. Returns the evaluated input when every element passes.
register_op :check_numerics, noop: true do |context, tensor, inputs|
  evaluated = complete_eval(inputs[0], context)
  label = tensor.options[:name]

  evaluated.buffer.each do |element|
    next unless element.nan? || element.infinite?

    raise "#{label} Invalid Argument"
  end
  evaluated
end
|
439
421
|
|
440
|
-
|
422
|
+
# Handler for the :broadcast_transform operation: returns a pair of buffers
# with matching shapes.
#
# When both inputs already share a shape they are returned untouched.
# Otherwise both are evaluated, read back, expanded with `broadcast`, and
# re-wrapped under the names "<tensor.name>_a" and "<tensor.name>_b".
register_op :broadcast_transform do |context, tensor, inputs|
  a, b = inputs

  if a.shape == b.shape
    [a, b]
  else
    input_a = read_final_result(complete_eval(a, context))
    input_b = read_final_result(complete_eval(b, context))
    b_a, b_b = broadcast(input_a, input_b)
    # Each operand keeps its own data type, matching the equal-shape branch
    # above, which returns b without touching its type.
    [wrap_opencl(b_a, data_type: a.data_type, name: "#{tensor.name}_a"),
     wrap_opencl(b_b, data_type: b.data_type, name: "#{tensor.name}_b")]
  end
end
|
457
435
|
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
|
487
|
-
|
488
|
-
|
489
|
-
|
490
|
-
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
when :index
|
500
|
-
a = complete_eval(a, child_context)
|
501
|
-
input_a = read_final_result(a)
|
502
|
-
index = read_final_result(complete_eval(b, child_context))
|
503
|
-
|
504
|
-
if a.is_a?(Array)
|
505
|
-
a[index]
|
506
|
-
else
|
507
|
-
new_shape = a.shape.dup
|
508
|
-
new_shape.shift
|
509
|
-
convert_to_opencl(input_a[index], new_shape, data_type: a.data_type, name: tensor.name)
|
510
|
-
end
|
511
|
-
when :broadcast_gradient_args
|
512
|
-
a = complete_eval(a, child_context)
|
513
|
-
b = complete_eval(b, child_context)
|
514
|
-
|
515
|
-
wrap_opencl(get_broadcast_gradient_args(a.buffer.to_a, b.buffer.to_a), data_type: a.data_type, name: tensor.name)
|
516
|
-
when :shape
|
517
|
-
a = _run(a, child_context)
|
518
|
-
|
519
|
-
wrap_opencl(a.shape, name: tensor.name, data_type: tensor.options[:out_type] || :float32)
|
520
|
-
when :reshape
|
521
|
-
arr = complete_eval(a, child_context)
|
522
|
-
new_shape = read_final_result(complete_eval(b, child_context))
|
523
|
-
|
524
|
-
if new_shape.size.zero? && arr.buffer.size == 1
|
525
|
-
arr.shape = new_shape
|
526
|
-
arr
|
527
|
-
else
|
528
|
-
new_shape = TensorShape.fix_inferred_elements(new_shape, arr.buffer.size)
|
529
|
-
arr.shape = new_shape
|
530
|
-
arr
|
531
|
-
end
|
532
|
-
when :random_uniform
|
533
|
-
maxval = tensor.options.fetch(:maxval, 1)
|
534
|
-
minval = tensor.options.fetch(:minval, 0)
|
535
|
-
seed = tensor.options[:seed]
|
536
|
-
|
537
|
-
random = _get_randomizer(tensor, seed)
|
538
|
-
generator = -> { random.rand * (maxval - minval) + minval }
|
539
|
-
shape = tensor.options[:shape] || tensor.shape.shape
|
540
|
-
|
541
|
-
convert_to_opencl(generate_vector(shape, generator: generator), shape, data_type: tensor.data_type, name: tensor.name)
|
542
|
-
when :random_normal
|
543
|
-
random = _get_randomizer(tensor, seed)
|
544
|
-
r = RandomGaussian.new(tensor.options.fetch(:mean), tensor.options.fetch(:stddev), -> { random.rand })
|
545
|
-
random = _get_randomizer(tensor, seed)
|
546
|
-
generator = -> { r.rand }
|
547
|
-
shape = tensor.options[:shape] || tensor.shape.shape
|
548
|
-
|
549
|
-
convert_to_opencl(generate_vector(shape, generator: generator), shape, data_type: tensor.data_type, name: tensor.name)
|
550
|
-
when :glorot_uniform
|
551
|
-
random = _get_randomizer(tensor, seed)
|
552
|
-
|
553
|
-
shape = tensor.options[:shape] || tensor.shape.shape
|
554
|
-
fan_in, fan_out = if shape.size.zero?
|
555
|
-
[1, 1]
|
556
|
-
elsif shape.size == 1
|
557
|
-
[1, shape[0]]
|
558
|
-
else
|
559
|
-
[shape[0], shape.last]
|
560
|
-
end
|
561
|
-
|
562
|
-
limit = Math.sqrt(6.0 / (fan_in + fan_out))
|
563
|
-
|
564
|
-
minval = -limit
|
565
|
-
maxval = limit
|
566
|
-
|
567
|
-
generator = -> { random.rand * (maxval - minval) + minval }
|
568
|
-
convert_to_opencl(generate_vector(shape, generator: generator), shape, data_type: tensor.data_type, name: tensor.name)
|
569
|
-
when :flow_group
|
570
|
-
tensor.items.collect { |item| _run(item, child_context) }
|
571
|
-
when :sum
|
572
|
-
reduction(child_context, tensor, a, b, :sum)
|
573
|
-
when :mean
|
574
|
-
reduction(child_context, tensor, a, b, :mean)
|
575
|
-
when :prod
|
576
|
-
input_a = complete_eval(a, child_context)
|
577
|
-
if input_a.buffer.empty?
|
578
|
-
convert_to_opencl([1.0], [], data_type: a.data_type, name: tensor.name)
|
579
|
-
else
|
580
|
-
reduction(child_context, tensor, a, b, :prod)
|
581
|
-
end
|
582
|
-
when :argmin
|
583
|
-
a = complete_eval(a, child_context)
|
584
|
-
axis = tensor.options[:axis] || 0
|
585
|
-
arr = a.buffer.reshape(*a.shape.reverse).to_a
|
586
|
-
op = get_op_with_axis(arr, axis, 0, a.data_type, ->(a, b) { a < b })
|
587
|
-
convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
|
588
|
-
when :argmax
|
589
|
-
a = complete_eval(a, child_context)
|
590
|
-
axis = tensor.options[:axis] || 0
|
591
|
-
arr = a.buffer.reshape(*a.shape.reverse).to_a
|
592
|
-
op = get_op_with_axis(arr, axis, 0, a.data_type, ->(a, b) { a > b })
|
593
|
-
convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
|
436
|
+
# Handler for the :print operation: evaluates the second input, writes it to
# standard output prefixed by tensor.options[:message] (empty when absent),
# and returns the first input unchanged.
register_op :print do |context, tensor, inputs|
  passthrough, data = inputs
  printable = read_final_result(complete_eval(data, context))
  puts "#{tensor.options.fetch(:message, '')} #{printable}"
  passthrough
end
|
443
|
+
|
444
|
+
# Handler for the :rank operation: wraps the number of dimensions of the
# first input's shape.
register_op :rank do |_context, tensor, inputs|
  dimensions = inputs[0].shape.size
  wrap_opencl(dimensions, data_type: tensor.data_type, name: tensor.name)
end
|
447
|
+
|
448
|
+
# Handler for the :stop_gradient operation: returns its first input unchanged.
register_op(:stop_gradient) { |_context, _tensor, inputs| inputs[0] }
|
451
|
+
|
452
|
+
# Handler for the :slice operation. The second input supplies the start
# offsets and tensor.options[:size] the extent per dimension; together they
# form inclusive ranges. The ranges and the shape are both reversed before
# being applied to the host buffer, and the resulting shape is reversed
# back for the wrapped result.
register_op :slice, noop: true do |context, tensor, inputs|
  source = complete_eval(inputs[0], context)
  offsets = read_final_result(complete_eval(inputs[1], context))
  extents = tensor.options[:size]

  ranges = offsets.zip(extents).map { |start, length| start..start + length - 1 }
  ranges = ranges.reverse

  reshaped = source.buffer.reshape(*source.shape.reverse)
  sliced = reshaped.slice[*ranges]
  convert_to_opencl(sliced.flatten, sliced.shape.reverse, data_type: inputs[0].data_type, name: tensor.name)
end
|
463
|
+
|
464
|
+
# Handler for the :transpose operation: reshapes the host buffer with the
# reversed shape, transposes it with the axis list in descending order, and
# wraps the flattened result with its shape reversed back.
register_op :transpose, buffer: true do |_context, tensor, inputs|
  source = inputs[0]
  axes = (0...source.shape.size).to_a.reverse
  transposed = source.buffer.reshape(*source.shape.reverse).transpose(*axes)
  convert_to_opencl(transposed.flatten, transposed.shape.reverse, data_type: source.data_type, name: tensor.name)
end
|
469
|
+
|
470
|
+
# Handler for the :index operation: picks the element at the position given
# by the second input. A plain Array input is indexed directly; otherwise
# the read-back values are indexed and wrapped with the input's shape minus
# its leading dimension.
register_op :index, buffer: true do |_context, tensor, inputs|
  a = inputs[0]
  values = read_final_result(a)
  position = read_final_result(inputs[1])

  next a[position] if a.is_a?(Array)

  remaining_shape = a.shape.dup
  remaining_shape.shift
  convert_to_opencl(values[position], remaining_shape, data_type: a.data_type, name: tensor.name)
end
|
483
|
+
|
484
|
+
# Handler for the :broadcast_gradient_args operation: converts both input
# buffers to arrays, passes them to get_broadcast_gradient_args, and wraps
# the outcome using the first input's data type.
register_op :broadcast_gradient_args, buffer: true do |_context, tensor, inputs|
  first, second = inputs[0], inputs[1]
  gradient_args = get_broadcast_gradient_args(first.buffer.to_a, second.buffer.to_a)
  wrap_opencl(gradient_args, data_type: first.data_type, name: tensor.name)
end
|
487
|
+
|
488
|
+
# Handler for the :shape operation: wraps the first input's shape, typed as
# tensor.options[:out_type] when given and :float32 otherwise.
register_op :shape do |_context, tensor, inputs|
  source_shape = inputs[0].shape
  wrap_opencl(source_shape, name: tensor.name, data_type: tensor.options[:out_type] || :float32)
end
|
491
|
+
|
492
|
+
# Handler for the :reshape operation: re-labels the first input with the
# shape read from the second input and returns that same buffer object.
# Unless the target is the empty shape on a single-element buffer, the shape
# first goes through TensorShape.fix_inferred_elements together with the
# buffer size.
register_op :reshape, buffer: true do |_context, _tensor, inputs|
  arr = inputs[0]
  new_shape = read_final_result(inputs[1])

  scalar_target = new_shape.size.zero? && arr.buffer.size == 1
  new_shape = TensorShape.fix_inferred_elements(new_shape, arr.buffer.size) unless scalar_target

  arr.shape = new_shape
  arr
end
|
505
|
+
|
506
|
+
# Handler for the :flow_group operation: returns the full list of inputs.
register_op(:flow_group) { |_context, _tensor, inputs| inputs }
|
509
|
+
|
510
|
+
# Registers the :sum and :mean handlers; each forwards both inputs to
# `reduction` together with its own operation symbol.
%i[sum mean].each do |reducer|
  register_op(reducer, noop: true) do |context, tensor, inputs|
    # reducer is already a Symbol, so it is passed along as is.
    reduction(context, tensor, inputs[0], inputs[1], reducer)
  end
end
|
515
|
+
|
516
|
+
# Handler for the :prod operation: an input whose evaluated buffer is empty
# yields a scalar 1.0; anything else goes through `reduction` with :prod.
register_op :prod, noop: true do |context, tensor, inputs|
  evaluated = complete_eval(inputs[0], context)
  next reduction(context, tensor, inputs[0], inputs[1], :prod) unless evaluated.buffer.empty?

  convert_to_opencl([1.0], [], data_type: inputs[0].data_type, name: tensor.name)
end
|
524
|
+
|
525
|
+
# Handler for the :argmin operation: reshapes the host buffer with the
# reversed shape, converts it to nested arrays, and calls get_op_with_axis
# with a "less than" comparator along tensor.options[:axis] (0 when unset).
register_op :argmin, buffer: true do |_context, tensor, inputs|
  source = inputs[0]
  axis = tensor.options[:axis] || 0
  nested = source.buffer.reshape(*source.shape.reverse).to_a
  smaller = lambda { |left, right| left < right }
  indices = get_op_with_axis(nested, axis, 0, source.data_type, smaller)
  convert_to_opencl(indices, shape_eval(indices), data_type: tensor.data_type, name: tensor.name)
end
|
531
|
+
|
532
|
+
# Handler for the :argmax operation: reshapes the host buffer with the
# reversed shape, converts it to nested arrays, and calls get_op_with_axis
# with a "greater than" comparator along tensor.options[:axis] (0 when unset).
register_op :argmax, buffer: true do |_context, tensor, inputs|
  source = inputs[0]
  axis = tensor.options[:axis] || 0
  nested = source.buffer.reshape(*source.shape.reverse).to_a
  larger = lambda { |left, right| left > right }
  indices = get_op_with_axis(nested, axis, 0, source.data_type, larger)
  convert_to_opencl(indices, shape_eval(indices), data_type: tensor.data_type, name: tensor.name)
end
|
538
|
+
|
539
|
+
def eval_operation(tensor, child_context)
|
540
|
+
return @context[tensor.name] if @context.key?(tensor.name)
|
541
|
+
cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}:#{object_id}"
|
542
|
+
return @context[cache_key] if @context.key?(cache_key)
|
543
|
+
# puts tensor.name
|
544
|
+
invoke(tensor, child_context).tap do |result|
|
597
545
|
# puts "#{tensor.to_math(true,1)} = #{read_final_result(complete_eval(result, child_context))}"
|
598
546
|
if tensor.breakpoint
|
547
|
+
a = resolve_placeholder(tensor.inputs[0], child_context) if tensor.inputs && tensor.inputs[0]
|
548
|
+
b = resolve_placeholder(tensor.inputs[1], child_context) if tensor.inputs && tensor.inputs[1]
|
599
549
|
a = read_final_result(complete_eval(a, child_context))
|
600
550
|
b = read_final_result(complete_eval(b, child_context))
|
601
551
|
result = read_final_result(complete_eval(result, child_context))
|
@@ -642,7 +592,7 @@ module TensorStream
|
|
642
592
|
def eval_tensor(tensor, child_context)
|
643
593
|
return tensor unless tensor.is_a?(Tensor)
|
644
594
|
|
645
|
-
cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}"
|
595
|
+
cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}:#{object_id}"
|
646
596
|
return @context[cache_key] if @context.key?(cache_key)
|
647
597
|
return @context[:_cache][cache_key] if tensor.is_const && @context[:_cache][cache_key]
|
648
598
|
@context[cache_key] = if tensor.value.is_a?(Tensor)
|
@@ -656,7 +606,7 @@ module TensorStream
|
|
656
606
|
private
|
657
607
|
|
658
608
|
def assign_var(tensor, b, child_context)
|
659
|
-
assign = tensor.
|
609
|
+
assign = tensor.inputs[0] || tensor
|
660
610
|
buffer = complete_eval(b, child_context)
|
661
611
|
|
662
612
|
if assign.buffer
|
@@ -678,7 +628,7 @@ module TensorStream
|
|
678
628
|
dtype = tensor.data_type
|
679
629
|
result_shape = TensorShape.infer_shape(a.shape, b.shape)
|
680
630
|
|
681
|
-
output_buffer = _create_result_buffer(tensor.data_type, result_shape, tensor.name)
|
631
|
+
output_buffer = _create_result_buffer(tensor.data_type, result_shape, "out_#{tensor.name}")
|
682
632
|
a, b, prog, switch_operands = select_program(a, b, op_name)
|
683
633
|
m, n = result_shape
|
684
634
|
work_group = [m || 1, n || 1]
|
@@ -688,6 +638,7 @@ module TensorStream
|
|
688
638
|
|
689
639
|
event_wait_list = [a.op, b.op].compact # add dependency wait list
|
690
640
|
|
641
|
+
method_call = :"#{prog}_#{a.data_type}_#{b.data_type}"
|
691
642
|
event = if prog == "#{op_name}_b"
|
692
643
|
cl_m_b, cl_n_b = if b.shape.size == 2
|
693
644
|
[ OpenCL::Int1.new(b.shape[0]), OpenCL::Int1.new(b.shape[1]) ]
|
@@ -696,9 +647,9 @@ module TensorStream
|
|
696
647
|
else
|
697
648
|
raise "rank > 2 not supported!"
|
698
649
|
end
|
699
|
-
_cl_program("#{prog_name || op_name}", dtype: dtype).send(
|
650
|
+
_cl_program("#{prog_name || op_name}", a: a.data_type, b: b.data_type, dtype: dtype).send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_m_b, cl_n_b, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
700
651
|
else
|
701
|
-
_cl_program("#{prog_name || op_name}", dtype: dtype).send(
|
652
|
+
_cl_program("#{prog_name || op_name}", a: a.data_type, b: b.data_type, dtype: dtype).send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
702
653
|
end
|
703
654
|
|
704
655
|
output_buffer.op = event
|
@@ -784,7 +735,7 @@ module TensorStream
|
|
784
735
|
value = [value]
|
785
736
|
end
|
786
737
|
|
787
|
-
cache_key = "_cl_object_#{name}
|
738
|
+
cache_key = "_cl_object_#{name}:#{shape.join('_')}:#{object_id}"
|
788
739
|
cl_object = if name && @context[:_cache][cache_key]
|
789
740
|
@context[:_cache][cache_key]
|
790
741
|
else
|
@@ -813,13 +764,13 @@ module TensorStream
|
|
813
764
|
if element.is_a?(Tensor)
|
814
765
|
cl_object.buffer[index] = read_final_result(complete_eval(element, {}))
|
815
766
|
else
|
816
|
-
cl_object.buffer[index] = Tensor.cast_dtype(element, data_type)
|
767
|
+
cl_object.buffer[index] = ( data_type == :boolean ? ( element ? 1 : 0 ) : Tensor.cast_dtype(element, data_type))
|
817
768
|
end
|
818
769
|
end
|
819
770
|
elsif value.is_a?(NArray)
|
820
771
|
cl_object.buffer = value
|
821
772
|
else
|
822
|
-
cl_object.buffer[0] = Tensor.cast_dtype(value, data_type)
|
773
|
+
cl_object.buffer[0] = ( data_type == :boolean ? ( element ? 1 : 0 ) : Tensor.cast_dtype(value, data_type))
|
823
774
|
end
|
824
775
|
|
825
776
|
write_op = if cl_object.cl_buffer && !value.nil? && (!value.is_a?(Array) || !value.empty?)
|
@@ -840,14 +791,14 @@ module TensorStream
|
|
840
791
|
when :int16
|
841
792
|
NArray.sint(narray_size)
|
842
793
|
when :boolean
|
843
|
-
NArray.
|
794
|
+
NArray.sint(narray_size)
|
844
795
|
else
|
845
796
|
raise "unsupported type #{data_type}"
|
846
797
|
end
|
847
798
|
end
|
848
799
|
|
849
800
|
def _create_result_buffer(data_type, shape, name)
|
850
|
-
@context[:_cache][:_cl_buffers]["_result_#{name}_#{shape.join('_')}"] ||= begin
|
801
|
+
@context[:_cache][:_cl_buffers]["_result_#{name}_#{shape.join('_')}:#{object_id}"] ||= begin
|
851
802
|
size = shape.empty? ? 1 : shape.reduce(:*)
|
852
803
|
buffer = allocate_narray_for_type(data_type, size)
|
853
804
|
cl_buffer = _opencl_context.create_buffer(buffer.size * buffer.element_size)
|
@@ -1029,7 +980,6 @@ module TensorStream
|
|
1029
980
|
|
1030
981
|
def resolve_placeholder(placeholder, _execution_context = {})
|
1031
982
|
return nil if placeholder.nil?
|
1032
|
-
return placeholder if retain.include?(placeholder)
|
1033
983
|
|
1034
984
|
var = if placeholder.is_a?(Placeholder)
|
1035
985
|
@context[placeholder.name.to_sym].tap do |c|
|
@@ -1056,7 +1006,7 @@ module TensorStream
|
|
1056
1006
|
reduced_val = r[0]
|
1057
1007
|
if r.size > 1
|
1058
1008
|
reduced_val = f.call(r[0..val.size])
|
1059
|
-
elsif r.size
|
1009
|
+
elsif r.size.zero?
|
1060
1010
|
reduced_val = f.call(nil)
|
1061
1011
|
end
|
1062
1012
|
keep_dims ? [ reduced_val ] : reduced_val
|
@@ -1143,3 +1093,5 @@ module TensorStream
|
|
1143
1093
|
end
|
1144
1094
|
end
|
1145
1095
|
end
|
1096
|
+
|
1097
|
+
TensorStream::Evaluator.register_evaluator(TensorStream::Evaluator::OpenclEvaluator, 'opencl', 1)
|