tensor_stream 0.3.0 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.circleci/config.yml +7 -7
- data/CHANGELOG.md +13 -0
- data/Dockerfile +25 -0
- data/Rakefile +6 -0
- data/benchmark/benchmark.rb +16 -57
- data/benchmark_intel.txt +21 -0
- data/benchmark_nvidia.txt +33 -0
- data/lib/tensor_stream.rb +4 -173
- data/lib/tensor_stream/debugging/debugging.rb +20 -0
- data/lib/tensor_stream/evaluator/kernels/abs.cl +9 -5
- data/lib/tensor_stream/evaluator/kernels/add.cl +2 -4
- data/lib/tensor_stream/evaluator/kernels/argmax.cl +2 -9
- data/lib/tensor_stream/evaluator/kernels/argmin.cl +2 -9
- data/lib/tensor_stream/evaluator/kernels/cast.cl +3 -8
- data/lib/tensor_stream/evaluator/kernels/cond.cl.erb +1 -1
- data/lib/tensor_stream/evaluator/kernels/cos.cl +2 -1
- data/lib/tensor_stream/evaluator/kernels/div.cl.erb +2 -4
- data/lib/tensor_stream/evaluator/kernels/exp.cl +2 -1
- data/lib/tensor_stream/evaluator/kernels/gemm.cl +8 -39
- data/lib/tensor_stream/evaluator/kernels/log.cl +2 -1
- data/lib/tensor_stream/evaluator/kernels/log1p.cl +2 -1
- data/lib/tensor_stream/evaluator/kernels/max.cl +4 -49
- data/lib/tensor_stream/evaluator/kernels/mul.cl +2 -4
- data/lib/tensor_stream/evaluator/kernels/negate.cl +2 -9
- data/lib/tensor_stream/evaluator/kernels/pow.cl +4 -88
- data/lib/tensor_stream/evaluator/kernels/reciprocal.cl +2 -9
- data/lib/tensor_stream/evaluator/kernels/round.cl +2 -1
- data/lib/tensor_stream/evaluator/kernels/sigmoid.cl +2 -1
- data/lib/tensor_stream/evaluator/kernels/sigmoid_grad.cl +6 -5
- data/lib/tensor_stream/evaluator/kernels/sign.cl +12 -14
- data/lib/tensor_stream/evaluator/kernels/sin.cl +2 -1
- data/lib/tensor_stream/evaluator/kernels/softmax.cl +26 -0
- data/lib/tensor_stream/evaluator/kernels/softmax_grad.cl +46 -0
- data/lib/tensor_stream/evaluator/kernels/sqrt.cl +2 -1
- data/lib/tensor_stream/evaluator/kernels/square.cl +2 -8
- data/lib/tensor_stream/evaluator/kernels/sub.cl +2 -4
- data/lib/tensor_stream/evaluator/kernels/tan.cl +2 -1
- data/lib/tensor_stream/evaluator/kernels/tanh.cl +2 -1
- data/lib/tensor_stream/evaluator/kernels/tanh_grad.cl +2 -1
- data/lib/tensor_stream/evaluator/kernels/where.cl +2 -9
- data/lib/tensor_stream/evaluator/opencl_evaluator.rb +108 -58
- data/lib/tensor_stream/evaluator/opencl_template_helper.rb +40 -5
- data/lib/tensor_stream/evaluator/operation_helpers/array_ops_helper.rb +35 -0
- data/lib/tensor_stream/evaluator/ruby_evaluator.rb +30 -9
- data/lib/tensor_stream/graph_serializers/graphml.rb +1 -1
- data/lib/tensor_stream/graph_serializers/pbtext.rb +4 -0
- data/lib/tensor_stream/math_gradients.rb +6 -5
- data/lib/tensor_stream/nn/nn_ops.rb +18 -2
- data/lib/tensor_stream/ops.rb +237 -44
- data/lib/tensor_stream/tensor.rb +16 -2
- data/lib/tensor_stream/utils.rb +205 -0
- data/lib/tensor_stream/variable.rb +2 -1
- data/lib/tensor_stream/version.rb +1 -1
- data/samples/error.graphml +2755 -0
- data/{test_samples → samples}/iris.rb +18 -24
- data/samples/logistic_regression.rb +0 -1
- data/test_samples/raw_neural_net_sample.rb +80 -23
- metadata +11 -3
@@ -1,4 +1,5 @@
|
|
1
|
-
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
|
2
|
+
__kernel void round_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
2
3
|
// Get the index of the current element to be processed
|
3
4
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
4
5
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
@@ -1,5 +1,6 @@
|
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
|
1
2
|
|
2
|
-
__kernel void
|
3
|
+
__kernel void sigmoid_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
3
4
|
// Get the index of the current element to be processed
|
4
5
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
5
6
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
@@ -1,14 +1,15 @@
|
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
|
1
2
|
|
2
|
-
float sigmoid(
|
3
|
+
float sigmoid(<%= c_dtype %> x) {
|
3
4
|
return 1.0f/(1.0f + exp(-x));
|
4
5
|
}
|
5
6
|
|
6
|
-
float sigmoid_grad(
|
7
|
+
float sigmoid_grad(<%= c_dtype %> x, <%= c_dtype %> g) {
|
7
8
|
return g * sigmoid(x) * ( 1.0f - sigmoid(x));
|
8
9
|
}
|
9
10
|
|
10
11
|
// same dimension add floating point op
|
11
|
-
__kernel void
|
12
|
+
__kernel void sigmoid_grad_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
12
13
|
// Get the index of the current element to be processed
|
13
14
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
14
15
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
@@ -17,7 +18,7 @@ float sigmoid_grad(float x, float g) {
|
|
17
18
|
}
|
18
19
|
|
19
20
|
// 1D + Scalar floating point add op
|
20
|
-
__kernel void
|
21
|
+
__kernel void sigmoid_grad_c_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
21
22
|
// Get the index of the current element to be processed
|
22
23
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
23
24
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
@@ -30,7 +31,7 @@ float sigmoid_grad(float x, float g) {
|
|
30
31
|
}
|
31
32
|
|
32
33
|
// 1D + Scalar floating point add op broadcast
|
33
|
-
__kernel void
|
34
|
+
__kernel void sigmoid_grad_b_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
34
35
|
// Get the index of the current element to be processed
|
35
36
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
36
37
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
@@ -1,23 +1,21 @@
|
|
1
|
-
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
|
2
|
+
|
3
|
+
__kernel void sign_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
2
4
|
// Get the index of the current element to be processed
|
3
5
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
4
6
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
5
|
-
|
7
|
+
<%= c_dtype %> value = A[globalRow * N + globalCol];
|
8
|
+
% if is_floating_point?(dtype)
|
6
9
|
if (isnan(value) || value == 0.0f) {
|
7
10
|
C[globalRow * N + globalCol] = 0.0;
|
8
11
|
} else {
|
9
12
|
C[globalRow * N + globalCol] = value < 0 ? -1.0 : 1.0;
|
10
13
|
}
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
if (isnan(value) || value == 0) {
|
19
|
-
C[globalRow * N + globalCol] = 0;
|
20
|
-
} else {
|
21
|
-
C[globalRow * N + globalCol] = value < 0 ? -1 : 1;
|
22
|
-
}
|
14
|
+
% else
|
15
|
+
if (value == 0) {
|
16
|
+
C[globalRow * N + globalCol] = 0;
|
17
|
+
} else {
|
18
|
+
C[globalRow * N + globalCol] = value < 0 ? -1 : 1;
|
19
|
+
}
|
20
|
+
% end
|
23
21
|
}
|
@@ -1,5 +1,6 @@
|
|
1
1
|
|
2
|
-
|
2
|
+
% c_dtype = dtype_to_c_type(dtype)
|
3
|
+
__kernel void sin_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
3
4
|
// Get the index of the current element to be processed
|
4
5
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
5
6
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
@@ -0,0 +1,26 @@
|
|
1
|
+
// First naive implementation
|
2
|
+
% c_dtype = dtype_to_c_type(dtype)
|
3
|
+
__kernel void softmax_<%= dtype %>(const int N,
|
4
|
+
const __global <%= c_dtype %>* A,
|
5
|
+
__global <%= c_dtype %>* C) {
|
6
|
+
|
7
|
+
// Get the index of the current element to be processed
|
8
|
+
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
9
|
+
|
10
|
+
// Compute a single element (loop over K)
|
11
|
+
<%= c_dtype %> acc = 0.0f;
|
12
|
+
<%= c_dtype %> max = <%= min_value_for(dtype) %>;
|
13
|
+
|
14
|
+
for (int k=0; k<N; k++) {
|
15
|
+
max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
|
16
|
+
}
|
17
|
+
|
18
|
+
for (int k=0; k<N; k++) {
|
19
|
+
acc += exp(A[globalRow*N + k] - max);
|
20
|
+
}
|
21
|
+
|
22
|
+
// Store the result
|
23
|
+
for (int k=0; k < N; k++) {
|
24
|
+
C[globalRow*N + k] = exp(A[globalRow*N + k] - max) / acc;
|
25
|
+
}
|
26
|
+
}
|
@@ -0,0 +1,46 @@
|
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
|
2
|
+
__kernel void softmax_grad_<%= dtype %>(const int N,
|
3
|
+
const __global <%= c_dtype %>* A,
|
4
|
+
const __global <%= c_dtype %>* G,
|
5
|
+
__global <%= c_dtype %>* C) {
|
6
|
+
|
7
|
+
// Get the index of the current element to be processed
|
8
|
+
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
9
|
+
|
10
|
+
// Compute a single element (loop over K)
|
11
|
+
float acc = 0.0f;
|
12
|
+
float max = FLT_MIN;
|
13
|
+
float row[<%= size %>];
|
14
|
+
float grads[<%= size %>][<%= size %>];
|
15
|
+
|
16
|
+
for (int k=0; k<N; k++) {
|
17
|
+
max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
|
18
|
+
}
|
19
|
+
|
20
|
+
for (int k=0; k<N; k++) {
|
21
|
+
acc += exp(A[globalRow*N + k] - max);
|
22
|
+
}
|
23
|
+
|
24
|
+
// Store the result
|
25
|
+
for (int k=0; k < N; k++) {
|
26
|
+
row[k] = exp(A[globalRow*N + k] - max) / acc;
|
27
|
+
}
|
28
|
+
|
29
|
+
for (int a=0; a < N; a++) {
|
30
|
+
for(int b=0; b < N; b++) {
|
31
|
+
if (a != b) {
|
32
|
+
grads[a][b] = -row[a] * row[b];
|
33
|
+
} else {
|
34
|
+
grads[a][b] = row[a] * (1.0f - row[a]);
|
35
|
+
}
|
36
|
+
}
|
37
|
+
}
|
38
|
+
|
39
|
+
for (int k=0; k < N; k++) {
|
40
|
+
float total_grad = 0.0f;
|
41
|
+
for (int a = 0; a < N; a++) {
|
42
|
+
total_grad += grads[a][k] * G[globalRow*N + a];
|
43
|
+
}
|
44
|
+
C[globalRow*N + k] = total_grad;
|
45
|
+
}
|
46
|
+
}
|
@@ -1,5 +1,6 @@
|
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
|
1
2
|
|
2
|
-
__kernel void
|
3
|
+
__kernel void sqrt_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
3
4
|
// Get the index of the current element to be processed
|
4
5
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
5
6
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
@@ -1,12 +1,6 @@
|
|
1
|
-
|
2
|
-
// Get the index of the current element to be processed
|
3
|
-
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
4
|
-
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
5
|
-
|
6
|
-
C[globalRow * N + globalCol] = A[globalRow * N + globalCol] * A[globalRow * N + globalCol];
|
7
|
-
}
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
|
8
2
|
|
9
|
-
__kernel void
|
3
|
+
__kernel void square_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
10
4
|
// Get the index of the current element to be processed
|
11
5
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
12
6
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
@@ -1,5 +1,3 @@
|
|
1
|
-
% %w[fp int].product(%w[sub]).each do |dtype, fname|
|
2
1
|
% c_dtype = dtype_to_c_type(dtype)
|
3
|
-
% op = operator_to_c(
|
4
|
-
<%= render 'operand.cl', c_dtype: c_dtype, op: op, fname:
|
5
|
-
% end
|
2
|
+
% op = operator_to_c('sub')
|
3
|
+
<%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'sub', dtype: dtype, result_t: c_dtype %>
|
@@ -1,4 +1,5 @@
|
|
1
|
-
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
|
2
|
+
__kernel void tan_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
2
3
|
// Get the index of the current element to be processed
|
3
4
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
4
5
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
@@ -1,4 +1,5 @@
|
|
1
|
-
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
|
2
|
+
__kernel void tanh_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
2
3
|
// Get the index of the current element to be processed
|
3
4
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
4
5
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
@@ -1,4 +1,5 @@
|
|
1
|
-
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
|
2
|
+
__kernel void tanh_grad_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
2
3
|
// Get the index of the current element to be processed
|
3
4
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
4
5
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
@@ -1,12 +1,5 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
4
|
-
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
5
|
-
|
6
|
-
C[globalRow * N + globalCol] = PRED[globalRow * N + globalCol] ? A[globalRow * N + globalCol] : B[globalRow * N + globalCol];
|
7
|
-
}
|
8
|
-
|
9
|
-
__kernel void where_int(const int M, const int N, __global const int *PRED, __global const int *A, __global const int *B, __global int *C) {
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
|
2
|
+
__kernel void where_<%= dtype %>(const int M, const int N, __global const int *PRED, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
10
3
|
// Get the index of the current element to be processed
|
11
4
|
const int globalRow = get_global_id(0); // Row ID of C (0..M)
|
12
5
|
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
@@ -41,20 +41,18 @@ module TensorStream
|
|
41
41
|
@preferred_device = preferred_device
|
42
42
|
@retain = context[:retain] || []
|
43
43
|
@thread_pool = thread_pool || Concurrent::ImmediateExecutor.new
|
44
|
-
|
44
|
+
@context[:_cache][:_cl_buffers] ||= {} if @context[:_cache]
|
45
45
|
@context[:compute_history] = [] if log_intermediates
|
46
46
|
end
|
47
47
|
|
48
48
|
# opencl evaluator main entrypoint
|
49
49
|
def run(tensor, execution_context)
|
50
50
|
_create_opencl_context
|
51
|
-
|
52
|
-
|
51
|
+
create_command_queue
|
53
52
|
read_final_result(complete_eval(tensor, execution_context))
|
54
53
|
end
|
55
54
|
|
56
55
|
def complete_eval(tensor, context)
|
57
|
-
create_command_queue
|
58
56
|
buffer = _run(tensor, context)
|
59
57
|
if buffer.is_a?(Array)
|
60
58
|
buffer = buffer.collect do |b|
|
@@ -66,7 +64,6 @@ module TensorStream
|
|
66
64
|
return buffer if buffer.nil? || buffer.buffer.size.zero?
|
67
65
|
_opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: [buffer.op].compact)
|
68
66
|
end
|
69
|
-
|
70
67
|
_opencl_queue.finish
|
71
68
|
buffer
|
72
69
|
end
|
@@ -91,15 +88,18 @@ module TensorStream
|
|
91
88
|
@preferred_device
|
92
89
|
else
|
93
90
|
device, _score, _platform, _index = choose_best_device
|
91
|
+
# puts "using #{device.name}"
|
94
92
|
device
|
95
93
|
end
|
96
94
|
end
|
95
|
+
@context[:cl_device] = opencl_device
|
97
96
|
@context[:_cache][:_opencl_context] ||= OpenCL.create_context(opencl_device)
|
98
97
|
end
|
99
98
|
|
100
99
|
def choose_best_device
|
101
100
|
@best_device ||= begin
|
102
101
|
devices = OpenCL.platforms.flat_map do |p|
|
102
|
+
|
103
103
|
p.devices.select { |d| d.available > 0 }.each_with_index.collect do |d, index|
|
104
104
|
score = 0
|
105
105
|
if d.type.to_s == 'CPU'
|
@@ -108,13 +108,18 @@ module TensorStream
|
|
108
108
|
score += 4
|
109
109
|
end
|
110
110
|
|
111
|
+
if d.platform.name == 'NVIDIA CUDA'
|
112
|
+
score += 1000
|
113
|
+
end
|
114
|
+
|
111
115
|
score += d.max_compute_units
|
116
|
+
score += d.max_clock_frequency
|
112
117
|
|
113
118
|
[d, score, p.name, index]
|
114
119
|
end
|
115
120
|
end
|
121
|
+
devices.sort { |a| a[1] }.reverse.first
|
116
122
|
end
|
117
|
-
devices.max { |a| a[1] }
|
118
123
|
end
|
119
124
|
|
120
125
|
def create_command_queue
|
@@ -137,11 +142,13 @@ module TensorStream
|
|
137
142
|
File.join(File.dirname(__FILE__), 'kernels', "#{kernel}.#{extension}")
|
138
143
|
end
|
139
144
|
|
140
|
-
def _cl_program(kernel)
|
141
|
-
|
145
|
+
def _cl_program(kernel, args = {})
|
146
|
+
suffix = args.collect { |k,v| "#{k}.#{v}"}.join('.')
|
147
|
+
@context[:_cache]["_opencl_kernel_#{kernel}.#{suffix}"] ||= begin
|
142
148
|
filename = %w[cl.erb cl].map { |ext| cl_template_path(kernel, ext) }.find { |n| File.exist?(n) }
|
143
149
|
source = File.read(filename)
|
144
|
-
source = OpenclTemplateHelper.new(source).generate
|
150
|
+
source = OpenclTemplateHelper.new(source).generate(args)
|
151
|
+
File.write("/tmp/#{kernel}.#{suffix}.cl", source)
|
145
152
|
program = _opencl_context.create_program_with_source(source)
|
146
153
|
program.build
|
147
154
|
rescue OpenCL::Error::BUILD_PROGRAM_FAILURE => e
|
@@ -152,7 +159,9 @@ module TensorStream
|
|
152
159
|
|
153
160
|
def _run(tensor, execution_context)
|
154
161
|
return tensor if tensor.is_a?(OpenCLBuffer)
|
155
|
-
|
162
|
+
if tensor.is_a?(Array) && tensor.size > 0 && tensor[0].is_a?(Tensor)
|
163
|
+
return tensor.map { |t| _run(t, execution_context) }
|
164
|
+
end
|
156
165
|
|
157
166
|
return tensor if retain.include?(tensor) # if var is in retain don't eval to value
|
158
167
|
|
@@ -180,10 +189,11 @@ module TensorStream
|
|
180
189
|
|
181
190
|
def eval_operation(tensor, child_context)
|
182
191
|
return @context[tensor.name] if @context.key?(tensor.name)
|
183
|
-
|
192
|
+
cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}"
|
193
|
+
return @context[cache_key] if @context.key?(cache_key)
|
184
194
|
a = resolve_placeholder(tensor.items[0], child_context) if tensor.items && tensor.items[0]
|
185
195
|
b = resolve_placeholder(tensor.items[1], child_context) if tensor.items && tensor.items[1]
|
186
|
-
|
196
|
+
# puts tensor.name
|
187
197
|
case tensor.operation
|
188
198
|
when :concat
|
189
199
|
input_a = read_final_result(complete_eval(a, child_context))
|
@@ -238,7 +248,6 @@ module TensorStream
|
|
238
248
|
when :assign_add
|
239
249
|
a = _run(a, child_context)
|
240
250
|
b = _run(b, child_context)
|
241
|
-
|
242
251
|
value = execute_2_operand_func('add', tensor, a, b, child_context)
|
243
252
|
assign_var(tensor, value, child_context)
|
244
253
|
when :assign_sub
|
@@ -290,8 +299,8 @@ module TensorStream
|
|
290
299
|
raise "#{tensor.items[1].name} rank must be greater than 1" if b.shape.size < 2
|
291
300
|
raise "incompatible shape sizes for matrix multiplication (#{a.shape[1]} != #{b.shape[0]}) #{a.shape} vs #{b.shape}" if k != v
|
292
301
|
|
293
|
-
dtype =
|
294
|
-
a, b =
|
302
|
+
dtype = tensor.data_type
|
303
|
+
a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
|
295
304
|
output_buffer = _create_result_buffer(a.data_type, result_shape, tensor.name)
|
296
305
|
|
297
306
|
cl_m = OpenCL::Int1.new(m)
|
@@ -301,7 +310,7 @@ module TensorStream
|
|
301
310
|
transpose_a = OpenCL::Int1.new(tensor.options[:transpose_a] ? 1 : 0)
|
302
311
|
transpose_b = OpenCL::Int1.new(tensor.options[:transpose_b] ? 1 : 0)
|
303
312
|
|
304
|
-
output_buffer.op = _cl_program('gemm').send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer)
|
313
|
+
output_buffer.op = _cl_program('gemm', dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer)
|
305
314
|
output_buffer
|
306
315
|
when :mul
|
307
316
|
execute_2_operand_func('mul', tensor, a, b, child_context)
|
@@ -311,14 +320,12 @@ module TensorStream
|
|
311
320
|
a = _run(a, child_context)
|
312
321
|
if a.data_type != tensor.data_type
|
313
322
|
buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
|
314
|
-
s_dtype = TensorStream::Ops::FLOATING_POINT_TYPES.include?(a.data_type) ? 'fp' : 'int'
|
315
|
-
t_dtype = TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type) ? 'fp' : 'int'
|
316
323
|
m, n = a.shape
|
317
324
|
cl_m = OpenCL::Int1.new(m || 1)
|
318
325
|
cl_n = OpenCL::Int1.new(n || 1)
|
319
326
|
work_group = [m || 1, n || 1]
|
320
327
|
|
321
|
-
buffer.op = _cl_program("cast").
|
328
|
+
buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: tensor.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer)
|
322
329
|
buffer
|
323
330
|
else
|
324
331
|
a
|
@@ -355,6 +362,34 @@ module TensorStream
|
|
355
362
|
execute_func('log1p', tensor, a, child_context)
|
356
363
|
when :round
|
357
364
|
execute_func('round', tensor, a, child_context)
|
365
|
+
when :softmax
|
366
|
+
a = _run(a, child_context)
|
367
|
+
event_wait_list = [a.op].compact
|
368
|
+
dtype = tensor.data_type
|
369
|
+
output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
|
370
|
+
|
371
|
+
m, n = a.shape
|
372
|
+
work_group = [m]
|
373
|
+
n = m if n.nil?
|
374
|
+
cl_n = OpenCL::Int1.new(n || 1)
|
375
|
+
|
376
|
+
event = _cl_program("softmax", dtype: dtype).send(:"softmax_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
377
|
+
output_buffer.op = event
|
378
|
+
output_buffer
|
379
|
+
when :softmax_grad
|
380
|
+
a = _run(a, child_context)
|
381
|
+
grad = _run(b, child_context)
|
382
|
+
event_wait_list = [a.op].compact
|
383
|
+
dtype = tensor.data_type
|
384
|
+
output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
|
385
|
+
|
386
|
+
m, n = a.shape
|
387
|
+
work_group = [m]
|
388
|
+
n = m if n.nil?
|
389
|
+
cl_n = OpenCL::Int1.new(n || 1)
|
390
|
+
event = _cl_program("softmax_grad", dtype: dtype, size: n).send(:"softmax_grad_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
391
|
+
output_buffer.op = event
|
392
|
+
output_buffer
|
358
393
|
when :sigmoid_grad
|
359
394
|
execute_2_operand_func('sigmoid_grad', tensor, a, b, child_context)
|
360
395
|
when :truncate
|
@@ -381,6 +416,14 @@ module TensorStream
|
|
381
416
|
end
|
382
417
|
end
|
383
418
|
end
|
419
|
+
when :check_numerics
|
420
|
+
a = complete_eval(a, child_context)
|
421
|
+
name = tensor.options[:name]
|
422
|
+
|
423
|
+
a.buffer.each do |item|
|
424
|
+
raise "#{name} Invalid Argument" if item.nan? || item.infinite?
|
425
|
+
end
|
426
|
+
a
|
384
427
|
when :zeros, :ones, :zeros_like, :ones_like
|
385
428
|
shape = if %i[zeros_like ones_like].include?(tensor.operation)
|
386
429
|
_run(a, child_context).shape
|
@@ -551,6 +594,7 @@ module TensorStream
|
|
551
594
|
else
|
552
595
|
raise "unknown op #{tensor.operation}"
|
553
596
|
end.tap do |result|
|
597
|
+
# puts "#{tensor.to_math(true,1)} = #{read_final_result(complete_eval(result, child_context))}"
|
554
598
|
if tensor.breakpoint
|
555
599
|
a = read_final_result(complete_eval(a, child_context))
|
556
600
|
b = read_final_result(complete_eval(b, child_context))
|
@@ -568,11 +612,13 @@ module TensorStream
|
|
568
612
|
value: result
|
569
613
|
}
|
570
614
|
end
|
615
|
+
@context[:_cache][cache_key] = @context[cache_key] if tensor.is_const
|
571
616
|
@context[tensor.name] = result
|
572
617
|
end
|
573
618
|
rescue EvaluatorExcecutionException => e
|
574
619
|
raise e
|
575
620
|
rescue StandardError => e
|
621
|
+
_opencl_queue.finish # dump queue
|
576
622
|
puts e.message
|
577
623
|
puts e.backtrace.join("\n")
|
578
624
|
|
@@ -612,8 +658,12 @@ module TensorStream
|
|
612
658
|
def assign_var(tensor, b, child_context)
|
613
659
|
assign = tensor.items[0] || tensor
|
614
660
|
buffer = complete_eval(b, child_context)
|
661
|
+
|
615
662
|
if assign.buffer
|
616
|
-
|
663
|
+
buffer = type_cast(buffer, assign.data_type, name: "#{tensor.name}/cast_#{tensor.name}_#{tensor.data_type}")
|
664
|
+
if assign.buffer.cl_buffer != buffer.cl_buffer
|
665
|
+
assign.buffer.op = _opencl_queue.enqueue_copy_buffer(buffer.cl_buffer, assign.buffer.cl_buffer, event_wait_list: [buffer.op, assign.buffer.op])
|
666
|
+
end
|
617
667
|
else
|
618
668
|
assign.buffer = convert_to_opencl(read_final_result(buffer), buffer.shape, data_type: tensor.data_type, name: tensor.name)
|
619
669
|
end
|
@@ -624,8 +674,8 @@ module TensorStream
|
|
624
674
|
def execute_2_operand_func(op_name, tensor, input_a, input_b, child_context, prog_name = nil)
|
625
675
|
a = _run(input_a, child_context)
|
626
676
|
b = _run(input_b, child_context)
|
627
|
-
a, b =
|
628
|
-
dtype =
|
677
|
+
a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
|
678
|
+
dtype = tensor.data_type
|
629
679
|
result_shape = TensorShape.infer_shape(a.shape, b.shape)
|
630
680
|
|
631
681
|
output_buffer = _create_result_buffer(tensor.data_type, result_shape, tensor.name)
|
@@ -646,9 +696,9 @@ module TensorStream
|
|
646
696
|
else
|
647
697
|
raise "rank > 2 not supported!"
|
648
698
|
end
|
649
|
-
_cl_program("#{prog_name || op_name}").send(:"#{prog}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, cl_m_b, cl_n_b, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
699
|
+
_cl_program("#{prog_name || op_name}", dtype: dtype).send(:"#{prog}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, cl_m_b, cl_n_b, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
650
700
|
else
|
651
|
-
_cl_program("#{prog_name || op_name}").send(:"#{prog}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
701
|
+
_cl_program("#{prog_name || op_name}", dtype: dtype).send(:"#{prog}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
652
702
|
end
|
653
703
|
|
654
704
|
output_buffer.op = event
|
@@ -660,8 +710,8 @@ module TensorStream
|
|
660
710
|
a = _run(input_a, child_context)
|
661
711
|
b = _run(input_b, child_context)
|
662
712
|
|
663
|
-
a, b =
|
664
|
-
dtype =
|
713
|
+
a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
|
714
|
+
dtype = tensor.data_type
|
665
715
|
|
666
716
|
output_buffer = _create_result_buffer(tensor.data_type, p.shape, tensor.name)
|
667
717
|
|
@@ -671,14 +721,14 @@ module TensorStream
|
|
671
721
|
cl_n = OpenCL::Int1.new(n || 1)
|
672
722
|
|
673
723
|
event_wait_list = [a.op, b.op, p.op].compact # add dependency wait list
|
674
|
-
output_buffer.op = _cl_program("#{op_name}").send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, p.cl_buffer, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
724
|
+
output_buffer.op = _cl_program("#{op_name}", dtype: dtype).send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, p.cl_buffer, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
675
725
|
output_buffer
|
676
726
|
end
|
677
727
|
|
678
728
|
def execute_func(op_name, tensor, a, child_context)
|
679
729
|
a = _run(a, child_context)
|
680
|
-
event_wait_list = [a.op].compact
|
681
|
-
dtype =
|
730
|
+
event_wait_list = [a.op].compact
|
731
|
+
dtype = tensor.data_type
|
682
732
|
output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
|
683
733
|
|
684
734
|
m, n = a.shape
|
@@ -686,43 +736,37 @@ module TensorStream
|
|
686
736
|
cl_m = OpenCL::Int1.new(m || 1)
|
687
737
|
cl_n = OpenCL::Int1.new(n || 1)
|
688
738
|
|
689
|
-
event = _cl_program("#{op_name}").send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
739
|
+
event = _cl_program("#{op_name}", dtype: dtype).send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
|
690
740
|
output_buffer.op = event
|
691
741
|
output_buffer
|
692
742
|
end
|
693
743
|
|
694
|
-
def
|
744
|
+
def auto_type_cast(a, b, name: nil)
|
695
745
|
return [a, b] if a.data_type == b.data_type
|
696
746
|
m, n = b.shape
|
697
747
|
work_group = [m || 1, n || 1]
|
698
|
-
|
699
|
-
|
700
|
-
if TensorStream::Ops::INTEGER_TYPES.include?(b.data_type.to_sym)
|
701
|
-
cl_m = OpenCL::Int1.new(m || 1)
|
702
|
-
cl_n = OpenCL::Int1.new(n || 1)
|
748
|
+
event_wait_list = [b.op].compact
|
749
|
+
buffer = _create_result_buffer(b.data_type, b.shape, name)
|
703
750
|
|
704
|
-
|
705
|
-
|
706
|
-
end
|
707
|
-
elsif TensorStream::Ops::INTEGER_TYPES.include?(a.data_type.to_sym)
|
708
|
-
if TensorStream::Ops::FLOATING_POINT_TYPES.include?(b.data_type.to_sym)
|
709
|
-
cl_m = OpenCL::Int1.new(m || 1)
|
710
|
-
cl_n = OpenCL::Int1.new(n || 1)
|
711
|
-
_cl_program("cast").cast_fp_int(_opencl_queue, work_group, cl_m, cl_n, b.cl_buffer, buffer.cl_buffer)
|
712
|
-
return [a, buffer]
|
713
|
-
end
|
714
|
-
end
|
751
|
+
cl_m = OpenCL::Int1.new(m || 1)
|
752
|
+
cl_n = OpenCL::Int1.new(n || 1)
|
715
753
|
|
716
|
-
|
754
|
+
buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: b.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, b.cl_buffer, buffer.cl_buffer, event_wait_list: event_wait_list)
|
755
|
+
[a, buffer]
|
717
756
|
end
|
718
757
|
|
719
|
-
def
|
720
|
-
|
758
|
+
def type_cast(source, data_type, name: nil)
|
759
|
+
return source if source.data_type == data_type
|
760
|
+
m, n = source.shape
|
761
|
+
work_group = [m || 1, n || 1]
|
762
|
+
event_wait_list = [source.op].compact
|
763
|
+
buffer = _create_result_buffer(data_type, source.shape, name)
|
721
764
|
|
722
|
-
|
765
|
+
cl_m = OpenCL::Int1.new(m || 1)
|
766
|
+
cl_n = OpenCL::Int1.new(n || 1)
|
723
767
|
|
724
|
-
|
725
|
-
|
768
|
+
buffer.op = _cl_program("cast", source_dt: source.data_type, target_dt: data_type).cast(_opencl_queue, work_group, cl_m, cl_n, source.cl_buffer, buffer.cl_buffer, event_wait_list: event_wait_list)
|
769
|
+
buffer
|
726
770
|
end
|
727
771
|
|
728
772
|
def wrap_opencl(tensor, data_type: nil, name: nil)
|
@@ -786,11 +830,16 @@ module TensorStream
|
|
786
830
|
end
|
787
831
|
|
788
832
|
def allocate_narray_for_type(data_type, narray_size)
|
789
|
-
|
833
|
+
case data_type
|
834
|
+
when :float, :float32
|
790
835
|
NArray.sfloat(narray_size)
|
791
|
-
|
836
|
+
when :float64
|
837
|
+
NArray.float(narray_size)
|
838
|
+
when :int, :int32, :int64
|
792
839
|
NArray.int(narray_size)
|
793
|
-
|
840
|
+
when :int16
|
841
|
+
NArray.sint(narray_size)
|
842
|
+
when :boolean
|
794
843
|
NArray.int(narray_size)
|
795
844
|
else
|
796
845
|
raise "unsupported type #{data_type}"
|
@@ -798,7 +847,7 @@ module TensorStream
|
|
798
847
|
end
|
799
848
|
|
800
849
|
def _create_result_buffer(data_type, shape, name)
|
801
|
-
@context[:_cache]["_result_#{name}_#{shape.join('_')}"] ||= begin
|
850
|
+
@context[:_cache][:_cl_buffers]["_result_#{name}_#{shape.join('_')}"] ||= begin
|
802
851
|
size = shape.empty? ? 1 : shape.reduce(:*)
|
803
852
|
buffer = allocate_narray_for_type(data_type, size)
|
804
853
|
cl_buffer = _opencl_context.create_buffer(buffer.size * buffer.element_size)
|
@@ -843,7 +892,8 @@ module TensorStream
|
|
843
892
|
input = complete_eval(a, child_context)
|
844
893
|
axis = read_final_result(complete_eval(b, child_context))
|
845
894
|
if axis.nil?
|
846
|
-
|
895
|
+
red = input.buffer.send(func)
|
896
|
+
convert_to_opencl(red, [], data_type: tensor.data_type, name: tensor.name)
|
847
897
|
else
|
848
898
|
return input if input.shape.empty?
|
849
899
|
value = input.buffer.reshape(*input.shape.reverse)
|