tensor_stream 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.circleci/config.yml +7 -7
- data/CHANGELOG.md +13 -0
- data/Dockerfile +25 -0
- data/Rakefile +6 -0
- data/benchmark/benchmark.rb +16 -57
- data/benchmark_intel.txt +21 -0
- data/benchmark_nvidia.txt +33 -0
- data/lib/tensor_stream.rb +4 -173
- data/lib/tensor_stream/debugging/debugging.rb +20 -0
- data/lib/tensor_stream/evaluator/kernels/abs.cl +9 -5
- data/lib/tensor_stream/evaluator/kernels/add.cl +2 -4
- data/lib/tensor_stream/evaluator/kernels/argmax.cl +2 -9
- data/lib/tensor_stream/evaluator/kernels/argmin.cl +2 -9
- data/lib/tensor_stream/evaluator/kernels/cast.cl +3 -8
- data/lib/tensor_stream/evaluator/kernels/cond.cl.erb +1 -1
- data/lib/tensor_stream/evaluator/kernels/cos.cl +2 -1
- data/lib/tensor_stream/evaluator/kernels/div.cl.erb +2 -4
- data/lib/tensor_stream/evaluator/kernels/exp.cl +2 -1
- data/lib/tensor_stream/evaluator/kernels/gemm.cl +8 -39
- data/lib/tensor_stream/evaluator/kernels/log.cl +2 -1
- data/lib/tensor_stream/evaluator/kernels/log1p.cl +2 -1
- data/lib/tensor_stream/evaluator/kernels/max.cl +4 -49
- data/lib/tensor_stream/evaluator/kernels/mul.cl +2 -4
- data/lib/tensor_stream/evaluator/kernels/negate.cl +2 -9
- data/lib/tensor_stream/evaluator/kernels/pow.cl +4 -88
- data/lib/tensor_stream/evaluator/kernels/reciprocal.cl +2 -9
- data/lib/tensor_stream/evaluator/kernels/round.cl +2 -1
- data/lib/tensor_stream/evaluator/kernels/sigmoid.cl +2 -1
- data/lib/tensor_stream/evaluator/kernels/sigmoid_grad.cl +6 -5
- data/lib/tensor_stream/evaluator/kernels/sign.cl +12 -14
- data/lib/tensor_stream/evaluator/kernels/sin.cl +2 -1
- data/lib/tensor_stream/evaluator/kernels/softmax.cl +26 -0
- data/lib/tensor_stream/evaluator/kernels/softmax_grad.cl +46 -0
- data/lib/tensor_stream/evaluator/kernels/sqrt.cl +2 -1
- data/lib/tensor_stream/evaluator/kernels/square.cl +2 -8
- data/lib/tensor_stream/evaluator/kernels/sub.cl +2 -4
- data/lib/tensor_stream/evaluator/kernels/tan.cl +2 -1
- data/lib/tensor_stream/evaluator/kernels/tanh.cl +2 -1
- data/lib/tensor_stream/evaluator/kernels/tanh_grad.cl +2 -1
- data/lib/tensor_stream/evaluator/kernels/where.cl +2 -9
- data/lib/tensor_stream/evaluator/opencl_evaluator.rb +108 -58
- data/lib/tensor_stream/evaluator/opencl_template_helper.rb +40 -5
- data/lib/tensor_stream/evaluator/operation_helpers/array_ops_helper.rb +35 -0
- data/lib/tensor_stream/evaluator/ruby_evaluator.rb +30 -9
- data/lib/tensor_stream/graph_serializers/graphml.rb +1 -1
- data/lib/tensor_stream/graph_serializers/pbtext.rb +4 -0
- data/lib/tensor_stream/math_gradients.rb +6 -5
- data/lib/tensor_stream/nn/nn_ops.rb +18 -2
- data/lib/tensor_stream/ops.rb +237 -44
- data/lib/tensor_stream/tensor.rb +16 -2
- data/lib/tensor_stream/utils.rb +205 -0
- data/lib/tensor_stream/variable.rb +2 -1
- data/lib/tensor_stream/version.rb +1 -1
- data/samples/error.graphml +2755 -0
- data/{test_samples → samples}/iris.rb +18 -24
- data/samples/logistic_regression.rb +0 -1
- data/test_samples/raw_neural_net_sample.rb +80 -23
- metadata +11 -3
data/lib/tensor_stream/evaluator/kernels/round.cl

```diff
@@ -1,4 +1,5 @@
-
+% c_dtype = dtype_to_c_type(dtype)
+__kernel void round_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)
```
data/lib/tensor_stream/evaluator/kernels/sigmoid.cl

```diff
@@ -1,5 +1,6 @@
+% c_dtype = dtype_to_c_type(dtype)
 
-__kernel void
+__kernel void sigmoid_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)
```
data/lib/tensor_stream/evaluator/kernels/sigmoid_grad.cl

```diff
@@ -1,14 +1,15 @@
+% c_dtype = dtype_to_c_type(dtype)
 
-float sigmoid(
+float sigmoid(<%= c_dtype %> x) {
     return 1.0f/(1.0f + exp(-x));
 }
 
-float sigmoid_grad(
+float sigmoid_grad(<%= c_dtype %> x, <%= c_dtype %> g) {
     return g * sigmoid(x) * ( 1.0f - sigmoid(x));
 }
 
 // same dimension add floating point op
-__kernel void
+__kernel void sigmoid_grad_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -17,7 +18,7 @@ float sigmoid_grad(float x, float g) {
 }
 
 // 1D + Scalar floating point add op
-__kernel void
+__kernel void sigmoid_grad_c_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)
@@ -30,7 +31,7 @@ float sigmoid_grad(float x, float g) {
 }
 
 // 1D + Scalar floating point add op broadcast
-__kernel void
+__kernel void sigmoid_grad_b_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)
```
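For reference, the sigmoid_grad kernels above evaluate the same elementwise expression as this plain-Ruby sketch (illustrative only, not part of the gem):

```ruby
# grad = g * sigmoid(x) * (1 - sigmoid(x)), applied per element.
def sigmoid(x)
  1.0 / (1.0 + Math.exp(-x))
end

def sigmoid_grad(x, g)
  g * sigmoid(x) * (1.0 - sigmoid(x))
end

puts sigmoid_grad(0.5, 1.0) # => ~0.235
```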
data/lib/tensor_stream/evaluator/kernels/sign.cl

```diff
@@ -1,23 +1,21 @@
-
+% c_dtype = dtype_to_c_type(dtype)
+
+__kernel void sign_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
+    <%= c_dtype %> value = A[globalRow * N + globalCol];
+% if is_floating_point?(dtype)
     if (isnan(value) || value == 0.0f) {
       C[globalRow * N + globalCol] = 0.0;
     } else {
       C[globalRow * N + globalCol] = value < 0 ? -1.0 : 1.0;
     }
-
-
-
-
-
-
-
-    if (isnan(value) || value == 0) {
-      C[globalRow * N + globalCol] = 0;
-    } else {
-      C[globalRow * N + globalCol] = value < 0 ? -1 : 1;
-    }
+% else
+    if (value == 0) {
+      C[globalRow * N + globalCol] = 0;
+    } else {
+      C[globalRow * N + globalCol] = value < 0 ? -1 : 1;
+    }
+% end
 }
```
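The sign kernel is now generated per dtype from one template. Its per-element behaviour for floating-point types matches this plain-Ruby sketch (the helper name is illustrative):

```ruby
# Floating-point branch: NaN and 0.0 map to 0, everything else to -1 or 1.
# Integer dtypes get the same logic without the NaN check.
def sign_fp(value)
  return 0.0 if value.nan? || value.zero?
  value.negative? ? -1.0 : 1.0
end

p [-2.5, 0.0, 3.1, Float::NAN].map { |v| sign_fp(v) } # => [-1.0, 0.0, 1.0, 0.0]
```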
data/lib/tensor_stream/evaluator/kernels/sin.cl

```diff
@@ -1,5 +1,6 @@
 
-
+% c_dtype = dtype_to_c_type(dtype)
+__kernel void sin_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)
```
data/lib/tensor_stream/evaluator/kernels/softmax.cl (new file)

```diff
@@ -0,0 +1,26 @@
+// First naive implementation
+% c_dtype = dtype_to_c_type(dtype)
+__kernel void softmax_<%= dtype %>(const int N,
+                                   const __global <%= c_dtype %>* A,
+                                   __global <%= c_dtype %>* C) {
+
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+
+    // Compute a single element (loop over K)
+    <%= c_dtype %> acc = 0.0f;
+    <%= c_dtype %> max = <%= min_value_for(dtype) %>;
+
+    for (int k=0; k<N; k++) {
+      max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
+    }
+
+    for (int k=0; k<N; k++) {
+      acc += exp(A[globalRow*N + k] - max);
+    }
+
+    // Store the result
+    for (int k=0; k < N; k++) {
+      C[globalRow*N + k] = exp(A[globalRow*N + k] - max) / acc;
+    }
+}
```
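Per output row the kernel above computes a max-shifted softmax; a minimal plain-Ruby sketch of the same math (not gem code):

```ruby
# Subtract the row maximum before exponentiating (for numerical stability),
# then normalize by the sum of exponentials.
def softmax(row)
  max = row.max
  exps = row.map { |v| Math.exp(v - max) }
  sum = exps.sum
  exps.map { |e| e / sum }
end

p softmax([1.0, 2.0, 3.0]) # => [0.0900..., 0.2447..., 0.6652...]
```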
data/lib/tensor_stream/evaluator/kernels/softmax_grad.cl (new file)

```diff
@@ -0,0 +1,46 @@
+% c_dtype = dtype_to_c_type(dtype)
+__kernel void softmax_grad_<%= dtype %>(const int N,
+                                        const __global <%= c_dtype %>* A,
+                                        const __global <%= c_dtype %>* G,
+                                        __global <%= c_dtype %>* C) {
+
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+
+    // Compute a single element (loop over K)
+    float acc = 0.0f;
+    float max = FLT_MIN;
+    float row[<%= size %>];
+    float grads[<%= size %>][<%= size %>];
+
+    for (int k=0; k<N; k++) {
+      max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
+    }
+
+    for (int k=0; k<N; k++) {
+      acc += exp(A[globalRow*N + k] - max);
+    }
+
+    // Store the result
+    for (int k=0; k < N; k++) {
+      row[k] = exp(A[globalRow*N + k] - max) / acc;
+    }
+
+    for (int a=0; a < N; a++) {
+      for(int b=0; b < N; b++) {
+        if (a != b) {
+          grads[a][b] = -row[a] * row[b];
+        } else {
+          grads[a][b] = row[a] * (1.0f - row[a]);
+        }
+      }
+    }
+
+    for (int k=0; k < N; k++) {
+      float total_grad = 0.0f;
+      for (int a = 0; a < N; a++) {
+        total_grad += grads[a][k] * G[globalRow*N + a];
+      }
+      C[globalRow*N + k] = total_grad;
+    }
+}
```
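The gradient kernel builds the softmax Jacobian for each row and contracts it with the incoming gradient G. Equivalently, as a plain-Ruby sketch (illustrative only):

```ruby
# output[k] = sum over a of J[a][k] * g[a], where J[a][a] = s[a] * (1 - s[a])
# and J[a][k] = -s[a] * s[k] for a != k, with s the row softmax.
def softmax_grad(row, g)
  max = row.max
  exps = row.map { |v| Math.exp(v - max) }
  s = exps.map { |e| e / exps.sum }
  n = row.size
  (0...n).map do |k|
    (0...n).sum { |a| (a == k ? s[a] * (1.0 - s[a]) : -s[a] * s[k]) * g[a] }
  end
end

p softmax_grad([1.0, 2.0, 3.0], [1.0, 0.0, 0.0])
```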
data/lib/tensor_stream/evaluator/kernels/sqrt.cl

```diff
@@ -1,5 +1,6 @@
+% c_dtype = dtype_to_c_type(dtype)
 
-__kernel void
+__kernel void sqrt_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)
```
data/lib/tensor_stream/evaluator/kernels/square.cl

```diff
@@ -1,12 +1,6 @@
-
-    // Get the index of the current element to be processed
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] * A[globalRow * N + globalCol];
-}
+% c_dtype = dtype_to_c_type(dtype)
 
-__kernel void
+__kernel void square_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)
```
data/lib/tensor_stream/evaluator/kernels/sub.cl

```diff
@@ -1,5 +1,3 @@
-% %w[fp int].product(%w[sub]).each do |dtype, fname|
 % c_dtype = dtype_to_c_type(dtype)
-% op = operator_to_c(
-<%= render 'operand.cl', c_dtype: c_dtype, op: op, fname:
-% end
+% op = operator_to_c('sub')
+<%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'sub', dtype: dtype, result_t: c_dtype %>
```
data/lib/tensor_stream/evaluator/kernels/tan.cl

```diff
@@ -1,4 +1,5 @@
-
+% c_dtype = dtype_to_c_type(dtype)
+__kernel void tan_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)
```
data/lib/tensor_stream/evaluator/kernels/tanh.cl

```diff
@@ -1,4 +1,5 @@
-
+% c_dtype = dtype_to_c_type(dtype)
+__kernel void tanh_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)
```
data/lib/tensor_stream/evaluator/kernels/tanh_grad.cl

```diff
@@ -1,4 +1,5 @@
-
+% c_dtype = dtype_to_c_type(dtype)
+__kernel void tanh_grad_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)
```
data/lib/tensor_stream/evaluator/kernels/where.cl

```diff
@@ -1,12 +1,5 @@
-
-
-    const int globalRow = get_global_id(0); // Row ID of C (0..M)
-    const int globalCol = get_global_id(1); // Col ID of C (0..N)
-
-    C[globalRow * N + globalCol] = PRED[globalRow * N + globalCol] ? A[globalRow * N + globalCol] : B[globalRow * N + globalCol];
-}
-
-__kernel void where_int(const int M, const int N, __global const int *PRED, __global const int *A, __global const int *B, __global int *C) {
+% c_dtype = dtype_to_c_type(dtype)
+__kernel void where_<%= dtype %>(const int M, const int N, __global const int *PRED, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
     // Get the index of the current element to be processed
     const int globalRow = get_global_id(0); // Row ID of C (0..M)
     const int globalCol = get_global_id(1); // Col ID of C (0..N)
```
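where.cl is likewise collapsed into a single dtype-templated kernel; the per-element selection it performs matches this plain-Ruby sketch (illustrative only):

```ruby
# Take from a where the predicate is non-zero, otherwise from b.
def where(pred, a, b)
  pred.zip(a, b).map { |p, x, y| p != 0 ? x : y }
end

p where([1, 0, 1], [10, 20, 30], [-1, -2, -3]) # => [10, -2, 30]
```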
data/lib/tensor_stream/evaluator/opencl_evaluator.rb

```diff
@@ -41,20 +41,18 @@ module TensorStream
       @preferred_device = preferred_device
       @retain = context[:retain] || []
       @thread_pool = thread_pool || Concurrent::ImmediateExecutor.new
-
+      @context[:_cache][:_cl_buffers] ||= {} if @context[:_cache]
       @context[:compute_history] = [] if log_intermediates
     end
 
     # opencl evaluator main entrypoint
     def run(tensor, execution_context)
       _create_opencl_context
-
-
+      create_command_queue
       read_final_result(complete_eval(tensor, execution_context))
     end
 
     def complete_eval(tensor, context)
-      create_command_queue
       buffer = _run(tensor, context)
       if buffer.is_a?(Array)
         buffer = buffer.collect do |b|
@@ -66,7 +64,6 @@ module TensorStream
         return buffer if buffer.nil? || buffer.buffer.size.zero?
         _opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: [buffer.op].compact)
       end
-
       _opencl_queue.finish
       buffer
     end
```
```diff
@@ -91,15 +88,18 @@ module TensorStream
         @preferred_device
       else
         device, _score, _platform, _index = choose_best_device
+        # puts "using #{device.name}"
         device
       end
     end
+      @context[:cl_device] = opencl_device
       @context[:_cache][:_opencl_context] ||= OpenCL.create_context(opencl_device)
     end
 
     def choose_best_device
       @best_device ||= begin
         devices = OpenCL.platforms.flat_map do |p|
+
           p.devices.select { |d| d.available > 0 }.each_with_index.collect do |d, index|
             score = 0
             if d.type.to_s == 'CPU'
@@ -108,13 +108,18 @@ module TensorStream
               score += 4
             end
 
+            if d.platform.name == 'NVIDIA CUDA'
+              score += 1000
+            end
+
             score += d.max_compute_units
+            score += d.max_clock_frequency
 
             [d, score, p.name, index]
           end
         end
+        devices.sort { |a| a[1] }.reverse.first
       end
-      devices.max { |a| a[1] }
     end
 
     def create_command_queue
```
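The device picker now scores every available OpenCL device: a base weight by device type, a large bonus for the 'NVIDIA CUDA' platform, plus compute units and clock frequency. A rough Ruby sketch with made-up device data (the CPU/GPU base weights are assumptions, since those lines fall outside this hunk):

```ruby
def score_device(dev)
  score = dev[:type] == 'CPU' ? 1 : 4 # assumed base weights
  score += 1000 if dev[:platform] == 'NVIDIA CUDA'
  score + dev[:max_compute_units] + dev[:max_clock_frequency]
end

devices = [
  { name: 'Intel CPU', type: 'CPU', platform: 'Intel OpenCL', max_compute_units: 8,  max_clock_frequency: 2600 },
  { name: 'GTX 1080',  type: 'GPU', platform: 'NVIDIA CUDA',  max_compute_units: 20, max_clock_frequency: 1733 }
]

best = devices.max_by { |d| score_device(d) }
puts best[:name] # => "GTX 1080"
```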
```diff
@@ -137,11 +142,13 @@ module TensorStream
       File.join(File.dirname(__FILE__), 'kernels', "#{kernel}.#{extension}")
     end
 
-    def _cl_program(kernel)
-
+    def _cl_program(kernel, args = {})
+      suffix = args.collect { |k,v| "#{k}.#{v}"}.join('.')
+      @context[:_cache]["_opencl_kernel_#{kernel}.#{suffix}"] ||= begin
         filename = %w[cl.erb cl].map { |ext| cl_template_path(kernel, ext) }.find { |n| File.exist?(n) }
         source = File.read(filename)
-        source = OpenclTemplateHelper.new(source).generate
+        source = OpenclTemplateHelper.new(source).generate(args)
+        File.write("/tmp/#{kernel}.#{suffix}.cl", source)
         program = _opencl_context.create_program_with_source(source)
         program.build
       rescue OpenCL::Error::BUILD_PROGRAM_FAILURE => e
```
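_cl_program now compiles one program per kernel-and-template-argument combination and memoizes it under a composite cache key. The key shape can be sketched like this (helper name is illustrative):

```ruby
def kernel_cache_key(kernel, args = {})
  suffix = args.collect { |k, v| "#{k}.#{v}" }.join('.')
  "_opencl_kernel_#{kernel}.#{suffix}"
end

puts kernel_cache_key('gemm', dtype: :float32)
# => "_opencl_kernel_gemm.dtype.float32"
puts kernel_cache_key('softmax_grad', dtype: :float32, size: 10)
# => "_opencl_kernel_softmax_grad.dtype.float32.size.10"
```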
```diff
@@ -152,7 +159,9 @@ module TensorStream
 
     def _run(tensor, execution_context)
       return tensor if tensor.is_a?(OpenCLBuffer)
-
+      if tensor.is_a?(Array) && tensor.size > 0 && tensor[0].is_a?(Tensor)
+        return tensor.map { |t| _run(t, execution_context) }
+      end
 
       return tensor if retain.include?(tensor) # if var is in retain don't eval to value
 
@@ -180,10 +189,11 @@ module TensorStream
 
     def eval_operation(tensor, child_context)
       return @context[tensor.name] if @context.key?(tensor.name)
-
+      cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}"
+      return @context[cache_key] if @context.key?(cache_key)
       a = resolve_placeholder(tensor.items[0], child_context) if tensor.items && tensor.items[0]
       b = resolve_placeholder(tensor.items[1], child_context) if tensor.items && tensor.items[1]
-
+      # puts tensor.name
       case tensor.operation
       when :concat
         input_a = read_final_result(complete_eval(a, child_context))
@@ -238,7 +248,6 @@ module TensorStream
       when :assign_add
         a = _run(a, child_context)
         b = _run(b, child_context)
-
         value = execute_2_operand_func('add', tensor, a, b, child_context)
         assign_var(tensor, value, child_context)
       when :assign_sub
@@ -290,8 +299,8 @@ module TensorStream
         raise "#{tensor.items[1].name} rank must be greater than 1" if b.shape.size < 2
         raise "incompatible shape sizes for matrix multiplication (#{a.shape[1]} != #{b.shape[0]}) #{a.shape} vs #{b.shape}" if k != v
 
-        dtype =
-        a, b =
+        dtype = tensor.data_type
+        a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
         output_buffer = _create_result_buffer(a.data_type, result_shape, tensor.name)
 
         cl_m = OpenCL::Int1.new(m)
@@ -301,7 +310,7 @@ module TensorStream
         transpose_a = OpenCL::Int1.new(tensor.options[:transpose_a] ? 1 : 0)
         transpose_b = OpenCL::Int1.new(tensor.options[:transpose_b] ? 1 : 0)
 
-        output_buffer.op = _cl_program('gemm').send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer)
+        output_buffer.op = _cl_program('gemm', dtype: dtype).send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer)
         output_buffer
       when :mul
         execute_2_operand_func('mul', tensor, a, b, child_context)
@@ -311,14 +320,12 @@ module TensorStream
         a = _run(a, child_context)
         if a.data_type != tensor.data_type
           buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
-          s_dtype = TensorStream::Ops::FLOATING_POINT_TYPES.include?(a.data_type) ? 'fp' : 'int'
-          t_dtype = TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type) ? 'fp' : 'int'
          m, n = a.shape
          cl_m = OpenCL::Int1.new(m || 1)
          cl_n = OpenCL::Int1.new(n || 1)
          work_group = [m || 1, n || 1]
 
-          buffer.op = _cl_program("cast").
+          buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: tensor.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer)
           buffer
         else
           a
```
```diff
@@ -355,6 +362,34 @@ module TensorStream
         execute_func('log1p', tensor, a, child_context)
       when :round
         execute_func('round', tensor, a, child_context)
+      when :softmax
+        a = _run(a, child_context)
+        event_wait_list = [a.op].compact
+        dtype = tensor.data_type
+        output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
+
+        m, n = a.shape
+        work_group = [m]
+        n = m if n.nil?
+        cl_n = OpenCL::Int1.new(n || 1)
+
+        event = _cl_program("softmax", dtype: dtype).send(:"softmax_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+        output_buffer.op = event
+        output_buffer
+      when :softmax_grad
+        a = _run(a, child_context)
+        grad = _run(b, child_context)
+        event_wait_list = [a.op].compact
+        dtype = tensor.data_type
+        output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
+
+        m, n = a.shape
+        work_group = [m]
+        n = m if n.nil?
+        cl_n = OpenCL::Int1.new(n || 1)
+        event = _cl_program("softmax_grad", dtype: dtype, size: n).send(:"softmax_grad_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, grad.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+        output_buffer.op = event
+        output_buffer
       when :sigmoid_grad
         execute_2_operand_func('sigmoid_grad', tensor, a, b, child_context)
       when :truncate
```
```diff
@@ -381,6 +416,14 @@ module TensorStream
           end
         end
       end
+      when :check_numerics
+        a = complete_eval(a, child_context)
+        name = tensor.options[:name]
+
+        a.buffer.each do |item|
+          raise "#{name} Invalid Argument" if item.nan? || item.infinite?
+        end
+        a
       when :zeros, :ones, :zeros_like, :ones_like
         shape = if %i[zeros_like ones_like].include?(tensor.operation)
                   _run(a, child_context).shape
@@ -551,6 +594,7 @@ module TensorStream
       else
         raise "unknown op #{tensor.operation}"
       end.tap do |result|
+        # puts "#{tensor.to_math(true,1)} = #{read_final_result(complete_eval(result, child_context))}"
         if tensor.breakpoint
           a = read_final_result(complete_eval(a, child_context))
           b = read_final_result(complete_eval(b, child_context))
@@ -568,11 +612,13 @@ module TensorStream
             value: result
           }
         end
+        @context[:_cache][cache_key] = @context[cache_key] if tensor.is_const
         @context[tensor.name] = result
       end
     rescue EvaluatorExcecutionException => e
       raise e
     rescue StandardError => e
+      _opencl_queue.finish # dump queue
       puts e.message
       puts e.backtrace.join("\n")
 
@@ -612,8 +658,12 @@ module TensorStream
     def assign_var(tensor, b, child_context)
       assign = tensor.items[0] || tensor
       buffer = complete_eval(b, child_context)
+
       if assign.buffer
-
+        buffer = type_cast(buffer, assign.data_type, name: "#{tensor.name}/cast_#{tensor.name}_#{tensor.data_type}")
+        if assign.buffer.cl_buffer != buffer.cl_buffer
+          assign.buffer.op = _opencl_queue.enqueue_copy_buffer(buffer.cl_buffer, assign.buffer.cl_buffer, event_wait_list: [buffer.op, assign.buffer.op])
+        end
       else
         assign.buffer = convert_to_opencl(read_final_result(buffer), buffer.shape, data_type: tensor.data_type, name: tensor.name)
       end
@@ -624,8 +674,8 @@ module TensorStream
     def execute_2_operand_func(op_name, tensor, input_a, input_b, child_context, prog_name = nil)
       a = _run(input_a, child_context)
       b = _run(input_b, child_context)
-      a, b =
-      dtype =
+      a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
+      dtype = tensor.data_type
       result_shape = TensorShape.infer_shape(a.shape, b.shape)
 
       output_buffer = _create_result_buffer(tensor.data_type, result_shape, tensor.name)
@@ -646,9 +696,9 @@ module TensorStream
          else
            raise "rank > 2 not supported!"
          end
-        _cl_program("#{prog_name || op_name}").send(:"#{prog}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, cl_m_b, cl_n_b, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+        _cl_program("#{prog_name || op_name}", dtype: dtype).send(:"#{prog}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, cl_m_b, cl_n_b, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
       else
-        _cl_program("#{prog_name || op_name}").send(:"#{prog}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+        _cl_program("#{prog_name || op_name}", dtype: dtype).send(:"#{prog}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
       end
 
       output_buffer.op = event
@@ -660,8 +710,8 @@ module TensorStream
       a = _run(input_a, child_context)
       b = _run(input_b, child_context)
 
-      a, b =
-      dtype =
+      a, b = auto_type_cast(a, b, name: "#{tensor.name}/cast_#{a.name}_#{b.data_type}")
+      dtype = tensor.data_type
 
       output_buffer = _create_result_buffer(tensor.data_type, p.shape, tensor.name)
 
@@ -671,14 +721,14 @@ module TensorStream
       cl_n = OpenCL::Int1.new(n || 1)
 
       event_wait_list = [a.op, b.op, p.op].compact # add dependency wait list
-      output_buffer.op = _cl_program("#{op_name}").send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, p.cl_buffer, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+      output_buffer.op = _cl_program("#{op_name}", dtype: dtype).send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, p.cl_buffer, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
       output_buffer
     end
 
     def execute_func(op_name, tensor, a, child_context)
       a = _run(a, child_context)
-      event_wait_list = [a.op].compact
-      dtype =
+      event_wait_list = [a.op].compact
+      dtype = tensor.data_type
       output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
 
       m, n = a.shape
@@ -686,43 +736,37 @@ module TensorStream
       cl_m = OpenCL::Int1.new(m || 1)
       cl_n = OpenCL::Int1.new(n || 1)
 
-      event = _cl_program("#{op_name}").send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+      event = _cl_program("#{op_name}", dtype: dtype).send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
       output_buffer.op = event
       output_buffer
     end
 
-    def
+    def auto_type_cast(a, b, name: nil)
       return [a, b] if a.data_type == b.data_type
       m, n = b.shape
       work_group = [m || 1, n || 1]
-
-
-      if TensorStream::Ops::INTEGER_TYPES.include?(b.data_type.to_sym)
-        cl_m = OpenCL::Int1.new(m || 1)
-        cl_n = OpenCL::Int1.new(n || 1)
+      event_wait_list = [b.op].compact
+      buffer = _create_result_buffer(b.data_type, b.shape, name)
 
-
-
-        end
-      elsif TensorStream::Ops::INTEGER_TYPES.include?(a.data_type.to_sym)
-        if TensorStream::Ops::FLOATING_POINT_TYPES.include?(b.data_type.to_sym)
-          cl_m = OpenCL::Int1.new(m || 1)
-          cl_n = OpenCL::Int1.new(n || 1)
-          _cl_program("cast").cast_fp_int(_opencl_queue, work_group, cl_m, cl_n, b.cl_buffer, buffer.cl_buffer)
-          return [a, buffer]
-        end
-      end
+      cl_m = OpenCL::Int1.new(m || 1)
+      cl_n = OpenCL::Int1.new(n || 1)
 
-
+      buffer.op = _cl_program("cast", source_dt: a.data_type, target_dt: b.data_type).cast(_opencl_queue, work_group, cl_m, cl_n, b.cl_buffer, buffer.cl_buffer, event_wait_list: event_wait_list)
+      [a, buffer]
     end
 
-    def
-
+    def type_cast(source, data_type, name: nil)
+      return source if source.data_type == data_type
+      m, n = source.shape
+      work_group = [m || 1, n || 1]
+      event_wait_list = [source.op].compact
+      buffer = _create_result_buffer(data_type, source.shape, name)
 
-
+      cl_m = OpenCL::Int1.new(m || 1)
+      cl_n = OpenCL::Int1.new(n || 1)
 
-
-
+      buffer.op = _cl_program("cast", source_dt: source.data_type, target_dt: data_type).cast(_opencl_queue, work_group, cl_m, cl_n, source.cl_buffer, buffer.cl_buffer, event_wait_list: event_wait_list)
+      buffer
     end
 
     def wrap_opencl(tensor, data_type: nil, name: nil)
```
```diff
@@ -786,11 +830,16 @@ module TensorStream
     end
 
     def allocate_narray_for_type(data_type, narray_size)
-
+      case data_type
+      when :float, :float32
         NArray.sfloat(narray_size)
-
+      when :float64
+        NArray.float(narray_size)
+      when :int, :int32, :int64
         NArray.int(narray_size)
-
+      when :int16
+        NArray.sint(narray_size)
+      when :boolean
         NArray.int(narray_size)
       else
         raise "unsupported type #{data_type}"
```
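allocate_narray_for_type now maps each supported dtype onto a specific NArray constructor; a quick sanity check of the element sizes involved (requires the narray gem):

```ruby
require 'narray'

puts NArray.sfloat(4).element_size # => 4 (single precision, :float32)
puts NArray.float(4).element_size  # => 8 (double precision, :float64)
puts NArray.int(4).element_size    # => 4 (32-bit integer, :int32)
puts NArray.sint(4).element_size   # => 2 (16-bit integer, :int16)
```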
```diff
@@ -798,7 +847,7 @@ module TensorStream
     end
 
     def _create_result_buffer(data_type, shape, name)
-      @context[:_cache]["_result_#{name}_#{shape.join('_')}"] ||= begin
+      @context[:_cache][:_cl_buffers]["_result_#{name}_#{shape.join('_')}"] ||= begin
         size = shape.empty? ? 1 : shape.reduce(:*)
         buffer = allocate_narray_for_type(data_type, size)
         cl_buffer = _opencl_context.create_buffer(buffer.size * buffer.element_size)
@@ -843,7 +892,8 @@ module TensorStream
       input = complete_eval(a, child_context)
       axis = read_final_result(complete_eval(b, child_context))
       if axis.nil?
-
+        red = input.buffer.send(func)
+        convert_to_opencl(red, [], data_type: tensor.data_type, name: tensor.name)
       else
         return input if input.shape.empty?
         value = input.buffer.reshape(*input.shape.reverse)
```
|