tensor_stream 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.circleci/config.yml +2 -1
- data/CHANGELOG.md +5 -0
- data/README.md +28 -1
- data/benchmark/benchmark.rb +129 -0
- data/lib/tensor_stream.rb +7 -4
- data/lib/tensor_stream/evaluator/buffer.rb +10 -0
- data/lib/tensor_stream/evaluator/evaluator.rb +1 -0
- data/lib/tensor_stream/evaluator/kernels/_bool_operand.cl +45 -0
- data/lib/tensor_stream/evaluator/kernels/_operand.cl +45 -0
- data/lib/tensor_stream/evaluator/kernels/abs.cl +16 -0
- data/lib/tensor_stream/evaluator/kernels/add.cl +5 -0
- data/lib/tensor_stream/evaluator/kernels/argmax.cl +15 -0
- data/lib/tensor_stream/evaluator/kernels/argmin.cl +15 -0
- data/lib/tensor_stream/evaluator/kernels/cast.cl +15 -0
- data/lib/tensor_stream/evaluator/kernels/cond.cl.erb +5 -0
- data/lib/tensor_stream/evaluator/kernels/cos.cl +7 -0
- data/lib/tensor_stream/evaluator/kernels/div.cl.erb +5 -0
- data/lib/tensor_stream/evaluator/kernels/exp.cl +7 -0
- data/lib/tensor_stream/evaluator/kernels/gemm.cl +63 -0
- data/lib/tensor_stream/evaluator/kernels/log.cl +7 -0
- data/lib/tensor_stream/evaluator/kernels/log1p.cl +7 -0
- data/lib/tensor_stream/evaluator/kernels/max.cl +91 -0
- data/lib/tensor_stream/evaluator/kernels/mul.cl +5 -0
- data/lib/tensor_stream/evaluator/kernels/negate.cl +15 -0
- data/lib/tensor_stream/evaluator/kernels/pow.cl +130 -0
- data/lib/tensor_stream/evaluator/kernels/reciprocal.cl +15 -0
- data/lib/tensor_stream/evaluator/kernels/round.cl +7 -0
- data/lib/tensor_stream/evaluator/kernels/sigmoid.cl +8 -0
- data/lib/tensor_stream/evaluator/kernels/sigmoid_grad.cl +54 -0
- data/lib/tensor_stream/evaluator/kernels/sign.cl +23 -0
- data/lib/tensor_stream/evaluator/kernels/sin.cl +8 -0
- data/lib/tensor_stream/evaluator/kernels/sqrt.cl +8 -0
- data/lib/tensor_stream/evaluator/kernels/square.cl +15 -0
- data/lib/tensor_stream/evaluator/kernels/sub.cl +5 -0
- data/lib/tensor_stream/evaluator/kernels/tan.cl +7 -0
- data/lib/tensor_stream/evaluator/kernels/tanh.cl +7 -0
- data/lib/tensor_stream/evaluator/kernels/tanh_grad.cl +6 -0
- data/lib/tensor_stream/evaluator/kernels/where.cl +15 -0
- data/lib/tensor_stream/evaluator/opencl_buffer.rb +30 -0
- data/lib/tensor_stream/evaluator/opencl_evaluator.rb +1095 -0
- data/lib/tensor_stream/evaluator/opencl_template_helper.rb +58 -0
- data/lib/tensor_stream/evaluator/operation_helpers/array_ops_helper.rb +27 -0
- data/lib/tensor_stream/evaluator/ruby_evaluator.rb +20 -31
- data/lib/tensor_stream/graph.rb +4 -2
- data/lib/tensor_stream/math_gradients.rb +3 -0
- data/lib/tensor_stream/operation.rb +29 -2
- data/lib/tensor_stream/ops.rb +14 -2
- data/lib/tensor_stream/placeholder.rb +1 -1
- data/lib/tensor_stream/session.rb +10 -3
- data/lib/tensor_stream/tensor_shape.rb +1 -1
- data/lib/tensor_stream/train/saver.rb +1 -1
- data/lib/tensor_stream/variable.rb +7 -1
- data/lib/tensor_stream/version.rb +1 -1
- data/samples/logistic_regression.rb +2 -1
- data/samples/nearest_neighbor.rb +54 -0
- data/tensor_stream.gemspec +3 -1
- metadata +107 -28
@@ -0,0 +1,23 @@
|
|
1
|
+
// Element-wise sign for float matrices: writes -1, 0 or 1 into C.
// NaN inputs are written as 0, the same as a zero input.
// One work-item per element: global id 0 selects the row, global id 1 the
// column of a row-major matrix with N columns. M is accepted but not read.
__kernel void sign_fp(const int M, const int N, __global const float *A, __global float *C) {
    const int globalRow = get_global_id(0); // Row ID of C (0..M)
    const int globalCol = get_global_id(1); // Col ID of C (0..N)
    const int index = globalRow * N + globalCol;
    const float value = A[index];

    // Single-precision literals throughout, so the kernel never relies on
    // double-precision constants.
    if (isnan(value) || value == 0.0f) {
        C[index] = 0.0f;
    } else {
        C[index] = value < 0.0f ? -1.0f : 1.0f;
    }
}
|
12
|
+
|
13
|
+
// Element-wise sign for int matrices: writes -1, 0 or 1 into C.
// One work-item per element: global id 0 selects the row, global id 1 the
// column of a row-major matrix with N columns. M is accepted but not read.
__kernel void sign_int(const int M, const int N, __global const int *A, __global int *C) {
    const int globalRow = get_global_id(0); // Row ID of C (0..M)
    const int globalCol = get_global_id(1); // Col ID of C (0..N)
    const int index = globalRow * N + globalCol;
    // Keep the value as an int: an integer can never be NaN, so no isnan()
    // test is needed and no int -> float conversion takes place.
    const int value = A[index];

    if (value == 0) {
        C[index] = 0;
    } else {
        C[index] = value < 0 ? -1 : 1;
    }
}
|
@@ -0,0 +1,8 @@
|
|
1
|
+
|
2
|
+
// Element-wise sine: C[i] = sin(A[i]).
// One work-item per element; global id 0 is the row, global id 1 the column
// of a row-major matrix with N columns. M is accepted but not read.
__kernel void sin_fp(const int M, const int N, __global const float *A, __global float *C) {
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int offset = row * N + col; // flat index of this work-item's element

    C[offset] = sin(A[offset]);
}
|
@@ -0,0 +1,8 @@
|
|
1
|
+
|
2
|
+
// Element-wise square root: C[i] = sqrt(A[i]).
// One work-item per element; global id 0 is the row, global id 1 the column
// of a row-major matrix with N columns. M is accepted but not read.
__kernel void sqrt_fp(const int M, const int N, __global const float *A, __global float *C) {
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int offset = row * N + col; // flat index of this work-item's element

    C[offset] = sqrt(A[offset]);
}
|
@@ -0,0 +1,15 @@
|
|
1
|
+
// Element-wise square for float matrices: C[i] = A[i] * A[i].
// One work-item per element; global id 0 is the row, global id 1 the column
// of a row-major matrix with N columns. M is accepted but not read.
__kernel void square_fp(const int M, const int N, __global const float *A, __global float *C) {
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int offset = row * N + col; // flat index of this work-item's element
    const float x = A[offset];

    C[offset] = x * x;
}
|
8
|
+
|
9
|
+
// Element-wise square for int matrices: C[i] = A[i] * A[i].
// One work-item per element; global id 0 is the row, global id 1 the column
// of a row-major matrix with N columns. M is accepted but not read.
__kernel void square_int(const int M, const int N, __global const int *A, __global int *C) {
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int offset = row * N + col; // flat index of this work-item's element
    const int x = A[offset];

    C[offset] = x * x;
}
|
@@ -0,0 +1,7 @@
|
|
1
|
+
// Element-wise tangent: C[i] = tan(A[i]).
// One work-item per element; global id 0 is the row, global id 1 the column
// of a row-major matrix with N columns. M is accepted but not read.
__kernel void tan_fp(const int M, const int N, __global const float *A, __global float *C) {
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int offset = row * N + col; // flat index of this work-item's element

    C[offset] = tan(A[offset]);
}
|
@@ -0,0 +1,7 @@
|
|
1
|
+
// Element-wise hyperbolic tangent: C[i] = tanh(A[i]).
// One work-item per element; global id 0 is the row, global id 1 the column
// of a row-major matrix with N columns. M is accepted but not read.
__kernel void tanh_fp(const int M, const int N, __global const float *A, __global float *C) {
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int offset = row * N + col; // flat index of this work-item's element

    C[offset] = tanh(A[offset]);
}
|
@@ -0,0 +1,6 @@
|
|
1
|
+
// Element-wise derivative of tanh: C[i] = 1 - tanh(A[i])^2.
// One work-item per element; global id 0 is the row, global id 1 the column
// of a row-major matrix with N columns. M is accepted but not read.
__kernel void tanh_grad_fp(const int M, const int N, __global const float *A, __global float *C) {
    const int globalRow = get_global_id(0); // Row ID of C (0..M)
    const int globalCol = get_global_id(1); // Col ID of C (0..N)
    const int index = globalRow * N + globalCol;
    // Evaluate tanh once and reuse it instead of calling it twice on the same input.
    const float t = tanh(A[index]);

    C[index] = 1 - t * t;
}
|
@@ -0,0 +1,15 @@
|
|
1
|
+
// Element-wise select for float matrices: C[i] = PRED[i] ? A[i] : B[i].
// PRED is read as int; any non-zero value selects A.
// One work-item per element; global id 0 is the row, global id 1 the column
// of a row-major matrix with N columns. M is accepted but not read.
__kernel void where_fp(const int M, const int N, __global const int *PRED, __global const float *A, __global const float *B, __global float *C) {
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int offset = row * N + col; // flat index of this work-item's element

    if (PRED[offset]) {
        C[offset] = A[offset];
    } else {
        C[offset] = B[offset];
    }
}
|
8
|
+
|
9
|
+
// Element-wise select for int matrices: C[i] = PRED[i] ? A[i] : B[i].
// PRED is read as int; any non-zero value selects A.
// One work-item per element; global id 0 is the row, global id 1 the column
// of a row-major matrix with N columns. M is accepted but not read.
__kernel void where_int(const int M, const int N, __global const int *PRED, __global const int *A, __global const int *B, __global int *C) {
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int offset = row * N + col; // flat index of this work-item's element

    if (PRED[offset]) {
        C[offset] = A[offset];
    } else {
        C[offset] = B[offset];
    }
}
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module TensorStream
  # Value object that carries a host-side buffer together with its OpenCL
  # counterpart, plus the data type, shape and (optionally) the pending
  # operation that produces it.
  class OpenCLBuffer < Buffer
    include ArrayOpsHelper

    attr_accessor :data_type, :shape, :buffer, :cl_buffer, :op

    # All arguments are stored as given; +op+ and +name+ are optional.
    def initialize(data_type:, shape:, buffer:, cl_buffer:, op: nil, name: nil)
      @name = name
      @op = op
      @cl_buffer = cl_buffer
      @buffer = buffer
      @shape = shape
      @data_type = data_type
    end

    # Converts the host-side buffer into plain Ruby data.
    #
    # Returns [] for an empty buffer, a single element when +shape+ is empty,
    # and otherwise the buffer reshaped with the reversed +shape+ and turned
    # into arrays. For :boolean data every element is mapped to +element != 0+.
    def to_ruby
      return [] if buffer.empty?

      boolean = data_type == :boolean

      if shape.empty?
        scalar = buffer[0]
        return boolean ? scalar != 0 : scalar
      end

      # NOTE(review): the dimensions are reversed before reshaping, presumably
      # because the underlying array type orders dimensions the other way
      # round - confirm against the buffer implementation.
      nested = buffer.reshape(*shape.reverse).to_a
      return nested unless boolean

      process_function_op(nested, ->(a, _b) { a != 0 })
    end
  end
end
|
@@ -0,0 +1,1095 @@
|
|
1
|
+
require 'tensor_stream/evaluator/operation_helpers/random_gaussian'
|
2
|
+
require 'tensor_stream/evaluator/operation_helpers/array_ops_helper'
|
3
|
+
require 'tensor_stream/evaluator/operation_helpers/math_helper'
|
4
|
+
require 'tensor_stream/evaluator/opencl_buffer'
|
5
|
+
require 'tensor_stream/evaluator/opencl_template_helper'
|
6
|
+
require 'distribution'
|
7
|
+
require 'opencl_ruby_ffi'
|
8
|
+
require 'narray_ffi'
|
9
|
+
|
10
|
+
module TensorStream
|
11
|
+
module Evaluator
|
12
|
+
# Marker error type; carries no behaviour beyond RuntimeError.
FullEvalNotPossible = Class.new(RuntimeError)
|
14
|
+
|
15
|
+
# Raised when evaluating a graph node fails. Keeps the node being evaluated
# (+tensor+) and the original error (+wrapped_exception+) for inspection.
class EvaluatorExcecutionException < RuntimeError
  attr_reader :tensor

  # exception - the error that was originally raised
  # tensor    - the tensor that was being evaluated at the time
  def initialize(exception, tensor)
    @tensor = tensor
    @exception = exception
  end

  # The original error this exception wraps.
  def wrapped_exception
    @exception
  end
end
|
28
|
+
|
29
|
+
## OpenCL evaluator: evaluates graph tensors using OpenCL kernels and buffers
|
30
|
+
class OpenclEvaluator
|
31
|
+
attr_accessor :retain
|
32
|
+
|
33
|
+
include TensorStream::OpHelper
|
34
|
+
include TensorStream::ArrayOpsHelper
|
35
|
+
include TensorStream::MathHelper
|
36
|
+
|
37
|
+
# Stores the session, the shared evaluation context and the evaluator options.
#
# session           - kept as @session
# context           - hash-like evaluation context; +context[:retain]+ seeds
#                     the retain list (defaults to [])
# thread_pool       - executor to use; a Concurrent::ImmediateExecutor is
#                     created when none is given
# log_intermediates - when true, +context[:compute_history]+ is reset to []
# preferred_device  - device to use instead of auto-selection
def initialize(session, context, thread_pool: nil, log_intermediates: false, preferred_device: nil)
  @session = session
  @context = context
  @preferred_device = preferred_device
  @log_intermediates = log_intermediates
  @retain = context[:retain] || []
  @thread_pool = thread_pool || Concurrent::ImmediateExecutor.new

  return unless log_intermediates

  @context[:compute_history] = []
end
|
47
|
+
|
48
|
+
# Main entry point of the OpenCL evaluator: makes sure the OpenCL device and
# context exist, evaluates +tensor+ and returns the result converted to Ruby.
def run(tensor, execution_context)
  _create_opencl_context
  # _prepare_kernels

  evaluated = complete_eval(tensor, execution_context)
  read_final_result(evaluated)
end
|
55
|
+
|
56
|
+
# Evaluates +tensor+ and reads the resulting device buffer(s) back into their
# host-side buffers.
#
# A read is enqueued for every non-empty buffer, waiting on the buffer's
# pending +op+ when it has one, and the queue is then finished.
#
# Returns the evaluated buffer, or an array of buffers when the evaluation
# produced an array. A nil or empty single result is returned immediately,
# without finishing the queue.
def complete_eval(tensor, context)
  create_command_queue
  buffer = _run(tensor, context)

  if buffer.is_a?(Array)
    buffer = buffer.map do |b|
      # nil entries are tolerated here just as a nil single result is below
      next b if b.nil? || b.buffer.size.zero?

      _opencl_queue.enqueue_read_buffer(b.cl_buffer, b.buffer, event_wait_list: [b.op].compact)
      b
    end
  else
    return buffer if buffer.nil? || buffer.buffer.size.zero?

    _opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: [buffer.op].compact)
  end

  _opencl_queue.finish
  buffer
end
|
73
|
+
|
74
|
+
# The OpenCL device stored in the context cache (nil until one has been set).
def opencl_device
  cache = @context[:_cache]
  cache[:_opencl_device]
end
|
77
|
+
|
78
|
+
protected
|
79
|
+
|
80
|
+
# Converts an evaluation result into plain Ruby data: arrays are converted
# element by element (recursively), nil stays nil, and anything else is
# converted through its +to_ruby+ method.
def read_final_result(buffer)
  case buffer
  when Array
    buffer.map { |entry| read_final_result(entry) }
  when nil
    nil
  else
    buffer.to_ruby
  end
end
|
87
|
+
|
88
|
+
# Lazily fills the context cache with the OpenCL device and context.
#
# The device is the preferred device when one was given, otherwise the first
# element of what choose_best_device returns. Both entries are only set when
# they are not already cached.
def _create_opencl_context
  cache = @context[:_cache]
  cache[:_opencl_device] ||= @preferred_device || Array(choose_best_device).first
  cache[:_opencl_context] ||= OpenCL.create_context(opencl_device)
end
|
99
|
+
|
100
|
+
# Picks the highest-scoring available OpenCL device across all platforms.
#
# A device's score is its number of compute units, plus 1 for a CPU or 4 for
# a GPU. Only devices reporting +available > 0+ are considered.
#
# Returns [device, score, platform_name, index_within_platform], or nil when
# no device is available. The winning entry is memoized, so repeated calls
# return the same result without querying the platforms again.
def choose_best_device
  @best_device ||= begin
    candidates = OpenCL.platforms.flat_map do |platform|
      platform.devices.select { |d| d.available > 0 }.each_with_index.map do |device, index|
        score = device.max_compute_units
        case device.type.to_s
        when 'CPU' then score += 1
        when 'GPU' then score += 4
        end

        [device, score, platform.name, index]
      end
    end

    # max_by ranks by the score; a one-argument block handed to max would be
    # treated as a comparator and would not select the best entry.
    candidates.max_by { |_device, score, _platform, _index| score }
  end
end
|
119
|
+
|
120
|
+
# Returns the command queue stored in the context cache, creating it first
# when there is none.
#
# Profiling and out-of-order execution are requested only when the device
# lists them among its queue properties. The device is not queried at all
# once a queue has been cached.
def create_command_queue
  cache = @context[:_cache]
  return cache[:_opencl_queue] if cache[:_opencl_queue]

  supported_properties = opencl_device.queue_properties.names
  properties = []
  properties << OpenCL::CommandQueue::PROFILING_ENABLE if supported_properties.include?('PROFILING_ENABLE')
  properties << OpenCL::CommandQueue::OUT_OF_ORDER_EXEC_MODE_ENABLE if supported_properties.include?('OUT_OF_ORDER_EXEC_MODE_ENABLE')

  cache[:_opencl_queue] = _opencl_context.create_command_queue(opencl_device, properties: properties)
end
|
127
|
+
|
128
|
+
# The OpenCL context stored in the context cache (nil until one has been set).
def _opencl_context
  cache = @context[:_cache]
  cache[:_opencl_context]
end
|
131
|
+
|
132
|
+
# The OpenCL command queue stored in the context cache (nil until one has been set).
def _opencl_queue
  cache = @context[:_cache]
  cache[:_opencl_queue]
end
|
135
|
+
|
136
|
+
# Path of the kernel source "<kernel>.<extension>" inside the +kernels+
# directory that sits next to this file.
def cl_template_path(kernel, extension)
  kernels_dir = File.join(File.dirname(__FILE__), 'kernels')
  File.join(kernels_dir, "#{kernel}.#{extension}")
end
|
139
|
+
|
140
|
+
# Returns the built OpenCL program for +kernel+, building it on first use and
# keeping it in the context cache under "_opencl_kernel_<kernel>".
#
# The source is looked up as "<kernel>.cl.erb" first, then "<kernel>.cl", and
# is passed through OpenclTemplateHelper before being compiled.
#
# Raises RuntimeError when neither source file exists. On a build failure the
# build log is printed and the OpenCL error is re-raised.
def _cl_program(kernel)
  @context[:_cache]["_opencl_kernel_#{kernel}"] ||= begin
    filename = %w[cl.erb cl].map { |ext| cl_template_path(kernel, ext) }.find { |n| File.exist?(n) }
    # Fail with a clear message instead of letting File.read(nil) raise a TypeError
    raise "OpenCL kernel source for '#{kernel}' not found" if filename.nil?

    source = File.read(filename)
    source = OpenclTemplateHelper.new(source).generate
    program = _opencl_context.create_program_with_source(source)
    program.build
  rescue OpenCL::Error::BUILD_PROGRAM_FAILURE => e
    puts "OpenCL Compile error: #{program.build_log}"
    raise e
  end
end
|
152
|
+
|
153
|
+
# Evaluates +tensor+ into a buffer, dispatching on what kind of node it is.
#
# Buffers that are already evaluated are returned untouched, arrays are
# evaluated element by element, and anything listed in +retain+ is returned
# as is. Procs are called first to obtain the actual node. Evaluation runs
# against a copy of +execution_context+; afterwards the copy's :returns entry
# is merged back into the original.
def _run(tensor, execution_context)
  return tensor if tensor.is_a?(OpenCLBuffer)
  return tensor.map { |t| _run(t, execution_context) } if tensor.is_a?(Array)

  return tensor if retain.include?(tensor) # if var is in retain don't eval to value

  tensor = tensor.call if tensor.is_a?(Proc)

  child_context = execution_context.dup
  res = case tensor
        when Operation
          eval_operation(tensor, child_context)
        when Variable
          eval_variable(tensor, child_context)
        when Placeholder
          resolve_placeholder(tensor, child_context)
        else
          eval_tensor(tensor, child_context)
        end
  execution_context.deep_merge!(returns: child_context[:returns])
  res
end
|
174
|
+
|
175
|
+
# Returns the buffer backing a variable, wrapping the variable into a new
# buffer when it has none yet.
#
# Raises when the variable has no value and either no buffer or a buffer that
# is not marked dirty. +child_context+ is accepted but not used.
def eval_variable(tensor, child_context)
  has_dirty_buffer = !tensor.buffer.nil? && tensor.buffer.dirty
  raise "variable #{tensor.name} not initalized" if tensor.value.nil? && !has_dirty_buffer

  if tensor.buffer.nil?
    tensor.buffer = wrap_opencl(tensor, name: tensor.name)
  end

  tensor.buffer
end
|
180
|
+
|
181
|
+
# Evaluates a single operation node and returns its result (normally a
# buffer, or an array of buffers for ops such as :flow_group).
#
# A result already stored in @context under the tensor's name is returned
# directly; otherwise the first two items of the operation are resolved as
# +a+ and +b+, the operation is dispatched on +tensor.operation+, and the
# result is stored in @context under the tensor's name.
#
# When the tensor has a breakpoint, it is called with the Ruby values of
# +a+, +b+ and the result. When intermediates are being logged, an entry is
# appended to @context[:compute_history].
#
# Raises EvaluatorExcecutionException wrapping any StandardError raised while
# evaluating; an EvaluatorExcecutionException from a nested evaluation is
# passed through unchanged.
def eval_operation(tensor, child_context)
  return @context[tensor.name] if @context.key?(tensor.name)

  a = resolve_placeholder(tensor.items[0], child_context) if tensor.items && tensor.items[0]
  b = resolve_placeholder(tensor.items[1], child_context) if tensor.items && tensor.items[1]

  case tensor.operation
  when :concat
    input_a = read_final_result(complete_eval(a, child_context))
    arr = concat_array(input_a, tensor.options[:axis])
    convert_to_opencl(arr.flatten, shape_eval(arr), data_type: tensor.data_type, name: tensor.name)
  when :cond
    pred = complete_eval(tensor.options[:pred], child_context)
    a = _run(a, child_context)
    b = _run(b, child_context)

    if all_true?(pred.buffer)
      a
    else
      b
    end
  when :identity
    _run(a, child_context)
  when :eye
    rows = complete_eval(a, child_context)
    columns = complete_eval(b, child_context)
    shape = [rows.buffer[0], columns.buffer[0]]
    eye_arr = Array.new(rows.buffer[0]) do |i|
      Array.new(columns.buffer[0]) do |col|
        if fp_type?(tensor.data_type)
          i == col ? 1.0 : 0.0
        else
          i == col ? 1 : 0
        end
      end
    end

    convert_to_opencl(eye_arr.flatten, shape, data_type: tensor.data_type, name: tensor.name)
  when :pad
    a = read_final_result(complete_eval(a, child_context))
    p = read_final_result(complete_eval(tensor.options[:paddings], child_context))

    padding = arr_pad(a, p, tensor.data_type)
    convert_to_opencl(padding.flatten, shape_eval(padding), data_type: tensor.data_type, name: tensor.name)
  when :tile
    input = read_final_result(complete_eval(a, child_context))
    multiples = read_final_result(complete_eval(b, child_context))

    rank = get_rank(input)
    raise '1D or higher tensor required' if rank.zero?
    raise "invalid multiple size passed #{rank} != #{multiples.size}" if rank != multiples.size

    tile = tile_arr(input, 0, multiples)
    arr = tile.nil? ? [] : tile
    convert_to_opencl(arr.flatten, shape_eval(arr), data_type: tensor.data_type, name: tensor.name)
  when :assign
    assign_var(tensor, b, child_context)
  when :assign_add
    a = _run(a, child_context)
    b = _run(b, child_context)

    value = execute_2_operand_func('add', tensor, a, b, child_context)
    assign_var(tensor, value, child_context)
  when :assign_sub
    a = _run(a, child_context)
    b = _run(b, child_context)

    value = execute_2_operand_func('sub', tensor, a, b, child_context)
    assign_var(tensor, value, child_context)
  when :less
    execute_2_operand_func('less', tensor, a, b, child_context, 'cond')
  when :less_equal
    execute_2_operand_func('less_equal', tensor, a, b, child_context, 'cond')
  when :greater
    execute_2_operand_func('greater', tensor, a, b, child_context, 'cond')
  when :greater_equal
    execute_2_operand_func('greater_equal', tensor, a, b, child_context, 'cond')
  when :equal
    execute_2_operand_func('equal', tensor, a, b, child_context, 'cond')
  when :not_equal
    execute_2_operand_func('not_equal', tensor, a, b, child_context, 'cond')
  when :logical_and
    execute_2_operand_func('logical_and', tensor, a, b, child_context, 'cond')
  when :where
    pred = tensor.options[:pred]
    execute_cond_func('where', tensor, pred, a, b, child_context)
  when :max
    execute_2_operand_func('max', tensor, a, b, child_context)
  when :add
    execute_2_operand_func('add', tensor, a, b, child_context)
  when :div
    execute_2_operand_func('div', tensor, a, b, child_context)
  when :sub
    execute_2_operand_func('sub', tensor, a, b, child_context)
  when :matmul
    a = _run(a, child_context)
    b = _run(b, child_context)

    m = a.shape[0]
    n = b.shape[1]
    v = b.shape[0]
    k = a.shape[1]

    m, k = [a.shape[1], a.shape[0]] if tensor.options[:transpose_a]
    n, v = [b.shape[0], b.shape[1]] if tensor.options[:transpose_b]

    result_shape = [m, n]

    raise "#{tensor.items[0].name} rank must be greater than 1" if a.shape.size < 2
    raise "#{tensor.items[1].name} rank must be greater than 1" if b.shape.size < 2
    raise "incompatible shape sizes for matrix multiplication (#{a.shape[1]} != #{b.shape[0]}) #{a.shape} vs #{b.shape}" if k != v

    dtype = TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type) ? 'fp' : 'int'
    a, b = type_cast(a, b)
    output_buffer = _create_result_buffer(a.data_type, result_shape, tensor.name)

    cl_m = OpenCL::Int1.new(m)
    cl_n = OpenCL::Int1.new(n)
    cl_k = OpenCL::Int1.new(k)

    transpose_a = OpenCL::Int1.new(tensor.options[:transpose_a] ? 1 : 0)
    transpose_b = OpenCL::Int1.new(tensor.options[:transpose_b] ? 1 : 0)

    output_buffer.op = _cl_program('gemm').send(:"gemm_#{dtype}", _opencl_queue, result_shape, cl_m, cl_n, cl_k, transpose_a, transpose_b, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer)
    output_buffer
  when :mul
    execute_2_operand_func('mul', tensor, a, b, child_context)
  when :pow
    execute_2_operand_func('pow', tensor, a, b, child_context)
  when :cast
    a = _run(a, child_context)
    if a.data_type != tensor.data_type
      buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
      s_dtype = TensorStream::Ops::FLOATING_POINT_TYPES.include?(a.data_type) ? 'fp' : 'int'
      t_dtype = TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type) ? 'fp' : 'int'
      m, n = a.shape
      cl_m = OpenCL::Int1.new(m || 1)
      cl_n = OpenCL::Int1.new(n || 1)
      work_group = [m || 1, n || 1]

      buffer.op = _cl_program("cast").send(:"cast_#{s_dtype}_#{t_dtype}",_opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, buffer.cl_buffer)
      buffer
    else
      a
    end
  when :sign
    execute_func('sign', tensor, a, child_context)
  when :exp
    execute_func('exp', tensor, a, child_context)
  when :log
    execute_func('log', tensor, a, child_context)
  when :sin
    execute_func('sin', tensor, a, child_context)
  when :tan
    execute_func('tan', tensor, a, child_context)
  when :cos
    execute_func('cos', tensor, a, child_context)
  when :abs
    execute_func('abs', tensor, a, child_context)
  when :sqrt
    execute_func('sqrt', tensor, a, child_context)
  when :negate
    execute_func('negate', tensor, a, child_context)
  when :square
    execute_func('square', tensor, a, child_context)
  when :reciprocal
    execute_func('reciprocal', tensor, a, child_context)
  when :tanh
    execute_func('tanh', tensor, a, child_context)
  when :tanh_grad
    execute_func('tanh_grad', tensor, a, child_context)
  when :sigmoid
    execute_func('sigmoid', tensor, a, child_context)
  when :log1p
    execute_func('log1p', tensor, a, child_context)
  when :round
    execute_func('round', tensor, a, child_context)
  when :sigmoid_grad
    execute_2_operand_func('sigmoid_grad', tensor, a, b, child_context)
  when :truncate
    a = _run(a, child_context)
    b = _run(b, child_context)

    if a.shape.size.zero?
      a
    else
      input_b = read_final_result(b)
      if a.shape == input_b
        a
      else
        input_a = read_final_result(a)
        if input_b == []
          if a.buffer.size == 1
            a.shape = input_b
            a
          else
            wrap_opencl(a.buffer[0], data_type: a.data_type, name: tensor.name)
          end
        else
          wrap_opencl(truncate(input_a, input_b), data_type: a.data_type, name: tensor.name)
        end
      end
    end
  when :zeros, :ones, :zeros_like, :ones_like
    shape = if %i[zeros_like ones_like].include?(tensor.operation)
              _run(a, child_context).shape
            else
              read_final_result(complete_eval(a, child_context)) || tensor.shape.shape
            end

    func = if %i[zeros zeros_like].include?(tensor.operation)
             -> { tensor.data_type == :int32 ? 0 : 0.0 }
           else
             -> { tensor.data_type == :int32 ? 1 : 1.0 }
           end

    size = shape.empty? ? 1 : shape.reduce(:*)

    # Only floating point and integer types are supported; reject anything
    # else before generating data.
    supported_type = TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type) ||
                     TensorStream::Ops::INTEGER_TYPES.include?(tensor.data_type)
    raise "unsupported type #{tensor.data_type}" unless supported_type

    data = if !shape.empty?
             Array.new(size) { func.call }
           else
             func.call
           end

    convert_to_opencl(data, shape, data_type: tensor.data_type, name: tensor.name)
  when :broadcast_transform
    a = _run(a, child_context)
    b = _run(b, child_context)

    if a.shape == b.shape
      [a, b]
    else
      input_a = read_final_result(complete_eval(a, child_context))
      input_b = read_final_result(complete_eval(b, child_context))
      b_a, b_b = broadcast(input_a, input_b)
      [ wrap_opencl(b_a, data_type: a.data_type, name: "#{tensor.name}_a"),
        wrap_opencl(b_b, data_type: a.data_type, name: "#{tensor.name}_b")]
    end
  when :print
    a = _run(a, child_context)
    b = _run(b, child_context)
    input_b = complete_eval(b, child_context)
    input_b = read_final_result(input_b)
    puts "#{tensor.options.fetch(:message, '')} #{input_b}"
    a
  when :rank
    a = _run(a, child_context)
    wrap_opencl(a.shape.size, data_type: tensor.data_type, name: tensor.name)
  when :stop_gradient
    _run(a, child_context)
  when :slice
    input_a = complete_eval(a, child_context)
    input_b = read_final_result(complete_eval(b, child_context))
    size = tensor.options[:size]

    slice_param = input_b.zip(size).collect { |x, y| x..x + y - 1 }.reverse

    new_buf = input_a.buffer.reshape(*input_a.shape.reverse)
    sliced = new_buf.slice[*slice_param]
    convert_to_opencl(sliced.flatten, sliced.shape.reverse, data_type: a.data_type, name: tensor.name)
  when :transpose
    input_a = complete_eval(a, child_context)
    t_param = Array.new(input_a.shape.size) { |index| index }.reverse
    transposed = input_a.buffer.reshape(*input_a.shape.reverse).transpose(*t_param)
    convert_to_opencl(transposed.flatten, transposed.shape.reverse, data_type: a.data_type, name: tensor.name)
  when :index
    a = complete_eval(a, child_context)
    input_a = read_final_result(a)
    index = read_final_result(complete_eval(b, child_context))

    if a.is_a?(Array)
      a[index]
    else
      new_shape = a.shape.dup
      new_shape.shift
      convert_to_opencl(input_a[index], new_shape, data_type: a.data_type, name: tensor.name)
    end
  when :broadcast_gradient_args
    a = complete_eval(a, child_context)
    b = complete_eval(b, child_context)

    wrap_opencl(get_broadcast_gradient_args(a.buffer.to_a, b.buffer.to_a), data_type: a.data_type, name: tensor.name)
  when :shape
    a = _run(a, child_context)

    wrap_opencl(a.shape, name: tensor.name, data_type: tensor.options[:out_type] || :float32)
  when :reshape
    arr = complete_eval(a, child_context)
    new_shape = read_final_result(complete_eval(b, child_context))

    if new_shape.size.zero? && arr.buffer.size == 1
      arr.shape = new_shape
      arr
    else
      new_shape = TensorShape.fix_inferred_elements(new_shape, arr.buffer.size)
      arr.shape = new_shape
      arr
    end
  when :random_uniform
    maxval = tensor.options.fetch(:maxval, 1)
    minval = tensor.options.fetch(:minval, 0)
    seed = tensor.options[:seed]

    random = _get_randomizer(tensor, seed)
    generator = -> { random.rand * (maxval - minval) + minval }
    shape = tensor.options[:shape] || tensor.shape.shape

    convert_to_opencl(generate_vector(shape, generator: generator), shape, data_type: tensor.data_type, name: tensor.name)
  when :random_normal
    # Read the seed in this branch: a local assigned only in another `when`
    # branch is nil here, which would silently drop the op's :seed option.
    seed = tensor.options[:seed]
    random = _get_randomizer(tensor, seed)
    r = RandomGaussian.new(tensor.options.fetch(:mean), tensor.options.fetch(:stddev), -> { random.rand })
    generator = -> { r.rand }
    shape = tensor.options[:shape] || tensor.shape.shape

    convert_to_opencl(generate_vector(shape, generator: generator), shape, data_type: tensor.data_type, name: tensor.name)
  when :glorot_uniform
    # Same as above: the seed has to be read in this branch to be honoured.
    seed = tensor.options[:seed]
    random = _get_randomizer(tensor, seed)

    shape = tensor.options[:shape] || tensor.shape.shape
    fan_in, fan_out = if shape.size.zero?
                        [1, 1]
                      elsif shape.size == 1
                        [1, shape[0]]
                      else
                        [shape[0], shape.last]
                      end

    limit = Math.sqrt(6.0 / (fan_in + fan_out))

    minval = -limit
    maxval = limit

    generator = -> { random.rand * (maxval - minval) + minval }
    convert_to_opencl(generate_vector(shape, generator: generator), shape, data_type: tensor.data_type, name: tensor.name)
  when :flow_group
    tensor.items.collect { |item| _run(item, child_context) }
  when :sum
    reduction(child_context, tensor, a, b, :sum)
  when :mean
    reduction(child_context, tensor, a, b, :mean)
  when :prod
    input_a = complete_eval(a, child_context)
    if input_a.buffer.empty?
      convert_to_opencl([1.0], [], data_type: a.data_type, name: tensor.name)
    else
      reduction(child_context, tensor, a, b, :prod)
    end
  when :argmin
    a = complete_eval(a, child_context)
    axis = tensor.options[:axis] || 0
    arr = a.buffer.reshape(*a.shape.reverse).to_a
    op = get_op_with_axis(arr, axis, 0, a.data_type, ->(x, y) { x < y })
    convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
  when :argmax
    a = complete_eval(a, child_context)
    axis = tensor.options[:axis] || 0
    arr = a.buffer.reshape(*a.shape.reverse).to_a
    op = get_op_with_axis(arr, axis, 0, a.data_type, ->(x, y) { x > y })
    convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
  else
    raise "unknown op #{tensor.operation}"
  end.tap do |result|
    if tensor.breakpoint
      # Use separate locals for the Ruby values handed to the breakpoint, so
      # that `result` stays the evaluated buffer that is cached below and
      # handed back to callers on later lookups.
      value_a = read_final_result(complete_eval(a, child_context))
      value_b = read_final_result(complete_eval(b, child_context))
      value_result = read_final_result(complete_eval(result, child_context))

      tensor.breakpoint.call(tensor, value_a, value_b, value_result)
    end
    if @log_intermediates
      @context[:compute_history] << {
        name: tensor.name,
        type: tensor.data_type,
        shape: shape_eval(result),
        source: tensor.source,
        description: tensor.to_math(true, 1),
        value: result
      }
    end
    @context[tensor.name] = result
  end
rescue EvaluatorExcecutionException
  # Already wrapped by a nested evaluation: pass it through untouched.
  raise
rescue StandardError => e
  puts e.message
  puts e.backtrace.join("\n")

  raise EvaluatorExcecutionException.new(e, tensor), "error #{e.message} while evaluating #{tensor.name} : #{tensor.to_math(true,1)} defined at #{tensor.source}"
end
|
595
|
+
|
596
|
+
# Evaluates +tensor+ and memoizes the outcome in @context.
#
# Anything that is not a Tensor is handed back untouched. For a Tensor the
# result is looked up under a key built from the owning graph's object_id and
# the tensor name; constants are additionally kept in @context[:_cache].
#
# @param tensor [Tensor, Object] value to evaluate
# @param child_context [Hash] context forwarded to _run
# @return [Object] the cached or freshly evaluated value
def eval_tensor(tensor, child_context)
  return tensor unless tensor.is_a?(Tensor)

  cache_key = "#{tensor.graph.object_id}_opencl_#{tensor.name}"
  return @context[cache_key] if @context.key?(cache_key)
  return @context[:_cache][cache_key] if tensor.is_const && @context[:_cache][cache_key]

  result = if tensor.value.is_a?(Tensor)
             _run(tensor.value, child_context)
           else
             wrap_opencl(tensor, name: tensor.name)
           end

  @context[cache_key] = result
  @context[:_cache][cache_key] = result if tensor.is_const

  # Return the value explicitly: ending on the modifier-if assignment above
  # would make the method return nil for every non-constant tensor.
  result
end
|
609
|
+
|
610
|
+
private
|
611
|
+
|
612
|
+
# Stores the evaluated value of +b+ in the variable addressed by +tensor+.
#
# The target is tensor.items[0] when present, otherwise +tensor+ itself. If
# the target already owns a buffer, the new data is enqueued as a write into
# its cl_buffer; otherwise a new buffer is built through convert_to_opencl.
# The buffer is flagged dirty and returned.
def assign_var(tensor, b, child_context)
  target = tensor.items[0] || tensor
  source = complete_eval(b, child_context)

  if target.buffer
    existing = target.buffer
    existing.op = _opencl_queue.enqueue_write_buffer(existing.cl_buffer, source.buffer)
  else
    target.buffer = convert_to_opencl(
      read_final_result(source),
      source.shape,
      data_type: tensor.data_type,
      name: tensor.name
    )
  end

  target.buffer.dirty = true
  target.buffer
end
|
623
|
+
|
624
|
+
# Runs a two-operand OpenCL kernel for +tensor+.
#
# Both inputs are evaluated and passed through type_cast; select_program then
# decides the kernel variant and whether the operands are swapped. The kernel
# name is "<variant>_fp" or "<variant>_int" depending on tensor.data_type, and
# it is looked up in the program named +prog_name+ (falling back to +op_name+).
# The kernel event is stored on the returned output buffer.
#
# Raises "rank > 2 not supported!" when the broadcast variant ("<op>_b") is
# selected and the second operand has a rank other than 1 or 2.
def execute_2_operand_func(op_name, tensor, input_a, input_b, child_context, prog_name = nil)
  lhs = _run(input_a, child_context)
  rhs = _run(input_b, child_context)
  lhs, rhs = type_cast(lhs, rhs)
  kernel_suffix = TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type) ? 'fp' : 'int'
  result_shape = TensorShape.infer_shape(lhs.shape, rhs.shape)

  output_buffer = _create_result_buffer(tensor.data_type, result_shape, tensor.name)
  lhs, rhs, prog, switch_operands = select_program(lhs, rhs, op_name)

  rows, cols = result_shape
  rows ||= 1
  cols ||= 1
  work_group = [rows, cols]
  kernel_args = [OpenCL::Int1.new(rows), OpenCL::Int1.new(cols)]
  cl_switch = OpenCL::Int1.new(switch_operands) # no need to switch for addition

  event_wait_list = [lhs.op, rhs.op].compact # add dependency wait list

  # The broadcast variant additionally receives the dimensions of the second operand.
  if prog == "#{op_name}_b"
    broadcast_dims = case rhs.shape.size
                     when 2 then [rhs.shape[0], rhs.shape[1]]
                     when 1 then [1, rhs.shape[0]]
                     else raise "rank > 2 not supported!"
                     end
    broadcast_dims.each { |dim| kernel_args << OpenCL::Int1.new(dim) }
  end

  kernel_args.push(cl_switch, lhs.cl_buffer, rhs.cl_buffer, output_buffer.cl_buffer)

  program = _cl_program((prog_name || op_name).to_s)
  output_buffer.op = program.send(:"#{prog}_#{kernel_suffix}", _opencl_queue, work_group, *kernel_args, event_wait_list: event_wait_list)
  output_buffer
end
|
657
|
+
|
658
|
+
# Runs a three-input kernel (predicate plus two operands) for +tensor+.
#
# The predicate and both operands are evaluated, the operands are passed
# through type_cast, and the kernel "<op_name>_fp" / "<op_name>_int" is
# launched over a work group taken from the predicate's shape. The output
# buffer has the predicate's shape and carries the kernel event in +op+.
def execute_cond_func(op_name, tensor, pred, input_a, input_b, child_context)
  predicate = _run(pred, child_context)
  lhs = _run(input_a, child_context)
  rhs = _run(input_b, child_context)

  lhs, rhs = type_cast(lhs, rhs)
  kernel_suffix = TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type) ? 'fp' : 'int'

  output_buffer = _create_result_buffer(tensor.data_type, predicate.shape, tensor.name)

  rows, cols = predicate.shape
  rows ||= 1
  cols ||= 1
  work_group = [rows, cols]
  cl_m = OpenCL::Int1.new(rows)
  cl_n = OpenCL::Int1.new(cols)

  event_wait_list = [lhs.op, rhs.op, predicate.op].compact # add dependency wait list

  program = _cl_program(op_name.to_s)
  output_buffer.op = program.send(
    :"#{op_name}_#{kernel_suffix}",
    _opencl_queue, work_group, cl_m, cl_n,
    predicate.cl_buffer, lhs.cl_buffer, rhs.cl_buffer, output_buffer.cl_buffer,
    event_wait_list: event_wait_list
  )
  output_buffer
end
|
677
|
+
|
678
|
+
# Runs a single-input kernel for +tensor+.
#
# +a+ is evaluated, an output buffer with the same shape is obtained from
# _create_result_buffer, and the kernel "<op_name>_fp" / "<op_name>_int"
# (chosen from tensor.data_type) is launched. The kernel event is stored on
# the returned output buffer.
def execute_func(op_name, tensor, a, child_context)
  input = _run(a, child_context)
  event_wait_list = [input.op].compact
  kernel_suffix = TensorStream::Ops::FLOATING_POINT_TYPES.include?(tensor.data_type) ? 'fp' : 'int'
  output_buffer = _create_result_buffer(tensor.data_type, input.shape, tensor.name)

  rows, cols = input.shape
  rows ||= 1
  cols ||= 1
  work_group = [rows, cols]
  cl_m = OpenCL::Int1.new(rows)
  cl_n = OpenCL::Int1.new(cols)

  program = _cl_program(op_name.to_s)
  output_buffer.op = program.send(
    :"#{op_name}_#{kernel_suffix}",
    _opencl_queue, work_group, cl_m, cl_n,
    input.cl_buffer, output_buffer.cl_buffer,
    event_wait_list: event_wait_list
  )
  output_buffer
end
|
693
|
+
|
694
|
+
# Makes +b+ type-compatible with +a+ by casting +b+ on the device.
#
# Only int -> float (when +a+ is floating point) and float -> int (when +a+
# is an integer type) are handled; any other combination returns both
# operands unchanged.
#
# Changes from the previous implementation:
# * the output buffer is allocated with +a+'s data type, since that is what
#   the cast kernel writes into it;
# * the buffer is only allocated when a cast actually happens;
# * the cast kernel waits on +b+'s pending event and its own event is stored
#   in the new buffer's +op+, so later kernels that build their wait list
#   from +op+ are ordered after the cast.
#
# @return [Array] [a, b] or [a, casted copy of b]
def type_cast(a, b)
  return [a, b] if a.data_type == b.data_type

  fp_types = TensorStream::Ops::FLOATING_POINT_TYPES
  int_types = TensorStream::Ops::INTEGER_TYPES
  a_type = a.data_type.to_sym
  b_type = b.data_type.to_sym

  kernel = if fp_types.include?(a_type) && int_types.include?(b_type)
             :cast_int_fp
           elsif int_types.include?(a_type) && fp_types.include?(b_type)
             :cast_fp_int
           end
  return [a, b] unless kernel

  m, n = b.shape
  work_group = [m || 1, n || 1]
  cl_m = OpenCL::Int1.new(m || 1)
  cl_n = OpenCL::Int1.new(n || 1)

  casted = buffer_for(b.shape, a.data_type)
  event_wait_list = [b.op].compact
  casted.op = _cl_program("cast").send(kernel, _opencl_queue, work_group, cl_m, cl_n, b.cl_buffer, casted.cl_buffer, event_wait_list: event_wait_list)
  [a, casted]
end
|
718
|
+
|
719
|
+
# Allocates a host NArray plus a device buffer of matching byte size for the
# given shape and data type, and wraps both in an OpenCLBuffer.
# An empty shape is given a single element.
def buffer_for(shape, data_type)
  element_count = shape.reduce(:*) || 1

  host_buffer = allocate_narray_for_type(data_type, element_count)
  byte_size = host_buffer.size * host_buffer.element_size
  device_buffer = _opencl_context.create_buffer(byte_size)

  OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: host_buffer, cl_buffer: device_buffer)
end
|
727
|
+
|
728
|
+
# Builds an OpenCL buffer from either a Tensor or a raw value.
#
# For a Tensor the value and shape come from the tensor itself; for anything
# else the shape is derived with shape_eval. When +data_type+ is not given,
# tensor.data_type is used.
def wrap_opencl(tensor, data_type: nil, name: nil)
  if tensor.is_a?(Tensor)
    value = tensor.value
    shape = tensor.shape.shape
  else
    value = tensor
    shape = shape_eval(tensor)
  end

  convert_to_opencl(value, shape, data_type: data_type || tensor.data_type, name: name)
end
|
737
|
+
|
738
|
+
# Copies +value+ into an OpenCLBuffer and enqueues the host-to-device write.
#
# * A value that is neither an Array nor an NArray is wrapped in a one-element Array.
# * When +name+ is given and an object already exists in @context[:_cache]
#   under "_cl_object_<name>_<shape>", it is reused; otherwise a new
#   OpenCLBuffer is created and stored under that key.
# * No device buffer is created when the flattened value is empty.
# * Array elements that are Tensors are evaluated with complete_eval before
#   being stored; other elements go through Tensor.cast_dtype.
#
# @return [OpenCLBuffer] whose +op+ is the write event, or nil when nothing was written
def convert_to_opencl(value, shape, data_type: nil, name: nil)
  value = [value] unless value.is_a?(Array) || value.is_a?(NArray)

  cache_key = "_cl_object_#{name}_#{shape.join('_')}"
  cached = name && @context[:_cache][cache_key]

  cl_object = cached || begin
    element_count = shape.reduce(:*) || 1
    host_buffer = value.is_a?(NArray) ? value : allocate_narray_for_type(data_type, element_count)

    device_buffer = nil
    unless value.flatten.empty?
      # a zero-sized shape still gets a one-element device buffer
      slots = element_count.zero? ? 1 : element_count
      device_buffer = _opencl_context.create_buffer(slots * host_buffer.element_size)
    end

    @context[:_cache][cache_key] = OpenCLBuffer.new(name: name, data_type: data_type, shape: shape, buffer: host_buffer, cl_buffer: device_buffer)
  end

  case value
  when Array
    value.flatten.each_with_index do |element, index|
      cl_object.buffer[index] = if element.is_a?(Tensor)
                                  read_final_result(complete_eval(element, {}))
                                else
                                  Tensor.cast_dtype(element, data_type)
                                end
    end
  when NArray
    cl_object.buffer = value
  end

  writable = cl_object.cl_buffer && !value.nil? && !(value.is_a?(Array) && value.empty?)
  cl_object.op = writable ? _opencl_queue.enqueue_write_buffer(cl_object.cl_buffer, cl_object.buffer) : nil
  cl_object
end
|
787
|
+
|
788
|
+
# Allocates a host-side NArray with +narray_size+ elements for +data_type+.
#
# Floating point types get NArray.sfloat; integer types and :boolean get
# NArray.int.
#
# @param data_type [Symbol, String] anything responding to to_sym
# @param narray_size [Integer] number of elements
# @raise [RuntimeError] "unsupported type ..." for any other data type
def allocate_narray_for_type(data_type, narray_size)
  type = data_type.to_sym

  if TensorStream::Ops::FLOATING_POINT_TYPES.include?(type)
    NArray.sfloat(narray_size)
  elsif TensorStream::Ops::INTEGER_TYPES.include?(type) || type == :boolean
    NArray.int(narray_size)
  else
    raise "unsupported type #{data_type}"
  end
end
|
799
|
+
|
800
|
+
# Returns the output buffer for an operation, creating it on first use.
#
# Buffers are memoized in @context[:_cache] under "_result_<name>_<shape>",
# so repeated calls with the same name and shape return the same object.
def _create_result_buffer(data_type, shape, name)
  cache = @context[:_cache]
  key = "_result_#{name}_#{shape.join('_')}"
  return cache[key] if cache[key]

  element_count = shape.reduce(:*) || 1
  host_buffer = allocate_narray_for_type(data_type, element_count)
  device_buffer = _opencl_context.create_buffer(host_buffer.size * host_buffer.element_size)

  cache[key] = OpenCLBuffer.new(data_type: data_type, shape: shape, buffer: host_buffer, cl_buffer: device_buffer)
end
|
808
|
+
|
809
|
+
# Finds, along +target_axis+, the index of the element preferred by +op+
# (by default the largest one, i.e. an argmax).
#
# The nested array is descended until +current_axis+ reaches +target_axis+.
# At that level, if the entries are arrays the search runs column by column
# across the rows; otherwise it runs over the entries themselves. On ties the
# first index wins when +op+ is a strict comparison. Every index is passed
# through Tensor.cast_dtype with +output_type+.
def get_op_with_axis(a, target_axis, current_axis, output_type, op = ->(t, u) { t > u })
  unless target_axis == current_axis
    return a.map { |row| get_op_with_axis(row, target_axis, current_axis + 1, output_type, op) }
  end

  # Scans a flat list and returns the (cast) position of the preferred entry.
  pick_index = lambda do |candidates|
    best = nil
    best_index = 0
    candidates.each_with_index do |candidate, position|
      next unless best.nil? || op.call(candidate, best)

      best = candidate
      best_index = position
    end
    Tensor.cast_dtype(best_index, output_type)
  end

  return pick_index.call(a) unless a[0].is_a?(Array)

  (0...a[0].size).map do |column_index|
    pick_index.call(a.map { |row| row[column_index] })
  end
end
|
841
|
+
|
842
|
+
# Reduces input +a+ with the NArray method named +func+ (e.g. :sum, :mean,
# :prod) along the axis or axes given by +b+.
#
# * With a nil axis the whole buffer is reduced to a scalar.
# * A scalar input (empty shape) is returned as is.
# * The host buffer is reshaped with its dimensions reversed, so tensor axis k
#   corresponds to NArray dimension (rank - 1 - k). Negative axes count from
#   the last axis (-1 is the last one). The previous `rank - x.abs` mapping
#   only matched that for tensors of rank <= 2.
# * With tensor.options[:keepdims] the output shape comes from reduced_shape.
#
# NOTE(review): assumes the NArray reduction methods accept a dimension index
# argument, as the original code already relied on.
def reduction(child_context, tensor, a, b, func)
  input = complete_eval(a, child_context)
  axis = read_final_result(complete_eval(b, child_context))

  if axis.nil?
    return convert_to_opencl(input.buffer.send(func), [], data_type: tensor.data_type, name: tensor.name)
  end
  return input if input.shape.empty?

  dims = input.shape.size
  value = input.buffer.reshape(*input.shape.reverse)

  # tensor axis (possibly negative) -> dimension index of the reversed NArray
  to_narray_dim = ->(x) { dims - 1 - (x < 0 ? x + dims : x) }

  if axis.is_a?(Array)
    # Reduce the highest dimension first so the remaining indices stay valid.
    axis.map(&to_narray_dim).uniq.sort.reverse.each do |dim|
      value = value.send(func, dim)
    end
  else
    value = value.send(func, to_narray_dim.call(axis))
  end

  new_shape = if value.is_a?(NArray)
                value.shape.reverse
              else
                value = [value]
                []
              end

  new_shape = reduced_shape(input.shape.dup, axis) if tensor.options[:keepdims]

  convert_to_opencl(value.flatten, new_shape, data_type: tensor.data_type, name: tensor.name)
end
|
874
|
+
|
875
|
+
# Pads a nested array with zeros.
#
# paddings[rank] must be a two-element [before, after] pair giving how many
# entries to add before and after the data at nesting level +rank+. The fill
# value is 0.0 when fp_type?(data_type) is true and 0 otherwise.
#
# Each inserted padding row is its own array, so changing one padded row in
# place cannot affect the others.
#
# @raise [RuntimeError] when paddings[rank] does not have exactly two elements
def arr_pad(arr, paddings, data_type = :float32, rank = 0)
  raise "padding #{paddings[rank]} needs to have two elements [before, after]" if paddings[rank].size != 2

  before, after = paddings[rank]
  pad_value = fp_type?(data_type) ? 0.0 : 0

  if arr[0].is_a?(Array)
    padded_rows = arr.map { |row| arr_pad(row, paddings, data_type, rank + 1) }
    template = padded_rows[0]
    new_pad_row = -> { deep_dup_array(template, pad_value) }
    Array.new(before) { new_pad_row.call } + padded_rows + Array.new(after) { new_pad_row.call }
  else
    Array.new(before) { pad_value } + arr + Array.new(after) { pad_value }
  end
end
|
889
|
+
|
890
|
+
# Recursively copies a nested array.
#
# When +value+ is given (non-nil), every leaf is replaced by it, which yields
# an array of the same structure filled with +value+. A non-array input is
# returned as is, or replaced by +value+ when one is given.
def deep_dup_array(arr, value = nil)
  return arr.map { |item| deep_dup_array(item, value) } if arr.is_a?(Array)
  return arr if value.nil?

  value
end
|
899
|
+
|
900
|
+
# Expands a non-array +mat+ into a nested array so it can be combined with
# +mat_b+.
#
# The generated array has the reversed shape of +mat_b+ and is filled with
# +mat+ converted with to_i when tensor.data_type is :int32, and with to_f
# otherwise. An Array +mat+ is returned unchanged.
def matmul_const_transform(mat, mat_b, tensor)
  return mat if mat.is_a?(Array)

  compat_shape = shape_eval(mat_b).reverse
  fill = -> { tensor.data_type == :int32 ? mat.to_i : mat.to_f }

  generate_vector(compat_shape, generator: fill)
end
|
910
|
+
|
911
|
+
# Determines which axes of +vector_shape1+ have to be reduced so that it
# matches +vector_shape2+.
#
# The two shapes are compared from the last dimension backwards. An axis of
# shape1 is reported when shape2 has no corresponding dimension, or when its
# dimension there is smaller than shape1's. Axes are listed from the last one
# to the first. Identical shapes need no reduction and give [].
#
# A shape that is not an Array is treated as rank 0 (an empty shape), which
# matches how _rank_from_shape ranks it.
#
# @param vector_shape1 [Array<Integer>] shape to reduce
# @param vector_shape2 [Array<Integer>] shape to reduce towards
# @param level [Object] not used by this implementation
# @return [Array<Integer>] axis indices of +vector_shape1+
def _broadcast_gradient_op(vector_shape1, vector_shape2, level)
  return [] if vector_shape1 == vector_shape2 # same shape so no reductions

  shape1 = vector_shape1.is_a?(Array) ? vector_shape1 : []
  shape2_r = (vector_shape2.is_a?(Array) ? vector_shape2 : []).reverse
  va_rank = shape1.size

  shape1.reverse.each_with_index.map do |s, index|
    axis = va_rank - index - 1
    next axis if index >= shape2_r.size
    next nil if shape2_r[index] >= s

    axis
  end.compact
end
|
926
|
+
|
927
|
+
# Selects the variant of a CL program depending on the operand shapes.
#
# Returns [first, second, program_name, switched] where +switched+ is 1 when
# the operands were swapped and 0 otherwise:
# * "<op>"   - both shapes are identical
# * "<op>_c" - one operand holds a single element (empty shape or a shape
#              whose product is 1); that operand is placed second
# * "<op>_b" - broadcast variant; the operand with the lower rank, or with a
#              smaller dimension at equal rank, is placed second
#
# The single-element test for B now inspects input_b's shape. It previously
# re-tested input_a, so a B of shape [1] or [1, 1] never took the "_c" path.
def select_program(input_a, input_b, op)
  return [input_a, input_b, op.to_s, 0] if input_a.shape == input_b.shape

  single_element = ->(shape) { shape.empty? || shape.reduce(:*) == 1 }

  return [input_b, input_a, "#{op}_c", 1] if single_element.call(input_a.shape) # A is scalar?
  return [input_a, input_b, "#{op}_c", 0] if single_element.call(input_b.shape) # B is scalar?

  return [input_b, input_a, "#{op}_b", 1] if input_a.shape.size < input_b.shape.size

  if input_a.shape.size == input_b.shape.size
    input_a.shape.zip(input_b.shape).each do |s1, s2|
      return [input_b, input_a, "#{op}_b", 1] if s1 < s2
    end
  end

  [input_a, input_b, "#{op}_b", 0]
end
|
944
|
+
|
945
|
+
# Rank implied by a shape: the number of entries for an Array, 0 for
# anything else.
def _rank_from_shape(shape)
  return 0 unless shape.is_a?(Array)

  shape.size
end
|
948
|
+
|
949
|
+
# Computes the reduction axes needed for the gradient of a broadcast
# operation between +input_a+ and +input_b+.
#
# * both rank 0          -> []
# * only input_b rank 0  -> nil
# * only input_a rank 0  -> _broadcast_gradient_op(input_b, input_a, 0)
# * otherwise            -> _broadcast_gradient_op(input_a, input_b, 0)
#
# _broadcast_gradient_op takes exactly three arguments; the extra `true`
# that used to be passed in the rank-0 branch raised ArgumentError.
def get_broadcast_gradient_args(input_a, input_b)
  rank_b = get_rank(input_b)
  rank_a = get_rank(input_a)

  return [] if rank_b.zero? && rank_a.zero?
  return nil if rank_b.zero?

  if rank_a.zero?
    # ruby scalar
    _broadcast_gradient_op(input_b, input_a, 0)
  elsif rank_a > 0
    _broadcast_gradient_op(input_a, input_b, 0)
  end
end
|
959
|
+
|
960
|
+
# Concatenates all arrays in +values+ along +axis+.
#
# An axis of -1 is resolved to the last axis of the first array (via
# get_rank). The input list itself is left untouched; the previous
# implementation removed its first element with `shift`.
#
# @param values [Array<Array>] arrays to join
# @param axis [Integer] axis to concatenate along
# @return [Array, nil] the combined array, or nil when +values+ is empty
def concat_array(values, axis)
  first, *rest = values
  axis = get_rank(first) - 1 if axis == -1

  rest.reduce(first) { |combined, v| concat(combined, v, axis) }
end
|
969
|
+
|
970
|
+
# Joins nested arrays +a+ and +b+ along +axis+.
#
# At axis 0 the arrays are simply appended; for deeper axes the entries of
# +a+ are paired by position with those of +b+ and joined one level down.
def concat(a, b, axis)
  return a + b if axis.zero?

  a.each_index.map do |position|
    concat(a[position], b[position], axis - 1)
  end
end
|
979
|
+
|
980
|
+
# Resolves +placeholder+ to a concrete value.
#
# * nil stays nil, and anything listed in +retain+ is returned untouched.
# * A Placeholder is looked up in @context by its name (as a symbol); a
#   missing entry raises "missing placeholder <name>".
# * A resolved value that is not a Tensor is uploaded with convert_to_opencl;
#   a Tensor goes through Tensor.cast_dtype with the placeholder's data type.
def resolve_placeholder(placeholder, _execution_context = {})
  return nil if placeholder.nil?
  return placeholder if retain.include?(placeholder)

  var = placeholder
  if placeholder.is_a?(Placeholder)
    var = @context[placeholder.name.to_sym]
    raise "missing placeholder #{placeholder.name}" if var.nil?
  end

  if var.is_a?(Tensor)
    Tensor.cast_dtype(var, placeholder.data_type)
  else
    convert_to_opencl(var, shape_eval(var), data_type: placeholder.data_type, name: placeholder.name)
  end
end
|
995
|
+
|
996
|
+
# Recursively reduces a nested array along the requested axis/axes.
#
# Children are reduced first. A level is then collapsed when +axis+ is nil
# (reduce everything), lists the current axis, or equals it.
#
# +f+ receives the array of already-reduced children of that level, or nil
# when the level is empty. It is only called for levels with zero or with
# more than one entry; a single entry is used as is. The default sums the
# entries with + and returns 0 for an empty level. The previous default was a
# two-argument lambda, which raised ArgumentError because +f+ is always
# called with one argument.
#
# @param keep_dims [Boolean] wrap each reduced value in a one-element array
# @return [Object] the reduced structure, or +val+ itself when it is not an Array
def reduce_axis(current_axis, axis, val, keep_dims, f = ->(values) { values.nil? ? 0 : values.reduce(:+) })
  return val unless val.is_a?(Array)

  r = val.map { |v| reduce_axis(current_axis + 1, axis, v, keep_dims, f) }

  should_reduce_axis = axis.nil? || (axis.is_a?(Array) && axis.include?(current_axis)) || current_axis == axis
  return r unless should_reduce_axis

  reduced_val = if r.size > 1
                  f.call(r)
                elsif r.empty?
                  f.call(nil)
                else
                  r[0]
                end

  keep_dims ? [reduced_val] : reduced_val
end
|
1017
|
+
|
1018
|
+
# Applies a three-argument operation element-wise over three nested arrays.
#
# The structure of +v_a+ drives the traversal: +v_b+ and +v_c+ are indexed at
# the same positions. When +v_a+ is not an Array, +op+ is applied directly to
# the three values. +child_context+ is only forwarded to recursive calls.
def call_3way_vector_op(v_a, v_b, v_c, child_context, op = ->(a, b, c) { a + b + c })
  return op.call(v_a, v_b, v_c) unless v_a.is_a?(Array)

  v_a.each_with_index.map do |item, position|
    call_3way_vector_op(item, v_b[position], v_c[position], child_context, op)
  end
end
|
1032
|
+
|
1033
|
+
# Returns true when every leaf of +arr+ is different from 0.
#
# Arrays and NArrays are walked recursively and the scan stops at the first
# failing entry; an empty container counts as true. Any other value is
# compared against 0 directly.
def all_true?(arr)
  return arr != 0 unless arr.is_a?(Array) || arr.is_a?(NArray)

  arr.each do |entry|
    return false unless all_true?(entry)
  end

  true
end
|
1043
|
+
|
1044
|
+
# Builds a nested array of the given +shape+ whose leaves come from
# successive calls to +generator+.
#
# An Integer shape produces a flat array of that length and an empty shape
# produces a single generated value. +dtype+ is only passed along to the
# recursive calls.
def generate_vector(shape, dtype: :float32, generator:)
  return Array.new(shape) { generator.call } if shape.is_a?(Integer)

  case shape.size
  when 0
    generator.call
  when 1
    Array.new(shape[0]) { generator.call }
  else
    inner_shape = shape[1..shape.size]
    Array.new(shape[0]) { generate_vector(inner_shape, generator: generator, dtype: dtype) }
  end
end
|
1061
|
+
|
1062
|
+
# Picks the Random instance to use for +tensor+.
#
# * graph seed and op seed -> a new Random seeded with (graph seed XOR op seed)
# * graph seed only        -> one Random per graph, stored in @session.randomizer
#                             under the graph's object_id
# * op seed only           -> one Random per operation, stored in
#                             @session.randomizer under tensor.operation
# * neither                -> a new unseeded Random
def _get_randomizer(tensor, seed)
  graph = tensor.graph
  graph_seed = graph.random_seed

  return Random.new(graph_seed ^ seed) if graph_seed && seed

  generators = @session.randomizer if graph_seed || seed

  if graph_seed
    generators[graph.object_id] ||= Random.new(graph_seed)
  elsif seed
    generators[tensor.operation] ||= Random.new(seed)
  else
    Random.new
  end
end
|
1075
|
+
|
1076
|
+
# Writes every entry of @context[:compute_history] to /tmp/intermediates.txt.
#
# Each entry is rendered as its name, "<type> <shape>", source, description,
# a blank line and the value as JSON, framed by dashed separator lines; the
# whole dump sits between a start and an end banner.
def dump_intermediates
  separator = "------------------------------------"
  lines = ["============== start ==================="]

  @context[:compute_history].each do |history|
    lines.push(
      separator,
      history[:name],
      "#{history[:type]} #{history[:shape]}",
      history[:source],
      history[:description],
      "",
      history[:value].to_json,
      separator
    )
  end

  lines << "============== end ====================="
  File.write("/tmp/intermediates.txt", lines.join("\n"))
end
|
1093
|
+
end
|
1094
|
+
end
|
1095
|
+
end
|