tensor_stream 0.6.1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +10 -0
- data/CHANGELOG.md +8 -0
- data/README.md +40 -1
- data/benchmark/benchmark.rb +4 -1
- data/lib/tensor_stream.rb +5 -0
- data/lib/tensor_stream/debugging/debugging.rb +4 -2
- data/lib/tensor_stream/device.rb +2 -1
- data/lib/tensor_stream/evaluator/base_evaluator.rb +43 -32
- data/lib/tensor_stream/evaluator/evaluator.rb +0 -1
- data/lib/tensor_stream/evaluator/opencl/kernels/acos.cl +8 -0
- data/lib/tensor_stream/evaluator/opencl/kernels/apply_gradient.cl +9 -0
- data/lib/tensor_stream/evaluator/opencl/kernels/asin.cl +9 -0
- data/lib/tensor_stream/evaluator/opencl/kernels/floor_mod.cl +3 -0
- data/lib/tensor_stream/evaluator/opencl/kernels/log_softmax.cl +26 -0
- data/lib/tensor_stream/evaluator/opencl/kernels/max.cl +5 -5
- data/lib/tensor_stream/evaluator/opencl/kernels/min.cl +46 -0
- data/lib/tensor_stream/evaluator/opencl/kernels/real_div.cl +3 -0
- data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross.cl +27 -0
- data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross_grad.cl +28 -0
- data/lib/tensor_stream/evaluator/opencl/opencl_buffer.rb +5 -6
- data/lib/tensor_stream/evaluator/opencl/opencl_evaluator.rb +200 -265
- data/lib/tensor_stream/evaluator/operation_helpers/array_ops_helper.rb +4 -8
- data/lib/tensor_stream/evaluator/ruby_evaluator.rb +193 -122
- data/lib/tensor_stream/exceptions.rb +6 -0
- data/lib/tensor_stream/graph.rb +21 -6
- data/lib/tensor_stream/graph_builder.rb +67 -0
- data/lib/tensor_stream/graph_deserializers/protobuf.rb +271 -0
- data/lib/tensor_stream/graph_keys.rb +1 -0
- data/lib/tensor_stream/graph_serializers/pbtext.rb +11 -10
- data/lib/tensor_stream/helpers/op_helper.rb +7 -33
- data/lib/tensor_stream/helpers/string_helper.rb +16 -0
- data/lib/tensor_stream/math_gradients.rb +67 -44
- data/lib/tensor_stream/nn/nn_ops.rb +7 -1
- data/lib/tensor_stream/operation.rb +14 -27
- data/lib/tensor_stream/ops.rb +82 -29
- data/lib/tensor_stream/session.rb +4 -0
- data/lib/tensor_stream/tensor.rb +30 -12
- data/lib/tensor_stream/tensor_shape.rb +1 -1
- data/lib/tensor_stream/train/gradient_descent_optimizer.rb +37 -4
- data/lib/tensor_stream/train/saver.rb +46 -0
- data/lib/tensor_stream/train/utils.rb +37 -0
- data/lib/tensor_stream/trainer.rb +2 -0
- data/lib/tensor_stream/utils.rb +24 -14
- data/lib/tensor_stream/variable.rb +5 -11
- data/lib/tensor_stream/variable_scope.rb +15 -0
- data/lib/tensor_stream/version.rb +1 -1
- data/samples/iris.rb +8 -4
- data/samples/linear_regression.rb +1 -1
- data/samples/multigpu.rb +73 -0
- data/samples/nearest_neighbor.rb +3 -3
- data/tensor_stream.gemspec +1 -1
- data/test_samples/raw_neural_net_sample.rb +4 -1
- metadata +21 -6
data/lib/tensor_stream/evaluator/opencl/kernels/min.cl (new file)

@@ -0,0 +1,46 @@
+// same dimension add floating point op
+% c_dtype = dtype_to_c_type(dtype)
+__kernel void min_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <= B[globalRow * N + globalCol] ? A[globalRow * N + globalCol] : B[globalRow * N + globalCol];
+}
+
+// 1D + Scalar floating point add op
+__kernel void min_c_<%= dtype %>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    if (switch_op == 0) {
+        C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <= B[0] ? A[globalRow * N + globalCol] : B[0];
+    } else {
+        C[globalRow * N + globalCol] = B[0] <= A[globalRow * N + globalCol] ? B[0] : A[globalRow * N + globalCol];
+    }
+}
+
+// 1D + Scalar floating point add op broadcast
+__kernel void min_b_<%= dtype %>_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    int b_m_index = globalRow;
+    int b_n_index = globalCol;
+
+    if ( b_m_index >= M2) {
+        b_m_index = b_m_index % M2;
+    };
+
+    if (b_n_index >= N2) {
+        b_n_index = b_n_index % N2;
+    }
+
+    if (switch_op == 0) {
+        C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <= B[b_m_index * N2 + b_n_index] ? A[globalRow * N + globalCol] : B[b_m_index * N2 + b_n_index];
+    } else {
+        C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] <= A[globalRow * N + globalCol] ? B[b_m_index * N2 + b_n_index] : A[globalRow * N + globalCol];
+    }
+}
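The three kernel variants mirror how the evaluator dispatches binary ops: `min_*` for same-shape operands, `min_c_*` for tensor-versus-scalar, and `min_b_*` for broadcasting against a smaller 2-D operand, with `switch_op` handling reversed argument order. A minimal usage sketch from the Ruby side, assuming the op is exposed as `ts.min` alongside the existing `ts.max`; the values are illustrative, not from the release:

```ruby
require 'tensor_stream'

ts = TensorStream
a = ts.constant([[1.0, 5.0], [3.0, 2.0]])
b = ts.constant([[4.0, 1.0], [3.0, 6.0]])

sess = ts.session
sess.run(ts.min(a, b))                # same shape   => [[1.0, 1.0], [3.0, 2.0]]
sess.run(ts.min(a, ts.constant(2.5))) # scalar path  => [[1.0, 2.5], [2.5, 2.0]]
```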
data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross.cl (new file)

@@ -0,0 +1,27 @@
+// First naive implementation
+% c_dtype = dtype_to_c_type(dtype)
+__kernel void softmax_cross_<%= dtype %>(const int N,
+                                         const __global <%= c_dtype %>* A,
+                                         const __global <%= c_dtype %>* L,
+                                         __global <%= c_dtype %>* C) {
+
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+
+    // Compute a single element (loop over K)
+    <%= c_dtype %> acc = 0.0f;
+    <%= c_dtype %> max = <%= min_value_for(dtype) %>;
+
+    for (int k=0; k<N; k++) {
+        max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
+    }
+
+    for (int k=0; k<N; k++) {
+        acc += exp(A[globalRow*N + k] - max);
+    }
+
+    // Store the result
+    for (int k=0; k < N; k++) {
+        C[globalRow*N + k] = (log(acc) - (A[globalRow*N + k] - max)) * L[globalRow*N + k];
+    }
+}
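Per row of logits a with labels l, the kernel uses the standard max-shift for numerical stability; what it stores per element is the label-weighted negative log-softmax. Written out (a restatement of the code above, not from the release notes):

```latex
m = \max_k a_k, \qquad s = \sum_k e^{a_k - m},
\qquad
C_i = \bigl(\log s - (a_i - m)\bigr)\, l_i \;=\; -\,l_i \log \mathrm{softmax}(a)_i
```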
data/lib/tensor_stream/evaluator/opencl/kernels/softmax_cross_grad.cl (new file)

@@ -0,0 +1,28 @@
+// First naive implementation
+% c_dtype = dtype_to_c_type(dtype)
+__kernel void softmax_cross_grad_<%= dtype %>(const int N,
+                                              const __global <%= c_dtype %>* A,
+                                              const __global <%= c_dtype %>* L,
+                                              const __global <%= c_dtype %>* G,
+                                              __global <%= c_dtype %>* C) {
+
+    // Get the index of the current element to be processed
+    const int globalRow = get_global_id(0); // Row ID of C (0..M)
+
+    // Compute a single element (loop over K)
+    <%= c_dtype %> acc = 0.0f;
+    <%= c_dtype %> max = <%= min_value_for(dtype) %>;
+
+    for (int k=0; k<N; k++) {
+        max = A[globalRow*N + k] > max ? A[globalRow*N + k] : max;
+    }
+
+    for (int k=0; k<N; k++) {
+        acc += exp(A[globalRow*N + k] - max);
+    }
+
+    // Store the result
+    for (int k=0; k < N; k++) {
+        C[globalRow*N + k] = ((exp(A[globalRow*N + k] - max)/acc) * G[globalRow*N + k] - L[globalRow*N + k]);
+    }
+}
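The gradient kernel reuses the same max-shifted accumulator; element-wise it computes the softmax times the incoming gradient minus the label, which for an upstream gradient of 1 reduces to the familiar softmax-minus-labels form:

```latex
C_i = \frac{e^{a_i - m}}{s}\, G_i - l_i \;=\; \mathrm{softmax}(a)_i\, G_i - l_i
```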
data/lib/tensor_stream/evaluator/opencl/opencl_buffer.rb

@@ -1,10 +1,11 @@
 module TensorStream
+  # Buffer used by the OpenCL evaluator
   class OpenCLBuffer < Buffer
     include ArrayOpsHelper
 
     attr_accessor :shape, :buffer, :cl_buffer, :op
 
-    def initialize(data_type
+    def initialize(data_type:, shape:, buffer:, cl_buffer:, op: nil, name: nil)
       @data_type = data_type
       @shape = shape
       @buffer = buffer
@@ -25,12 +26,10 @@ module TensorStream
        op.command_queue.finish
        self.dirty = false
      end
-      result = buffer.reshape(*shape.map { |s| s.to_i}.reverse).to_a

-
-
-      end
+      result = buffer.reshape(*shape.map(&:to_i).reverse).to_a
+      result = process_function_op(result, ->(a, _b) { a != 0 }) if data_type == :boolean
      result
    end
  end
-end
+end
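Booleans have no native NArray element type, so the buffer stores them as integers and the `process_function_op` lambda above maps them back on read. A standalone sketch of the round-trip convention (plain Ruby, independent of the gem's code):

```ruby
to_device   = ->(b) { b ? 1 : 0 }  # write side: booleans stored as 0/1
from_device = ->(i) { i != 0 }     # read side: any non-zero reads as true

stored   = [true, false, true].map(&to_device)  # => [1, 0, 1]
restored = stored.map(&from_device)             # => [true, false, true]
```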
data/lib/tensor_stream/evaluator/opencl/opencl_evaluator.rb

@@ -102,6 +102,7 @@ module TensorStream
 
      def complete_eval(tensor, context)
        buffer = _run(tensor, context)
+
        if buffer.is_a?(Array)
          buffer = buffer.collect do |b|
            next b if b.buffer.size.zero?
@@ -109,7 +110,8 @@ module TensorStream
            b
          end
        else
-          return buffer if buffer.
+          return buffer.outputs[0] if buffer.is_a?(OutputGroup)
+          return buffer if buffer.nil?
          return [] if buffer.buffer.nil?
          return buffer if buffer.buffer.size.zero?
          _opencl_queue.enqueue_read_buffer(buffer.cl_buffer, buffer.buffer, event_wait_list: [buffer.op].compact)
@@ -150,13 +152,6 @@ module TensorStream
        @opencl_context = OpenCL.create_context(opencl_device)
      end
 
-      def choose_best_device
-        @best_device ||= begin
-          devices = OpenclEvaluator.query_devices_with_score
-          devices.sort { |a| a[1] }.reverse.first
-        end
-      end
-
      def self.query_devices_with_score
        OpenCL.platforms.flat_map do |p|
 
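With the per-instance `choose_best_device` helper removed, the class-level `query_devices_with_score` remains the entry point for enumerating OpenCL devices. A hedged sketch of picking the highest-scoring device, assuming each entry is a `[device, score]` pair as the method name suggests:

```ruby
devices = TensorStream::Evaluator::OpenclEvaluator.query_devices_with_score
best_device, best_score = devices.max_by { |_device, score| score }
```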
@@ -282,18 +277,82 @@ module TensorStream
        assign_var(tensor, value, context)
      end
 
+      # Fast in place multiply subtract assign
+      register_op :apply_gradient_descent do |_context, tensor, inputs|
+        _target_var, learning_rate, delta = inputs
+
+        assign = tensor.inputs[0] || tensor
+
+        unless assign.buffer
+          value = read_final_result(buffer)
+          assign.buffer = convert_to_opencl(value, buffer.shape, data_type: tensor.data_type, name: assign.name)
+          assign.value = value
+        end
+
+        assign.buffer.dirty = true # force buffer copy when variable is read externally
+        output_buffer = assign.buffer
+
+        m, n = output_buffer.shape
+        work_group = [m || 1, n || 1]
+        cl_m = OpenCL::Int1.new(m || 1)
+        cl_n = OpenCL::Int1.new(n || 1)
+
+        event_wait_list = [assign.buffer.op, learning_rate.op, delta.op].compact # add dependency wait list
+        method_call = :"apply_gradient_#{output_buffer.data_type}"
+        event = _cl_program("apply_gradient", dtype: output_buffer.data_type).send(method_call, _opencl_queue, work_group, cl_m, cl_n, delta.cl_buffer, learning_rate.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+        output_buffer.op = event
+        output_buffer
+      end
+
      %i[less less_equal greater greater_equal equal not_equal logical_and].each do |op|
        register_op op, noop: true do |context, tensor, inputs|
          execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1], context, 'cond')
        end
      end
 
-      %i[max add div sub mod mul pow sigmoid_grad squared_difference].each do |op|
+      %i[max min add real_div div sub floor_mod mod mul pow sigmoid_grad squared_difference].each do |op|
        register_op op, noop: true do |context, tensor, inputs|
          execute_2_operand_func(op.to_s, tensor, inputs[0], inputs[1], context)
        end
      end
 
+      register_op :add_n do |_context, tensor, inputs|
+        if inputs.size == 1
+          inputs[0]
+        else
+          m, n = inputs[0].shape
+          work_group = [m || 1, n || 1]
+          cl_m = OpenCL::Int1.new(m || 1)
+          cl_n = OpenCL::Int1.new(n || 1)
+          cl_switch = OpenCL::Int1.new(0)
+          dtype = tensor.data_type
+
+          output_buffer = _create_result_buffer(tensor.data_type, inputs[0].shape, "out_#{tensor.name}")
+          inputs_queue = inputs.dup
+          a = inputs_queue.pop
+          until inputs_queue.empty?
+            b = inputs_queue.pop
+            event_wait_list = [a.op, b.op].compact
+            method_call = :"add_#{a.data_type}_#{b.data_type}"
+            event = _cl_program('add', a: a.data_type, b: b.data_type, dtype: dtype).send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+            a = output_buffer
+            a.op = event
+          end
+
+          output_buffer.op = a.op
+          output_buffer
+        end
+      end
+
+      register_op :expand_dims, buffer: true do |_context, tensor, inputs|
+        axis = inputs[1].buffer[0]
+        shape = inputs[0].shape.dup
+        axis = -axis if axis == shape.size
+        new_shape = shape.insert(axis, 1).compact
+        new_buf = inputs[0].buffer.reshape(*new_shape.reverse)
+        convert_to_opencl(new_buf, new_shape, data_type: inputs[0].data_type, name: tensor.name)
+      end
+
      register_op :floor_div, noop: true do |context, tensor, inputs|
        if fp_type?(tensor.data_type)
          execute_2_operand_func('floor_div', tensor, inputs[0], inputs[1], context)
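`apply_gradient_descent` fuses the standard update `var ← var − learning_rate × delta` into one kernel launch instead of separate multiply, subtract, and assign ops. A sketch of the training loop that exercises it, patterned after the gem's linear-regression sample; the names and values here are illustrative, not from this diff:

```ruby
require 'tensor_stream'

tf = TensorStream # the gem mirrors the TensorFlow API

w = tf.variable(0.0, name: 'weight')
x = tf.placeholder(:float32)
y = tf.placeholder(:float32)

loss  = tf.square(w * x - y)
train = TensorStream::Train::GradientDescentOptimizer.new(0.01).minimize(loss)

sess = tf.session
sess.run(tf.global_variables_initializer)
100.times { sess.run(train, feed_dict: { x => 3.0, y => 6.0 }) }
puts sess.run(w) # converges toward 2.0
```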
@@ -307,7 +366,7 @@ module TensorStream
        execute_cond_func('where', tensor, pred, inputs[0], inputs[1], context)
      end
 
-      register_op :
+      register_op :mat_mul do |_context, tensor, inputs|
        a, b = inputs
 
        m = a.shape[0]
@@ -355,7 +414,7 @@ module TensorStream
        end
      end
 
-      %i[sign exp tan sin cos abs sqrt negate square reciprocal tanh tanh_grad sigmoid log1p round floor ceil].each do |op|
+      %i[sign exp tan acos asin sin cos abs sqrt negate square reciprocal tanh tanh_grad sigmoid log1p round floor ceil].each do |op|
        register_op op, noop: true do |context, tensor, inputs|
          execute_func(op.to_s, tensor, inputs[0], context)
        end
@@ -377,6 +436,57 @@ module TensorStream
        output_buffer
      end
 
+      register_op :log_softmax do |_context, tensor, inputs|
+        a = inputs[0] # logits
+        event_wait_list = [a.op].compact
+        dtype = tensor.data_type
+        output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
+
+        m, n = a.shape
+        work_group = [m]
+        n = m if n.nil?
+        cl_n = OpenCL::Int1.new(n || 1)
+
+        event = _cl_program("log_softmax", dtype: dtype).send(:"log_softmax_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+        output_buffer.op = event
+        output_buffer
+      end
+
+      register_op :softmax_cross_entropy_with_logits_v2 do |_context, tensor, inputs|
+        a = inputs[0] # logits
+        b = inputs[1] # labels
+        event_wait_list = [a.op, b.op].compact
+        dtype = tensor.data_type
+        output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
+
+        m, n = a.shape
+        work_group = [m]
+        n = m if n.nil?
+        cl_n = OpenCL::Int1.new(n || 1)
+
+        event = _cl_program("softmax_cross", dtype: dtype).send(:"softmax_cross_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+        output_buffer.op = event
+        output_buffer
+      end
+
+      register_op :softmax_cross_entropy_with_logits_v2_grad do |_context, tensor, inputs|
+        a = inputs[0] # logits
+        b = inputs[1] # labels
+        c = inputs[2] # grads
+        event_wait_list = [a.op, b.op, c.op].compact
+        dtype = tensor.data_type
+        output_buffer = _create_result_buffer(tensor.data_type, a.shape, tensor.name)
+
+        m, n = a.shape
+        work_group = [m]
+        n = m if n.nil?
+        cl_n = OpenCL::Int1.new(n || 1)
+
+        event = _cl_program("softmax_cross_grad", dtype: dtype).send(:"softmax_cross_grad_#{dtype}", _opencl_queue, work_group, cl_n, a.cl_buffer, b.cl_buffer, c.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
+        output_buffer.op = event
+        output_buffer
+      end
+
      register_op :softmax_grad do |_context, tensor, inputs|
        a, grad = inputs
 
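A sketch of reaching the new op through the NN module, assuming the keyword-argument form mirrors TensorFlow's `labels:`/`logits:` convention that the gem follows; the values are illustrative:

```ruby
ts = TensorStream

logits = ts.constant([[2.0, 1.0, 0.1]])
labels = ts.constant([[1.0, 0.0, 0.0]])

loss = ts.nn.softmax_cross_entropy_with_logits_v2(labels: labels, logits: logits)
ts.session.run(ts.reduce_sum(loss)) # ~0.417 for this row
```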
@@ -417,30 +527,6 @@ module TensorStream
        end
      end
 
-      register_op :truncate do |_context, tensor, inputs|
-        a, b = inputs
-        if a.shape.size.zero?
-          a
-        else
-          input_b = read_final_result(b)
-          if a.shape == input_b
-            a
-          else
-            input_a = read_final_result(a)
-            if input_b == []
-              if a.buffer.size == 1
-                a.shape = input_b
-                a
-              else
-                wrap_opencl(a.buffer[0], data_type: a.data_type, name: tensor.name)
-              end
-            else
-              wrap_opencl(truncate(input_a, input_b), data_type: a.data_type, name: tensor.name)
-            end
-          end
-        end
-      end
-
      register_op :print do |context, tensor, inputs|
        a, b = inputs
        input_b = complete_eval(b, context)
@@ -475,23 +561,27 @@ module TensorStream
        convert_to_opencl(transposed.flatten, transposed.shape.reverse, data_type: inputs[0].data_type, name: tensor.name)
      end
 
-      register_op :index,
-        a = inputs[0]
-
-        index = read_final_result(inputs[1])
+      register_op :index, noop: true do |context, tensor, inputs|
+        a = _run(inputs[0], context)
+        index = read_final_result(_run(inputs[1], context))
 
-        if a.is_a?(
-          a[index]
+        if a.is_a?(OutputGroup)
+          a.outputs[index]
        else
-
-
-
+          if a.is_a?(Array)
+            a[index]
+          else
+            new_shape = a.shape.dup
+            new_shape.shift
+            input_a = read_final_result(a)
+            convert_to_opencl(input_a[index], new_shape, data_type: a.data_type, name: tensor.name)
+          end
        end
      end
 
      register_op :broadcast_gradient_args, buffer: true do |_context, tensor, inputs|
        rx, ry = get_broadcast_gradient_args(inputs[0].buffer.to_a, inputs[1].buffer.to_a)
-        [
+        OutputGroup.new([wrap_opencl(rx, data_type: :int32, name: "#{tensor.name}"), wrap_opencl(ry, data_type: :int32, name: "#{tensor.name}:1")])
      end
 
      register_op :shape do |_context, tensor, inputs|
@@ -537,6 +627,9 @@ module TensorStream
 
      register_op :argmin, buffer: true do |_context, tensor, inputs|
        axis = tensor.options[:axis] || 0
+        rank = inputs[0].shape.size
+        raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
+
        arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
        op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a < b })
        convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
@@ -544,6 +637,9 @@ module TensorStream
 
      register_op :argmax, buffer: true do |_context, tensor, inputs|
        axis = tensor.options[:axis] || 0
+        rank = inputs[0].shape.size
+        raise TensorStream::InvalidArgumentError, "Expected dimension in the range [#{-rank},#{rank}) but got #{axis}" if axis < -rank || axis >= rank
+
        arr = inputs[0].buffer.reshape(*inputs[0].shape.reverse).to_a
        op = get_op_with_axis(arr, axis, 0, inputs[0].data_type, ->(a, b) { a > b })
        convert_to_opencl(op, shape_eval(op), data_type: tensor.data_type, name: tensor.name)
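The new guard matches TensorFlow's error text for out-of-range reduction axes, raising the `TensorStream::InvalidArgumentError` added to `exceptions.rb` in this release. A sketch of the behavior callers now see when this evaluator handles the op (values assumed):

```ruby
ts = TensorStream
a  = ts.constant([[1.0, 9.0], [4.0, 3.0]])

ts.session.run(ts.argmax(a, 1)) # => [1, 0]

begin
  ts.session.run(ts.argmax(a, 2)) # rank is 2, so valid axes are [-2, 2)
rescue TensorStream::InvalidArgumentError => e
  puts e.message # Expected dimension in the range [-2,2) but got 2
end
```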
@@ -559,7 +655,7 @@ module TensorStream
      # puts "#{tensor.to_math(true,1)} = #{read_final_result(complete_eval(result, child_context))}"
      if tensor.breakpoint
        a = resolve_placeholder(tensor.inputs[0], child_context) if tensor.inputs && tensor.inputs[0]
-        b = resolve_placeholder(tensor.inputs[1], child_context) if tensor.inputs && tensor.inputs[1]
+        b = resolve_placeholder(tensor.inputs[1], child_context) if tensor.inputs && tensor.inputs[1]
        a = read_final_result(complete_eval(a, child_context))
        b = read_final_result(complete_eval(b, child_context))
        result = read_final_result(complete_eval(result, child_context))
@@ -581,6 +677,8 @@ module TensorStream
      end
    rescue EvaluatorExcecutionException => e
      raise e
+    rescue TensorStreamError => e
+      raise e
    rescue StandardError => e
      _opencl_queue.finish # dump queue
      puts e.message
@@ -614,7 +712,8 @@ module TensorStream
      else
        wrap_opencl(tensor, name: tensor.name)
      end
-      @context[:_cache][cache_key] =
+      @context[:_cache][cache_key] = @context[cache_key] if tensor.is_const
+      @context[cache_key]
    end
 
    private
@@ -625,11 +724,11 @@ module TensorStream
 
      if assign.buffer
        # buffer = type_cast(buffer, assign.data_type, name: "#{tensor.name}/cast_#{tensor.name}_#{tensor.data_type}")
-        if assign.buffer.cl_buffer != buffer.cl_buffer
-
-
-
-
+        assign.buffer.op = if assign.buffer.cl_buffer != buffer.cl_buffer
+          _opencl_queue.enqueue_copy_buffer(buffer.cl_buffer, assign.buffer.cl_buffer, event_wait_list: [buffer.op, assign.buffer.op])
+        else
+          buffer.op
+        end
      else
        value = read_final_result(buffer)
        assign.buffer = convert_to_opencl(value, buffer.shape, data_type: tensor.data_type, name: assign.name)
@@ -660,12 +759,12 @@ module TensorStream
      method_call = :"#{prog}_#{a.data_type}_#{b.data_type}"
      event = if prog == "#{op_name}_b"
        cl_m_b, cl_n_b = if b.shape.size == 2
-
-
-
-
-
-
+          [OpenCL::Int1.new(b.shape[0]), OpenCL::Int1.new(b.shape[1])]
+        elsif b.shape.size == 1
+          [OpenCL::Int1.new(1), OpenCL::Int1.new(b.shape[0])]
+        else
+          raise "rank > 2 not supported!"
+        end
        _cl_program("#{prog_name || op_name}", a: a.data_type, b: b.data_type, dtype: dtype).send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_m_b, cl_n_b, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
      else
        _cl_program("#{prog_name || op_name}", a: a.data_type, b: b.data_type, dtype: dtype).send(method_call, _opencl_queue, work_group, cl_m, cl_n, cl_switch, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
@@ -691,7 +790,7 @@ module TensorStream
      cl_n = OpenCL::Int1.new(n || 1)
 
      event_wait_list = [a.op, b.op, p.op].compact # add dependency wait list
-      output_buffer.op = _cl_program(
+      output_buffer.op = _cl_program(op_name.to_s, dtype: dtype).send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, p.cl_buffer, a.cl_buffer, b.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
      output_buffer
    end
 
@@ -706,7 +805,7 @@ module TensorStream
      cl_m = OpenCL::Int1.new(m || 1)
      cl_n = OpenCL::Int1.new(n || 1)
 
-      event = _cl_program(
+      event = _cl_program(op_name.to_s, dtype: dtype).send(:"#{op_name}_#{dtype}", _opencl_queue, work_group, cl_m, cl_n, a.cl_buffer, output_buffer.cl_buffer, event_wait_list: event_wait_list)
      output_buffer.op = event
      output_buffer
    end
@@ -741,60 +840,58 @@ module TensorStream
 
    def wrap_opencl(tensor, data_type: nil, name: nil)
      value, shape = if tensor.is_a?(Tensor)
-
-
-
-
+        [tensor.value, tensor.shape.shape]
+      else
+        [tensor, shape_eval(tensor)]
+      end
 
      convert_to_opencl(value, shape, data_type: data_type || tensor.data_type, name: name)
    end
 
    def convert_to_opencl(value, shape, data_type: nil, name: nil)
-      if !value.is_a?(Array) && !value.is_a?(NArray)
-        value = [value]
-      end
+      value = [value] if !value.is_a?(Array) && !value.is_a?(NArray)
 
      cache_key = "_cl_object_#{name}:#{shape.join('_')}:#{object_id}"
-      cl_object =
+      cl_object = if name && @context[:_cache][cache_key]
        @context[:_cache][cache_key]
-
-
+      else
+        narray_size = shape.reduce(:*) || 1
 
-
-
-
-
-
+        buffer = if value.is_a?(NArray)
+          value
+        else
+          allocate_narray_for_type(data_type, narray_size)
+        end
 
-
+        cl_buffer_size = shape.empty? ? 1 : shape.reduce(:*)
 
-
-
-
-
-        nil
-      end
+        cl_buffer = unless value.flatten.empty?
+          cl_buffer_size = 1 if cl_buffer_size.zero?
+          _opencl_context.create_buffer(cl_buffer_size * buffer.element_size)
+        end
 
-
-
+        @context[:_cache][cache_key] = OpenCLBuffer.new(name: name, data_type: data_type, shape: shape, buffer: buffer, cl_buffer: cl_buffer)
+      end
 
      if value.is_a?(Array)
        value.flatten.each_with_index do |element, index|
-          if element.is_a?(Tensor)
-
-
-
-
+          cl_object.buffer[index] = if element.is_a?(Tensor)
+            read_final_result(complete_eval(element, {}))
+          elsif data_type == :boolean
+            element ? 1 : 0
+          else
+            Tensor.cast_dtype(element, data_type)
+          end
        end
      elsif value.is_a?(NArray)
        cl_object.buffer = value
+      elsif data_type == :boolean
+        cl_object.buffer[0] = element ? 1 : 0
      else
-        cl_object.buffer[0] =
+        cl_object.buffer[0] = Tensor.cast_dtype(value, data_type)
      end
 
-      write_op = if cl_object.cl_buffer && !value.nil? && (!value.is_a?(Array) || !value.empty?)
-        _opencl_queue.enqueue_write_buffer(cl_object.cl_buffer, cl_object.buffer)
-      end
+      write_op = _opencl_queue.enqueue_write_buffer(cl_object.cl_buffer, cl_object.buffer) if cl_object.cl_buffer && !value.nil? && (!value.is_a?(Array) || !value.empty?)
      cl_object.op = write_op
      cl_object
    end
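`convert_to_opencl` now memoizes one `OpenCLBuffer` per tensor name, shape, and evaluator instance, so repeated conversions of the same named tensor reuse a single device allocation. The cache key, reproduced in isolation with illustrative values:

```ruby
name  = 'Const'
shape = [2, 2]
cache_key = "_cl_object_#{name}:#{shape.join('_')}:#{object_id}"
# => "_cl_object_Const:2_2:<evaluator object_id>"
```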
@@ -861,7 +958,7 @@ module TensorStream
 
    def _reduced_shape(input_shape, axes)
      return [] if axes.nil? # reduce to scalar
-      axes = [
+      axes = [axes] unless axes.is_a?(Array)
      return input_shape if axes.empty?
 
      axes.each do |dimen|
@@ -882,8 +979,7 @@ module TensorStream
      rank = input.shape.size - 1
 
      if axis.is_a?(Array)
-        axis.map{ |x| rank - x.abs }.sort.
-
+        axis.map { |x| rank - x.abs }.sort.reverse_each do |x|
          value = value.send(func, x.to_i)
        end
      else
@@ -891,75 +987,21 @@ module TensorStream
      end
 
      new_shape = if value.is_a?(NArray)
-
-
-
-
-
+        value.shape.reverse
+      else
+        value = [value]
+        []
+      end
 
-      if tensor.options[:keepdims]
-        new_shape = _reduced_shape(input.shape.dup, axis)
-      end
+      new_shape = _reduced_shape(input.shape.dup, axis) if tensor.options[:keepdims]
 
      convert_to_opencl(value.flatten, new_shape, data_type: tensor.data_type, name: tensor.name)
    end
  end
 
-  def arr_pad(arr, paddings, data_type = :float32, rank = 0)
-    raise "padding #{paddings[rank]} needs to have to elements [before, after]" if paddings[rank].size != 2
-
-    before = paddings[rank][0]
-    after = paddings[rank][1]
-    pad_value = fp_type?(data_type) ? 0.0 : 0
-    if arr[0].is_a?(Array)
-      next_dim_elem = arr.collect { |a| arr_pad(a, paddings, data_type, rank + 1) }
-      padding = deep_dup_array(next_dim_elem[0], pad_value)
-      Array.new(before) { padding } + next_dim_elem + Array.new(after) { padding }
-    else
-      Array.new(before) { pad_value } + arr + Array.new(after) { pad_value }
-    end
-  end
-
-  def deep_dup_array(arr, value = nil)
-    if arr.is_a?(Array)
-      arr.dup.collect do |a|
-        deep_dup_array(a, value)
-      end
-    else
-      value.nil? ? arr : value
-    end
-  end
-
-  def matmul_const_transform(mat, mat_b, tensor)
-    if !mat.is_a?(Array)
-      compat_shape = shape_eval(mat_b).reverse
-      func = -> { tensor.data_type == :int32 ? mat.to_i : mat.to_f }
-
-      generate_vector(compat_shape, generator: func)
-    else
-      mat
-    end
-  end
-
-  # determine possible reduction axis to be used
-  def _broadcast_gradient_op(vector_shape1, vector_shape2, level)
-    va_rank = _rank_from_shape(vector_shape1)
-    vb_rank = _rank_from_shape(vector_shape2)
-    return [] if vector_shape1 == vector_shape2 # same shape so no reductions
-
-    shape2_r = vector_shape2.reverse
-
-    vector_shape1.reverse.each_with_index.collect do |s, index|
-      next va_rank - index - 1 if index >= shape2_r.size
-      next nil if shape2_r[index] == s
-      next nil if shape2_r[index] > s
-      va_rank - index - 1
-    end.compact
-  end
-
  # selects variants of cl programs depending on input
  def select_program(input_a, input_b, op)
-    return [input_a, input_b,
+    return [input_a, input_b, op.to_s, 0] if input_a.shape == input_b.shape
 
    return [input_b, input_a, "#{op}_c", 1] if input_a.shape.empty? || input_a.shape.reduce(:*) == 1 # A is scalar?
    return [input_a, input_b, "#{op}_c", 0] if input_b.shape.empty? || input_a.shape.reduce(:*) == 1 # B is scalar?
@@ -979,26 +1021,6 @@ module TensorStream
    shape.is_a?(Array) ? shape.size : 0
  end
 
-  def concat_array(values, axis)
-    combined_array = values.shift
-    axis = get_rank(combined_array) - 1 if axis == -1
-
-    values.each do |v|
-      combined_array = concat(combined_array, v, axis)
-    end
-    combined_array
-  end
-
-  def concat(a, b, axis)
-    if axis.zero?
-      a + b
-    else
-      a.each_with_index.collect do |i, index|
-        concat(i, b[index], axis - 1)
-      end
-    end
-  end
-
  def resolve_placeholder(placeholder, _execution_context = {})
    return nil if placeholder.nil?
 
@@ -1014,43 +1036,6 @@ module TensorStream
    Tensor.cast_dtype(var, placeholder.data_type)
  end
 
-  def reduce_axis(current_axis, axis, val, keep_dims, f = ->(a, b) { a + b })
-    return val unless val.is_a?(Array)
-
-    r = val.collect do |v|
-      reduce_axis(current_axis + 1, axis, v, keep_dims, f)
-    end
-
-    should_reduce_axis = axis.nil? || (axis.is_a?(Array) && axis.include?(current_axis)) || (current_axis == axis)
-
-    if should_reduce_axis
-      reduced_val = r[0]
-      if r.size > 1
-        reduced_val = f.call(r[0..val.size])
-      elsif r.size.zero?
-        reduced_val = f.call(nil)
-      end
-      keep_dims ? [ reduced_val ] : reduced_val
-    else
-      r
-    end
-  end
-
-  # handle 3 tensor math operations
-  def call_3way_vector_op(v_a, v_b, v_c, child_context, op = ->(a, b, c) { a + b + c })
-    return op.call(v_a, v_b, v_c) unless v_a.is_a?(Array)
-
-    v_a.each_with_index.collect do |v1, index|
-      v2 = v_b[index]
-      v3 = v_c[index]
-      if v1.is_a?(Array)
-        call_3way_vector_op(v1, v2, v3, child_context, op)
-      else
-        op.call(v1, v2, v3)
-      end
-    end
-  end
-
  def all_true?(arr)
    if arr.is_a?(Array) || arr.is_a?(NArray)
      arr.each do |a|
@@ -1061,58 +1046,8 @@ module TensorStream
 
      arr != 0
    end
-
-    def generate_vector(shape, dtype: :float32, generator:)
-      if shape.is_a?(Integer)
-        Array.new(shape) do
-          generator.call
-        end
-      elsif shape.size > 1
-        Array.new(shape[0]) do
-          generate_vector(shape[1..shape.size], generator: generator, dtype: dtype)
-        end
-      elsif shape.size == 1
-        Array.new(shape[0]) do
-          generator.call
-        end
-      elsif shape.size.zero?
-        generator.call
-      end
-    end
-
-    def _get_randomizer(tensor, seed)
-      if tensor.graph.random_seed && seed
-        Random.new(tensor.graph.random_seed ^ seed)
-      elsif tensor.graph.random_seed
-        @session.randomizer[tensor.graph.object_id] ||= Random.new(tensor.graph.random_seed)
-        @session.randomizer[tensor.graph.object_id]
-      elsif seed
-        @session.randomizer[tensor.operation] ||= Random.new(seed)
-        @session.randomizer[tensor.operation]
-      else
-        Random.new
-      end
-    end
-
-    def dump_intermediates
-      arr = []
-      arr << "============== start ==================="
-      @context[:compute_history].each_with_index do |history, index|
-        arr << "------------------------------------"
-        arr << history[:name]
-        arr << "#{history[:type]} #{history[:shape]}"
-        arr << history[:source]
-        arr << history[:description]
-        arr << ""
-        arr << history[:value].to_json
-        arr << "------------------------------------"
-      end
-      arr << "============== end ====================="
-      str = arr.join("\n")
-      File.write("/tmp/intermediates.txt", str)
-    end
  end
 end
 end
 
-TensorStream::Evaluator.register_evaluator(TensorStream::Evaluator::OpenclEvaluator, 'opencl', 1)
+TensorStream::Evaluator.register_evaluator(TensorStream::Evaluator::OpenclEvaluator, 'opencl', 1)