tensor_stream-opencl 0.1.3 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +11 -4
- data/benchmark/benchmark.rb +91 -0
- data/benchmark_intel.txt +36 -0
- data/lib/tensor_stream/opencl/array_ops.rb +395 -0
- data/lib/tensor_stream/opencl/images_ops.rb +62 -0
- data/lib/tensor_stream/opencl/kernels/abs.cl +6 -8
- data/lib/tensor_stream/opencl/kernels/acos.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/apply_adadelta.cl +2 -4
- data/lib/tensor_stream/opencl/kernels/apply_adagrad.cl +12 -0
- data/lib/tensor_stream/opencl/kernels/apply_adam.cl +2 -5
- data/lib/tensor_stream/opencl/kernels/apply_centered_rms_prop.cl +19 -0
- data/lib/tensor_stream/opencl/kernels/apply_gradient.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/apply_momentum.cl +2 -4
- data/lib/tensor_stream/opencl/kernels/apply_rms_prop.cl +16 -0
- data/lib/tensor_stream/opencl/kernels/asin.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/ceil.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/concat.cl +21 -0
- data/lib/tensor_stream/opencl/kernels/cos.cl +3 -5
- data/lib/tensor_stream/opencl/kernels/exp.cl +3 -5
- data/lib/tensor_stream/opencl/kernels/floor.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/log.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/log1p.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/negate.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/reciprocal.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/sigmoid.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/sign.cl +7 -8
- data/lib/tensor_stream/opencl/kernels/sin.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/split.cl +17 -0
- data/lib/tensor_stream/opencl/kernels/split_n.cl +18 -0
- data/lib/tensor_stream/opencl/kernels/sqrt.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/square.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/tan.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/tanh.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/tanh_grad.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/unpack.cl +23 -0
- data/lib/tensor_stream/opencl/nn_ops.rb +111 -26
- data/lib/tensor_stream/opencl/opencl_buffer.rb +9 -0
- data/lib/tensor_stream/opencl/opencl_evaluator.rb +129 -172
- data/lib/tensor_stream/opencl/version.rb +1 -1
- data/samples/iris.data +150 -0
- data/samples/iris.rb +110 -0
- data/samples/mnist_data.rb +65 -0
- data/samples/multigpu.rb +73 -0
- data/samples/nearest_neighbor.rb +56 -0
- data/samples/rnn.rb +108 -0
- data/tensor_stream-opencl.gemspec +4 -1
- metadata +62 -3
@@ -0,0 +1,62 @@
|
|
1
|
+
# require 'oily_png'
|
2
|
+
module TensorStream
  module OpenCLHelpers
    # PNG decode/encode ops for the OpenCL evaluator.
    #
    # Including this module registers the :decode_png and :encode_png ops on
    # the including class through that class's +register_op+ DSL.
    module ImagesOps
      def ImagesOps.included(klass)
        klass.class_eval do
          # Decodes a PNG blob into a [height, width, channels] result buffer.
          #
          # tensor.options[:channels] selects the layout: 1 (gray), 3 (RGB) or
          # 4 (RGBA). A missing or zero value falls back to 4.
          # Raises RuntimeError for any other channel count.
          register_op :decode_png do |context, tensor, inputs|
            content = _run(inputs[0], context)
            channels = tensor.options[:channels]
            # Hash lookup yields nil when the option is absent; treat it like 0.
            channels = 4 if channels.nil? || channels.zero?
            # Validate once up front instead of on every pixel.
            raise "Invalid channel value #{channels}" unless [1, 3, 4].include?(channels)

            image = ChunkyPNG::Image.from_blob(content.buffer.to_a.pack('C*'))
            output_buffer = _create_result_buffer(tensor.data_type, [image.height, image.width, channels], "out_#{tensor.name}")

            image.grayscale! if channels == 1

            # Component readers in output order; a single-channel image takes
            # the red component of the grayscaled pixel.
            readers = %i[r g b a].first(channels)
            image.pixels.each_with_index do |pixel, index|
              start_index = index * channels
              readers.each_with_index do |reader, offset|
                output_buffer.buffer[start_index + offset] = ChunkyPNG::Color.public_send(reader, pixel)
              end
            end

            # Push the host-side data to the device and remember the pending
            # write event on the buffer.
            write_op = _opencl_queue.enqueue_write_buffer(output_buffer.cl_buffer, output_buffer.buffer)
            output_buffer.op = write_op
            output_buffer
          end

          # Encodes a [height, width, channels] buffer as a PNG string tensor.
          #
          # Supports 1, 3 or 4 channels; a single channel is replicated into
          # R, G and B. Raises RuntimeError for any other channel count so the
          # failure mode matches :decode_png instead of yielding a blank image.
          register_op :encode_png do |_context, tensor, inputs|
            image_data = inputs[0]
            height, width, channels = image_data.shape
            raise "Invalid channel value #{channels}" unless [1, 3, 4].include?(channels)

            # NOTE(review): this reads image_data.buffer directly; presumably the
            # host copy is already in sync with the device -- confirm with caller.
            image_buffer = image_data.buffer.reshape(*image_data.shape.reverse).to_a

            png = ChunkyPNG::Image.new(width, height)
            image_buffer.each_with_index do |rows, h_index|
              rows.each_with_index do |p_data, w_index|
                png[w_index, h_index] =
                  case channels
                  when 4 then ChunkyPNG::Color.rgba(p_data[0], p_data[1], p_data[2], p_data[3])
                  when 3 then ChunkyPNG::Color.rgb(p_data[0], p_data[1], p_data[2])
                  else ChunkyPNG::Color.rgb(p_data[0], p_data[0], p_data[0])
                  end
              end
            end
            convert_to_opencl(png.to_s, [], data_type: :string, name: tensor.name)
          end
        end
      end
    end
  end
end
|
@@ -1,20 +1,18 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
2
|
% if TensorStream::Ops::FLOATING_POINT_TYPES.include?(dtype)
|
3
|
-
__kernel void abs_<%= dtype%>(
|
3
|
+
__kernel void abs_<%= dtype%>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
4
4
|
// Get the index of the current element to be processed
|
5
|
-
const int
|
6
|
-
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
5
|
+
const int id = get_global_id(0); // Row ID of C (0..M)
|
7
6
|
|
8
|
-
C[
|
7
|
+
C[id] = fabs(A[id]);
|
9
8
|
}
|
10
9
|
% else
|
11
10
|
% %w[int int32].each do |dt|
|
12
|
-
__kernel void abs_<%= dt %>(
|
11
|
+
__kernel void abs_<%= dt %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
13
12
|
// Get the index of the current element to be processed
|
14
|
-
const int
|
15
|
-
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
13
|
+
const int id = get_global_id(0); // Row ID of C (0..M)
|
16
14
|
|
17
|
-
C[
|
15
|
+
C[id] = fabs((float)A[id]);
|
18
16
|
}
|
19
17
|
% end
|
20
18
|
%end
|
@@ -1,8 +1,7 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
|
-
__kernel void acos_<%= dtype %>(
|
2
|
+
__kernel void acos_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
3
3
|
// Get the index of the current element to be processed
|
4
|
-
const int
|
5
|
-
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
4
|
+
const int id = get_global_id(0);
|
6
5
|
|
7
|
-
C[
|
6
|
+
C[id] = acos(A[id]);
|
8
7
|
}
|
@@ -1,6 +1,6 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
2
|
// same dimension add floating point op
|
3
|
-
__kernel void apply_adadelta_<%= dtype %>(
|
3
|
+
__kernel void apply_adadelta_<%= dtype %>(
|
4
4
|
__global const <%= c_dtype %> *lr,
|
5
5
|
__global const <%= c_dtype %> *rho,
|
6
6
|
__global const <%= c_dtype %> *epsilon,
|
@@ -10,9 +10,7 @@
|
|
10
10
|
__global <%= c_dtype %> *acc_update
|
11
11
|
) {
|
12
12
|
// Get the index of the current element to be processed
|
13
|
-
const int
|
14
|
-
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
15
|
-
const int index = globalRow * N + globalCol;
|
13
|
+
const int index = get_global_id(0);
|
16
14
|
|
17
15
|
acc[index] = acc[index] * rho[0] + (grad[index] * grad[index]) * ((<%= c_dtype %>)1 - rho[0]);
|
18
16
|
const <%= c_dtype %> update = sqrt(acc_update[index] + epsilon[0]) * rsqrt(acc[index] + epsilon[0]) * grad[index];
|
@@ -0,0 +1,12 @@
|
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
// Adagrad update, one work-item per element:
//   acc    += grad * grad
//   output -= lr * grad / sqrt(acc)
// lr is a single-element buffer (only element 0 is read); acc and output are
// updated in place.
__kernel void apply_adagrad_<%= dtype %>(
  __global const <%= c_dtype %> *lr,
  __global const <%= c_dtype %> *grad,
  __global <%= c_dtype %> *output,
  __global <%= c_dtype %> *acc
) {
    // Get the index of the current element to be processed
    const int index = get_global_id(0);
    const <%= c_dtype %> g = grad[index];

    // Accumulate the squared gradient before scaling the step; without this
    // the accumulator never changes and the step size never adapts.
    // NOTE(review): confirm the host side does not also add grad^2 into acc.
    acc[index] += g * g;
    output[index] -= g * lr[0] * rsqrt(acc[index]);
}
|
@@ -1,6 +1,6 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
2
|
// same dimension add floating point op
|
3
|
-
__kernel void apply_adam_<%= dtype %>(
|
3
|
+
__kernel void apply_adam_<%= dtype %>(
|
4
4
|
__global const <%= c_dtype %> *grad,
|
5
5
|
__global const <%= c_dtype %> *learning_rate,
|
6
6
|
__global const <%= c_dtype %> *beta1_power,
|
@@ -11,10 +11,7 @@
|
|
11
11
|
__global <%= c_dtype %> *momentum,
|
12
12
|
__global <%= c_dtype %> *output, __global <%= c_dtype %> *v) {
|
13
13
|
// Get the index of the current element to be processed
|
14
|
-
const int
|
15
|
-
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
16
|
-
const int index = globalRow * N + globalCol;
|
17
|
-
|
14
|
+
const int index = get_global_id(0);
|
18
15
|
<%= c_dtype %> alpha = learning_rate[0] * sqrt(1.0 - beta2_power[0]) / (1.0 - beta1_power[0]);
|
19
16
|
|
20
17
|
momentum[index] += (grad[index] - momentum[index]) * (1.0 - beta1[0]);
|
@@ -0,0 +1,19 @@
|
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
// Centered RMSProp update, one work-item per element:
//   ms     += (grad^2 - ms) * (1 - rho)
//   mg     += (grad   - mg) * (1 - rho)
//   mom     = momentum * mom + lr * grad / sqrt(ms - mg^2 + epsilon)
//   output -= mom
// lr, rho, momentum and epsilon are single-element buffers (only element 0 is
// read); ms, mg, mom and output are updated in place.
__kernel void apply_centered_rms_prop_<%= dtype %>(__global const <%= c_dtype %> *lr,
                                                   __global const <%= c_dtype %> *rho,
                                                   __global const <%= c_dtype %> *momentum,
                                                   __global const <%= c_dtype %> *epsilon,
                                                   __global const <%= c_dtype %> *grad,
                                                   __global <%= c_dtype %> *output,
                                                   __global <%= c_dtype %> *ms,
                                                   __global <%= c_dtype %> *mg,
                                                   __global <%= c_dtype %> *mom) {
    // Get the index of the current element to be processed
    const int id = get_global_id(0);
    const <%= c_dtype %> g = grad[id];
    // Typed constant keeps the arithmetic in the kernel's own precision
    // instead of promoting through a double literal.
    const <%= c_dtype %> one_minus_rho = (<%= c_dtype %>)1 - rho[0];

    ms[id] += (g * g - ms[id]) * one_minus_rho;
    // mg is a moving average of the gradient: it must be accumulated (+=),
    // and refreshed before it is used to centre the variance estimate.
    mg[id] += (g - mg[id]) * one_minus_rho;

    const <%= c_dtype %> denom = ms[id] - mg[id] * mg[id] + epsilon[0];
    mom[id] = mom[id] * momentum[0] + (g * lr[0]) / sqrt(denom);
    output[id] -= mom[id];
}
|
@@ -1,9 +1,8 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
2
|
// same dimension add floating point op
|
3
|
-
__kernel void apply_gradient_<%= dtype %>(
|
3
|
+
__kernel void apply_gradient_<%= dtype %>(__global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
|
4
4
|
// Get the index of the current element to be processed
|
5
|
-
const int
|
6
|
-
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
5
|
+
const int id = get_global_id(0);
|
7
6
|
|
8
|
-
C[
|
7
|
+
C[id] -= A[id] * B[0];
|
9
8
|
}
|
@@ -1,11 +1,9 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
2
|
// same dimension add floating point op
|
3
|
-
__kernel void apply_momentum_<%= dtype %>(
|
3
|
+
__kernel void apply_momentum_<%= dtype %>(__global const <%= c_dtype %> *grad, __global const <%= c_dtype %> *learning_rate,
|
4
4
|
__global const <%= c_dtype %> *momentum, __global <%= c_dtype %> *output, __global <%= c_dtype %> *acc) {
|
5
5
|
// Get the index of the current element to be processed
|
6
|
-
const int
|
7
|
-
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
8
|
-
const int index = globalRow * N + globalCol;
|
6
|
+
const int index = get_global_id(0);
|
9
7
|
<%= c_dtype %> acc_m = acc[index];
|
10
8
|
acc[index] = acc_m * momentum[0] + grad[index];
|
11
9
|
<% if nesterov %>
|
@@ -0,0 +1,16 @@
|
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
// RMSProp update, one work-item per element:
//   ms     += (grad^2 - ms) * (1 - rho)
//   mom     = momentum * mom + lr * grad / sqrt(ms + epsilon)
//   output -= mom
// lr, rho, momentum and epsilon are single-element buffers (only element 0 is
// read); ms, mom and output are updated in place.
__kernel void apply_rms_prop_<%= dtype %>(__global const <%= c_dtype %> *lr,
                                          __global const <%= c_dtype %> *rho,
                                          __global const <%= c_dtype %> *momentum,
                                          __global const <%= c_dtype %> *epsilon,
                                          __global const <%= c_dtype %> *grad,
                                          __global <%= c_dtype %> *output,
                                          __global <%= c_dtype %> *ms,
                                          __global <%= c_dtype %> *mom) {
    // Get the index of the current element to be processed
    const int id = get_global_id(0);
    const <%= c_dtype %> g = grad[id];
    // Typed constant keeps the arithmetic in the kernel's own precision
    // instead of promoting through a double literal.
    const <%= c_dtype %> one_minus_rho = (<%= c_dtype %>)1 - rho[0];

    ms[id] += (g * g - ms[id]) * one_minus_rho;
    mom[id] = mom[id] * momentum[0] + (g * lr[0]) / sqrt(ms[id] + epsilon[0]);
    output[id] -= mom[id];
}
|
@@ -1,9 +1,8 @@
|
|
1
1
|
|
2
2
|
% c_dtype = dtype_to_c_type(dtype)
|
3
|
-
__kernel void asin_<%= dtype %>(
|
3
|
+
__kernel void asin_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
4
4
|
// Get the index of the current element to be processed
|
5
|
-
const int
|
6
|
-
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
5
|
+
const int id = get_global_id(0);
|
7
6
|
|
8
|
-
C[
|
7
|
+
C[id] = asin(A[id]);
|
9
8
|
}
|
@@ -1,8 +1,7 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
|
-
__kernel void ceil_<%= dtype %>(
|
2
|
+
__kernel void ceil_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
3
3
|
// Get the index of the current element to be processed
|
4
|
-
const int
|
5
|
-
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
4
|
+
const int id = get_global_id(0);
|
6
5
|
|
7
|
-
C[
|
6
|
+
C[id] = ceil(A[id]);
|
8
7
|
}
|
@@ -0,0 +1,21 @@
|
|
1
|
+
% ctype = dtype_to_c_type(data_type)

// Copies input A into its slot of the concatenated output C, one work-item
// per element of A. The flat index into A is decomposed into per-dimension
// coordinates using the template's divisors, the coordinate on the concat
// axis is shifted by step, and the coordinates are recombined with the
// template's multipliers to address C.
// N and index are not read by the body; they stay in the signature so the
// host-side argument list is unchanged.
__kernel void concat(const int N, const int index, const int step, __global const <%= ctype %> *A, __global <%= ctype %> *C) {
    // Get the index of the current element to be processed
    const int globalCol = get_global_id(0); // flat index into A
    int ptr = globalCol;

    // compute effective coordinates
    // Integer division is exact for these non-negative operands; dividing in
    // float would lose precision once the flat index exceeds 2^24.
    // (divisors are assumed to render as positive integer literals)
<% divisors.each_with_index do |div, dim| %>
  <% if axis == dim %>
    int index_map_<%= dim %> = ptr / <%= div %> + step;
  <% else %>
    int index_map_<%= dim %> = ptr / <%= div %>;
  <% end %>
  <% if dim < divisors.size - 1 %>
    ptr = ptr % <%= div %>;
  <% end %>
<% end %>

    C[<%= multipliers.each_with_index.map { |m, idx| "#{m}*index_map_#{idx}" }.join(' + ') %>] = A[globalCol];
}
|
@@ -1,8 +1,6 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
|
-
__kernel void cos_<%= dtype %>(
|
2
|
+
__kernel void cos_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
3
3
|
// Get the index of the current element to be processed
|
4
|
-
const int
|
5
|
-
|
6
|
-
|
7
|
-
C[globalRow * N + globalCol] = cos(A[globalRow * N + globalCol]);
|
4
|
+
const int id = get_global_id(0);
|
5
|
+
C[id] = cos(A[id]);
|
8
6
|
}
|
@@ -1,8 +1,6 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
|
-
__kernel void exp_<%= dtype %>(
|
2
|
+
__kernel void exp_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
3
3
|
// Get the index of the current element to be processed
|
4
|
-
const int
|
5
|
-
|
6
|
-
|
7
|
-
C[globalRow * N + globalCol] = exp(A[globalRow * N + globalCol]);
|
4
|
+
const int id = get_global_id(0);
|
5
|
+
C[id] = exp(A[id]);
|
8
6
|
}
|
@@ -1,8 +1,7 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
|
-
__kernel void floor_<%= dtype %>(
|
2
|
+
__kernel void floor_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
3
3
|
// Get the index of the current element to be processed
|
4
|
-
const int
|
5
|
-
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
4
|
+
const int id = get_global_id(0);
|
6
5
|
|
7
|
-
C[
|
6
|
+
C[id] = floor(A[id]);
|
8
7
|
}
|
@@ -1,8 +1,7 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
|
-
__kernel void log_<%= dtype %>(
|
2
|
+
__kernel void log_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
3
3
|
// Get the index of the current element to be processed
|
4
|
-
const int
|
5
|
-
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
4
|
+
const int id = get_global_id(0);
|
6
5
|
|
7
|
-
C[
|
6
|
+
C[id] = log(A[id]);
|
8
7
|
}
|
@@ -1,8 +1,7 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
|
-
__kernel void log1p_<%= dtype %>(
|
2
|
+
__kernel void log1p_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
3
3
|
// Get the index of the current element to be processed
|
4
|
-
const int
|
5
|
-
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
4
|
+
const int id = get_global_id(0); // Row ID of C (0..M)
|
6
5
|
|
7
|
-
C[
|
6
|
+
C[id] = log1p(A[id]);
|
8
7
|
}
|
@@ -1,8 +1,7 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
|
-
__kernel void negate_<%= dtype %>(
|
2
|
+
__kernel void negate_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
3
3
|
// Get the index of the current element to be processed
|
4
|
-
const int
|
5
|
-
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
4
|
+
const int id = get_global_id(0); // Row ID of C (0..M)
|
6
5
|
|
7
|
-
C[
|
6
|
+
C[id] = -A[id];
|
8
7
|
}
|
@@ -1,8 +1,7 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
|
-
__kernel void reciprocal_<%= dtype %>(
|
2
|
+
__kernel void reciprocal_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
3
3
|
// Get the index of the current element to be processed
|
4
|
-
const int
|
5
|
-
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
4
|
+
const int id = get_global_id(0);
|
6
5
|
|
7
|
-
C[
|
6
|
+
C[id] = 1 / A[id];
|
8
7
|
}
|
@@ -1,9 +1,8 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
2
|
|
3
|
-
__kernel void sigmoid_<%= dtype %>(
|
3
|
+
__kernel void sigmoid_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
4
4
|
// Get the index of the current element to be processed
|
5
|
-
const int
|
6
|
-
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
5
|
+
const int id = get_global_id(0);
|
7
6
|
|
8
|
-
C[
|
7
|
+
C[id] = 1.0f/(1.0f + exp(-A[id]));
|
9
8
|
}
|
@@ -1,21 +1,20 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
2
|
|
3
|
-
__kernel void sign_<%= dtype %>(
|
3
|
+
__kernel void sign_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
4
4
|
// Get the index of the current element to be processed
|
5
|
-
const int
|
6
|
-
|
7
|
-
<%= c_dtype %> value = A[globalRow * N + globalCol];
|
5
|
+
const int id = get_global_id(0);
|
6
|
+
<%= c_dtype %> value = A[id];
|
8
7
|
% if floating_point?(dtype)
|
9
8
|
if (isnan(value) || value == 0.0f) {
|
10
|
-
C[
|
9
|
+
C[id] = 0.0;
|
11
10
|
} else {
|
12
|
-
C[
|
11
|
+
C[id] = value < 0 ? -1.0 : 1.0;
|
13
12
|
}
|
14
13
|
% else
|
15
14
|
if (value == 0) {
|
16
|
-
C[
|
15
|
+
C[id] = 0;
|
17
16
|
} else {
|
18
|
-
C[
|
17
|
+
C[id] = value < 0 ? -1 : 1;
|
19
18
|
}
|
20
19
|
% end
|
21
20
|
}
|
@@ -1,9 +1,8 @@
|
|
1
1
|
|
2
2
|
% c_dtype = dtype_to_c_type(dtype)
|
3
|
-
__kernel void sin_<%= dtype %>(
|
3
|
+
__kernel void sin_<%= dtype %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
|
4
4
|
// Get the index of the current element to be processed
|
5
|
-
const int
|
6
|
-
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
5
|
+
const int id = get_global_id(0); // Row ID of C (0..M)
|
7
6
|
|
8
|
-
C[
|
7
|
+
C[id] = sin(A[id]);
|
9
8
|
}
|
@@ -0,0 +1,17 @@
|
|
1
|
+
% ctype = dtype_to_c_type(data_type)
% mul_str = mul.each_with_index.map { |factor, dim| "#{factor} * index_map_#{dim}" }
// Gathers one piece of a split from A into C. Global id 0 selects the piece,
// global id 1 is the flat index inside that piece; pieces are laid out back
// to back in C with a stride of N elements. The in-piece index is decomposed
// into coordinates using the template's dest divisors, the coordinate on the
// split axis is shifted by piece * step, and the result is recombined with
// the template's mul factors to address A.
__kernel void split(const int N, __global const <%= ctype %> *A, __global <%= ctype %> *C) {
    // Get the index of the current element to be processed
    const int globalCol = get_global_id(0); // which piece
    const int localCol = get_global_id(1);  // flat index inside the piece

    // compute effective coordinates
    // Integer division is exact for these non-negative operands; dividing in
    // float would lose precision once the flat index exceeds 2^24.
    // (dest divisors are assumed to render as positive integer literals)
    int ptr = localCol;
<% dest.each_with_index do |div, dim| %>
  <% if dim == axis %>
    int index_map_<%= dim %> = ptr / <%= div %> + globalCol * <%= step %>;
  <% else %>
    int index_map_<%= dim %> = ptr / <%= div %>;
  <% end %>
  <% if dim < dest.size - 1 %>
    ptr = ptr % <%= div %>;
  <% end %>
<% end %>
    C[N*globalCol + localCol] = A[<%= mul_str.join(" + ") %>];
}
|