tensor_stream-opencl 0.1.3 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +11 -4
- data/benchmark/benchmark.rb +91 -0
- data/benchmark_intel.txt +36 -0
- data/lib/tensor_stream/opencl/array_ops.rb +395 -0
- data/lib/tensor_stream/opencl/images_ops.rb +62 -0
- data/lib/tensor_stream/opencl/kernels/abs.cl +6 -8
- data/lib/tensor_stream/opencl/kernels/acos.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/apply_adadelta.cl +2 -4
- data/lib/tensor_stream/opencl/kernels/apply_adagrad.cl +12 -0
- data/lib/tensor_stream/opencl/kernels/apply_adam.cl +2 -5
- data/lib/tensor_stream/opencl/kernels/apply_centered_rms_prop.cl +19 -0
- data/lib/tensor_stream/opencl/kernels/apply_gradient.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/apply_momentum.cl +2 -4
- data/lib/tensor_stream/opencl/kernels/apply_rms_prop.cl +16 -0
- data/lib/tensor_stream/opencl/kernels/asin.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/ceil.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/concat.cl +21 -0
- data/lib/tensor_stream/opencl/kernels/cos.cl +3 -5
- data/lib/tensor_stream/opencl/kernels/exp.cl +3 -5
- data/lib/tensor_stream/opencl/kernels/floor.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/log.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/log1p.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/negate.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/reciprocal.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/sigmoid.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/sign.cl +7 -8
- data/lib/tensor_stream/opencl/kernels/sin.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/split.cl +17 -0
- data/lib/tensor_stream/opencl/kernels/split_n.cl +18 -0
- data/lib/tensor_stream/opencl/kernels/sqrt.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/square.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/tan.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/tanh.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/tanh_grad.cl +3 -4
- data/lib/tensor_stream/opencl/kernels/unpack.cl +23 -0
- data/lib/tensor_stream/opencl/nn_ops.rb +111 -26
- data/lib/tensor_stream/opencl/opencl_buffer.rb +9 -0
- data/lib/tensor_stream/opencl/opencl_evaluator.rb +129 -172
- data/lib/tensor_stream/opencl/version.rb +1 -1
- data/samples/iris.data +150 -0
- data/samples/iris.rb +110 -0
- data/samples/mnist_data.rb +65 -0
- data/samples/multigpu.rb +73 -0
- data/samples/nearest_neighbor.rb +56 -0
- data/samples/rnn.rb +108 -0
- data/tensor_stream-opencl.gemspec +4 -1
- metadata +62 -3
@@ -0,0 +1,62 @@
|
|
1
|
+
# require 'oily_png'
|
2
|
+
module TensorStream
  module OpenCLHelpers
    # Registers the PNG image ops (:decode_png and :encode_png) on any class
    # that includes this module. Decoding/encoding itself is done in Ruby via
    # ChunkyPNG; decode_png then enqueues a write of the result buffer.
    module ImagesOps
      def ImagesOps.included(klass)
        # Channel counts handled by both ops: 1 (grayscale), 3 (RGB), 4 (RGBA).
        # Kept as a local so the op blocks below capture it by closure.
        supported_channels = [1, 3, 4].freeze

        klass.class_eval do
          # Decodes the PNG bytes produced by inputs[0] into a buffer of shape
          # [height, width, channels]. A missing or zero :channels option
          # selects 4 channels.
          #
          # Raises RuntimeError ("Invalid channel value N") for any other
          # channel count.
          register_op :decode_png do |context, tensor, inputs|
            content = _run(inputs[0], context)
            channels = tensor.options[:channels]
            channels = 4 if channels.nil? || channels.zero?
            # Reject bad channel counts before decoding or allocating anything.
            raise "Invalid channel value #{channels}" unless supported_channels.include?(channels)

            image = ChunkyPNG::Image.from_blob(content.buffer.to_a.pack('C*'))
            output_buffer = _create_result_buffer(tensor.data_type, [image.height, image.width, channels], "out_#{tensor.name}")

            image.grayscale! if channels == 1
            # 1 channel -> r only (image was converted to grayscale above),
            # 3 channels -> r, g, b and 4 channels -> r, g, b, a.
            components = %i[r g b a].first(channels)
            image.pixels.each_with_index do |pixel, index|
              start_index = index * channels
              components.each_with_index do |component, offset|
                output_buffer.buffer[start_index + offset] = ChunkyPNG::Color.public_send(component, pixel)
              end
            end

            write_op = _opencl_queue.enqueue_write_buffer(output_buffer.cl_buffer, output_buffer.buffer)
            output_buffer.op = write_op
            output_buffer
          end

          # Encodes inputs[0], whose shape is read as [height, width, channels],
          # into a PNG string and wraps it with convert_to_opencl.
          #
          # Raises RuntimeError ("Invalid channel value N") when the channel
          # count is not 1, 3 or 4, instead of silently emitting a blank image.
          register_op :encode_png do |_context, tensor, inputs|
            image_data = inputs[0]
            height, width, channels = image_data.shape
            raise "Invalid channel value #{channels}" unless supported_channels.include?(channels)

            image_buffer = image_data.buffer.reshape(*image_data.shape.reverse).to_a

            png = ChunkyPNG::Image.new(width, height)
            image_buffer.each_with_index do |rows, h_index|
              rows.each_with_index do |p_data, w_index|
                png[w_index, h_index] =
                  case channels
                  when 4 then ChunkyPNG::Color.rgba(p_data[0], p_data[1], p_data[2], p_data[3])
                  when 3 then ChunkyPNG::Color.rgb(p_data[0], p_data[1], p_data[2])
                  else ChunkyPNG::Color.rgb(p_data[0], p_data[0], p_data[0]) # single channel replicated to RGB
                  end
              end
            end
            convert_to_opencl(png.to_s, [], data_type: :string, name: tensor.name)
          end
        end
      end
    end
  end
end
|
@@ -1,20 +1,18 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
% if TensorStream::Ops::FLOATING_POINT_TYPES.include?(dtype)
// Element-wise absolute value for floating point buffers: C[i] = |A[i]|.
__kernel void abs_<%= dtype%>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
    // A and C are indexed directly by the dimension-0 global id
    const int id = get_global_id(0);

    C[id] = fabs(A[id]);
}
% else
% %w[int int32].each do |dt|
// Element-wise absolute value for integer buffers (emitted as abs_int and abs_int32).
__kernel void abs_<%= dt %>(__global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
    // A and C are indexed directly by the dimension-0 global id
    const int id = get_global_id(0);

    // Use the integer abs() built-in: a round trip through float cannot
    // represent every 32-bit integer exactly (only values up to 2^24).
    C[id] = abs(A[id]);
}
% end
% end
|
@@ -1,8 +1,7 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
// Element-wise arc cosine: C[i] = acos(A[i]).
__kernel void acos_<%= dtype %>(__global const <%= c_dtype %> *A,
                                __global <%= c_dtype %> *C) {
    // A and C are indexed directly by the dimension-0 global id
    const int gid = get_global_id(0);

    C[gid] = acos(A[gid]);
}
|
@@ -1,6 +1,6 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
2
|
// same dimension add floating point op
|
3
|
-
__kernel void apply_adadelta_<%= dtype %>(
|
3
|
+
__kernel void apply_adadelta_<%= dtype %>(
|
4
4
|
__global const <%= c_dtype %> *lr,
|
5
5
|
__global const <%= c_dtype %> *rho,
|
6
6
|
__global const <%= c_dtype %> *epsilon,
|
@@ -10,9 +10,7 @@
|
|
10
10
|
__global <%= c_dtype %> *acc_update
|
11
11
|
) {
|
12
12
|
// Get the index of the current element to be processed
|
13
|
-
const int
|
14
|
-
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
15
|
-
const int index = globalRow * N + globalCol;
|
13
|
+
const int index = get_global_id(0);
|
16
14
|
|
17
15
|
acc[index] = acc[index] * rho[0] + (grad[index] * grad[index]) * ((<%= c_dtype %>)1 - rho[0]);
|
18
16
|
const <%= c_dtype %> update = sqrt(acc_update[index] + epsilon[0]) * rsqrt(acc[index] + epsilon[0]) * grad[index];
|
@@ -0,0 +1,12 @@
|
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
// Adagrad-style in-place update:
//   output[i] -= grad[i] * lr[0] * rsqrt(acc[i])
// lr is read as a single value (element 0).
// NOTE(review): acc is only read here, never written; presumably the
// accumulator (acc += grad^2) is updated elsewhere - confirm with the caller.
__kernel void apply_adagrad_<%= dtype %>(
    __global const <%= c_dtype %> *lr,
    __global const <%= c_dtype %> *grad,
    __global <%= c_dtype %> *output,
    __global <%= c_dtype %> *acc
  ) {
    // all buffers are indexed directly by the dimension-0 global id
    const int gid = get_global_id(0);
    const <%= c_dtype %> delta = grad[gid] * lr[0] * rsqrt(acc[gid]);
    output[gid] -= delta;
}
|
@@ -1,6 +1,6 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
2
|
// same dimension add floating point op
|
3
|
-
__kernel void apply_adam_<%= dtype %>(
|
3
|
+
__kernel void apply_adam_<%= dtype %>(
|
4
4
|
__global const <%= c_dtype %> *grad,
|
5
5
|
__global const <%= c_dtype %> *learning_rate,
|
6
6
|
__global const <%= c_dtype %> *beta1_power,
|
@@ -11,10 +11,7 @@
|
|
11
11
|
__global <%= c_dtype %> *momentum,
|
12
12
|
__global <%= c_dtype %> *output, __global <%= c_dtype %> *v) {
|
13
13
|
// Get the index of the current element to be processed
|
14
|
-
const int
|
15
|
-
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
16
|
-
const int index = globalRow * N + globalCol;
|
17
|
-
|
14
|
+
const int index = get_global_id(0);
|
18
15
|
<%= c_dtype %> alpha = learning_rate[0] * sqrt(1.0 - beta2_power[0]) / (1.0 - beta1_power[0]);
|
19
16
|
|
20
17
|
momentum[index] += (grad[index] - momentum[index]) * (1.0 - beta1[0]);
|
@@ -0,0 +1,19 @@
|
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
// Centered RMSProp in-place update. lr, rho, momentum and epsilon are read
// as single values (element 0); grad, output, ms, mg and mom are indexed by
// the dimension-0 global id.
//
//   ms  += (grad^2 - ms) * (1 - rho)     moving average of squared gradient
//   mg  += (grad   - mg) * (1 - rho)     moving average of gradient
//   mom  = mom * momentum + lr * grad / sqrt(ms - mg^2 + epsilon)
//   output -= mom
__kernel void apply_centered_rms_prop_<%= dtype %>(__global const <%= c_dtype %> *lr,
                                          __global const <%= c_dtype %> *rho,
                                          __global const <%= c_dtype %> *momentum,
                                          __global const <%= c_dtype %> *epsilon,
                                          __global const <%= c_dtype %> *grad,
                                          __global <%= c_dtype %> *output,
                                          __global <%= c_dtype %> *ms,
                                          __global <%= c_dtype %> *mg,
                                          __global <%= c_dtype %> *mom) {
    // Get the index of the current element to be processed
    const int id = get_global_id(0);
    ms[id] += (grad[id] * grad[id] - ms[id]) * (1.0 - rho[0]);
    // Accumulate (+=) like ms above; plain assignment would discard the
    // running mean and leave only the scaled difference.
    mg[id] += (grad[id] - mg[id]) * (1.0 - rho[0]);
    // The centered denominator uses the freshly updated averages.
    <%= c_dtype %> denom = ms[id] - mg[id] * mg[id] + epsilon[0];
    mom[id] = mom[id] * momentum[0] + (grad[id] * lr[0]) / sqrt(denom);
    output[id] -= mom[id];
}
|
@@ -1,9 +1,8 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
// In-place scaled subtraction: C[i] -= A[i] * B[0].
// NOTE(review): presumably A holds the gradient and B[0] the learning rate -
// confirm against the caller.
__kernel void apply_gradient_<%= dtype %>(__global const <%= c_dtype %> *A,
                                          __global const <%= c_dtype %> *B,
                                          __global <%= c_dtype %> *C) {
    // A and C are indexed directly by the dimension-0 global id
    const int gid = get_global_id(0);

    C[gid] -= A[gid] * B[0];
}
|
@@ -1,11 +1,9 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
|
2
2
|
// same dimension add floating point op
|
3
|
-
__kernel void apply_momentum_<%= dtype %>(
|
3
|
+
__kernel void apply_momentum_<%= dtype %>(__global const <%= c_dtype %> *grad, __global const <%= c_dtype %> *learning_rate,
|
4
4
|
__global const <%= c_dtype %> *momentum, __global <%= c_dtype %> *output, __global <%= c_dtype %> *acc) {
|
5
5
|
// Get the index of the current element to be processed
|
6
|
-
const int
|
7
|
-
const int globalCol = get_global_id(1); // Col ID of C (0..N)
|
8
|
-
const int index = globalRow * N + globalCol;
|
6
|
+
const int index = get_global_id(0);
|
9
7
|
<%= c_dtype %> acc_m = acc[index];
|
10
8
|
acc[index] = acc_m * momentum[0] + grad[index];
|
11
9
|
<% if nesterov %>
|
@@ -0,0 +1,16 @@
|
|
1
|
+
% c_dtype = dtype_to_c_type(dtype)
// RMSProp in-place update. lr, rho, momentum and epsilon are read as single
// values (element 0); grad, output, ms and mom are indexed per element.
//
//   ms  += (grad^2 - ms) * (1 - rho)
//   mom  = mom * momentum + lr * grad / sqrt(ms + epsilon)
//   output -= mom
__kernel void apply_rms_prop_<%= dtype %>(
    __global const <%= c_dtype %> *lr,
    __global const <%= c_dtype %> *rho,
    __global const <%= c_dtype %> *momentum,
    __global const <%= c_dtype %> *epsilon,
    __global const <%= c_dtype %> *grad,
    __global <%= c_dtype %> *output,
    __global <%= c_dtype %> *ms,
    __global <%= c_dtype %> *mom) {
    // per-element buffers are indexed directly by the dimension-0 global id
    const int gid = get_global_id(0);

    ms[gid] += (grad[gid] * grad[gid] - ms[gid]) * (1.0 - rho[0]);
    mom[gid] = mom[gid] * momentum[0] + (grad[gid] * lr[0]) / sqrt(ms[gid] + epsilon[0]);
    output[gid] -= mom[gid];
}
|
@@ -1,9 +1,8 @@
|
|
1
1
|
|
2
2
|
% c_dtype = dtype_to_c_type(dtype)
// Element-wise arc sine: C[i] = asin(A[i]).
__kernel void asin_<%= dtype %>(__global const <%= c_dtype %> *A,
                                __global <%= c_dtype %> *C) {
    // A and C are indexed directly by the dimension-0 global id
    const int gid = get_global_id(0);

    C[gid] = asin(A[gid]);
}
|
@@ -1,8 +1,7 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
// Element-wise ceiling: C[i] = ceil(A[i]).
__kernel void ceil_<%= dtype %>(__global const <%= c_dtype %> *A,
                                __global <%= c_dtype %> *C) {
    // A and C are indexed directly by the dimension-0 global id
    const int gid = get_global_id(0);

    C[gid] = ceil(A[gid]);
}
|
@@ -0,0 +1,21 @@
|
|
1
|
+
% ctype = dtype_to_c_type(data_type)

// Copies one input tensor A into its slot of the concatenated output C.
// Each work-item takes its flat index into A, unravels it into per-dimension
// coordinates using the template's `divisors`, shifts the coordinate on
// `axis` by `step`, and re-flattens with `multipliers` to address C.
// N and index are accepted for the host-side calling convention but are not
// read inside the kernel.
__kernel void concat(const int N, const int index, const int step, __global const <%= ctype %> *A, __global <%= ctype %> *C) {
    // Flat index of the element of A handled by this work-item
    const int globalCol = get_global_id(0);
    int ptr = globalCol;

    // compute effective coordinates
    // Integer division is exact for these non-negative indices; dividing in
    // float and flooring can round incorrectly once indices get large.
<% divisors.each_with_index do |div, index| %>
  <% if axis == index %>
    int index_map_<%= index %> = ptr / <%= div %> + step;
  <% else %>
    int index_map_<%= index %> = ptr / <%= div %>;
  <% end %>
  <% if index < divisors.size - 1%>
    ptr = ptr % <%= div %>;
  <% end %>
<% end %>

    C[<%= multipliers.each_with_index.map { |m, idx| "#{m}*index_map_#{idx}" }.join(' + ') %>] = A[globalCol];
}
|
@@ -1,8 +1,6 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
// Element-wise cosine: C[i] = cos(A[i]).
__kernel void cos_<%= dtype %>(__global const <%= c_dtype %> *A,
                               __global <%= c_dtype %> *C) {
    // A and C are indexed directly by the dimension-0 global id
    const int gid = get_global_id(0);
    C[gid] = cos(A[gid]);
}
|
@@ -1,8 +1,6 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
// Element-wise exponential: C[i] = exp(A[i]).
__kernel void exp_<%= dtype %>(__global const <%= c_dtype %> *A,
                               __global <%= c_dtype %> *C) {
    // A and C are indexed directly by the dimension-0 global id
    const int gid = get_global_id(0);
    C[gid] = exp(A[gid]);
}
|
@@ -1,8 +1,7 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
// Element-wise floor: C[i] = floor(A[i]).
__kernel void floor_<%= dtype %>(__global const <%= c_dtype %> *A,
                                 __global <%= c_dtype %> *C) {
    // A and C are indexed directly by the dimension-0 global id
    const int gid = get_global_id(0);

    C[gid] = floor(A[gid]);
}
|
@@ -1,8 +1,7 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
// Element-wise natural logarithm: C[i] = log(A[i]).
__kernel void log_<%= dtype %>(__global const <%= c_dtype %> *A,
                               __global <%= c_dtype %> *C) {
    // A and C are indexed directly by the dimension-0 global id
    const int gid = get_global_id(0);

    C[gid] = log(A[gid]);
}
|
@@ -1,8 +1,7 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
// Element-wise log(1 + x): C[i] = log1p(A[i]).
__kernel void log1p_<%= dtype %>(__global const <%= c_dtype %> *A,
                                 __global <%= c_dtype %> *C) {
    // A and C are indexed directly by the dimension-0 global id
    const int gid = get_global_id(0);

    C[gid] = log1p(A[gid]);
}
|
@@ -1,8 +1,7 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
// Element-wise negation: C[i] = -A[i].
__kernel void negate_<%= dtype %>(__global const <%= c_dtype %> *A,
                                  __global <%= c_dtype %> *C) {
    // A and C are indexed directly by the dimension-0 global id
    const int gid = get_global_id(0);

    C[gid] = -A[gid];
}
|
@@ -1,8 +1,7 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)
// Element-wise reciprocal: C[i] = 1 / A[i].
__kernel void reciprocal_<%= dtype %>(__global const <%= c_dtype %> *A,
                                      __global <%= c_dtype %> *C) {
    // A and C are indexed directly by the dimension-0 global id
    const int gid = get_global_id(0);

    C[gid] = 1 / A[gid];
}
|
@@ -1,9 +1,8 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)

// Element-wise logistic sigmoid: C[i] = 1 / (1 + exp(-A[i])).
__kernel void sigmoid_<%= dtype %>(__global const <%= c_dtype %> *A,
                                   __global <%= c_dtype %> *C) {
    // A and C are indexed directly by the dimension-0 global id
    const int gid = get_global_id(0);

    C[gid] = 1.0f/(1.0f + exp(-A[gid]));
}
|
@@ -1,21 +1,20 @@
|
|
1
1
|
% c_dtype = dtype_to_c_type(dtype)

// Element-wise sign: C[i] is -1, 0 or 1 according to the sign of A[i].
// In the floating point variant NaN inputs also produce 0.
__kernel void sign_<%= dtype %>(__global const <%= c_dtype %> *A,
                                __global <%= c_dtype %> *C) {
    // A and C are indexed directly by the dimension-0 global id
    const int gid = get_global_id(0);
    <%= c_dtype %> x = A[gid];
% if floating_point?(dtype)
    // NaN and zero both map to 0
    C[gid] = (isnan(x) || x == 0.0f) ? 0.0 : (x < 0 ? -1.0 : 1.0);
% else
    C[gid] = (x == 0) ? 0 : (x < 0 ? -1 : 1);
% end
}
|
@@ -1,9 +1,8 @@
|
|
1
1
|
|
2
2
|
% c_dtype = dtype_to_c_type(dtype)
// Element-wise sine: C[i] = sin(A[i]).
__kernel void sin_<%= dtype %>(__global const <%= c_dtype %> *A,
                               __global <%= c_dtype %> *C) {
    // A and C are indexed directly by the dimension-0 global id
    const int gid = get_global_id(0);

    C[gid] = sin(A[gid]);
}
|
@@ -0,0 +1,17 @@
|
|
1
|
+
% ctype = dtype_to_c_type(data_type)
% mul_str = mul.each_with_index.map { |m, index| "#{m} * index_map_#{index}" }
// Gathers slices of A into C. The dimension-0 global id (globalCol) selects
// the output chunk, which starts at C[N * globalCol]; the dimension-1 global
// id (localCol) is the flat element index inside that chunk. localCol is
// unravelled into coordinates with `dest`, the coordinate on `axis` is offset
// by globalCol * step, and the result is re-flattened with `mul` to address A.
__kernel void split(const int N, __global const <%= ctype %> *A, __global <%= ctype %> *C) {
    // Get the index of the current element to be processed
    const int globalCol = get_global_id(0);
    const int localCol = get_global_id(1);
    // compute effective coordinates
    // Integer division is exact for these non-negative indices; dividing in
    // float and flooring can round incorrectly once indices get large.
    int ptr = localCol;
<% dest.each_with_index do |div, index| %>
<% if index == axis %>
    int index_map_<%= index %> = ptr / <%= div %> + globalCol * <%= step %>;
<% else %>
    int index_map_<%= index %> = ptr / <%= div %>;
<% end %>
    <% if index < dest.size - 1%>ptr = ptr % <%= div %>;<% end %><% end %>
    C[N*globalCol + localCol] = A[<%= mul_str.join(" + ") %>];
}
|