bones-compiler 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +117 -0
- data/LICENSE +9 -0
- data/README.rdoc +126 -0
- data/Rakefile +107 -0
- data/VERSION +1 -0
- data/bin/bones +20 -0
- data/examples/applications/ffos.c +552 -0
- data/examples/benchmarks/2mm.c +70 -0
- data/examples/benchmarks/3mm.c +81 -0
- data/examples/benchmarks/adi.c +81 -0
- data/examples/benchmarks/atax.c +65 -0
- data/examples/benchmarks/bicg.c +67 -0
- data/examples/benchmarks/cholesky.c +64 -0
- data/examples/benchmarks/common.h +168 -0
- data/examples/benchmarks/correlation.c +97 -0
- data/examples/benchmarks/covariance.c +77 -0
- data/examples/benchmarks/doitgen.c +63 -0
- data/examples/benchmarks/durbin.c +76 -0
- data/examples/benchmarks/dynprog.c +67 -0
- data/examples/benchmarks/fdtd-2d-apml.c +114 -0
- data/examples/benchmarks/fdtd-2d.c +74 -0
- data/examples/benchmarks/floyd-warshall.c +50 -0
- data/examples/benchmarks/gemm.c +69 -0
- data/examples/benchmarks/gemver.c +89 -0
- data/examples/benchmarks/gesummv.c +64 -0
- data/examples/benchmarks/gramschmidt.c +84 -0
- data/examples/benchmarks/jacobi-1d-imper.c +55 -0
- data/examples/benchmarks/jacobi-2d-imper.c +61 -0
- data/examples/benchmarks/lu.c +57 -0
- data/examples/benchmarks/ludcmp.c +91 -0
- data/examples/benchmarks/mvt.c +65 -0
- data/examples/benchmarks/overview.txt +38 -0
- data/examples/benchmarks/reg_detect.c +82 -0
- data/examples/benchmarks/saxpy.c +45 -0
- data/examples/benchmarks/seidel-2d.c +51 -0
- data/examples/benchmarks/symm.c +74 -0
- data/examples/benchmarks/syr2k.c +65 -0
- data/examples/benchmarks/syrk.c +62 -0
- data/examples/benchmarks/trisolv.c +57 -0
- data/examples/benchmarks/trmm.c +57 -0
- data/examples/chunk/example1.c +54 -0
- data/examples/chunk/example2.c +44 -0
- data/examples/chunk/example3.c +59 -0
- data/examples/chunk/example4.c +55 -0
- data/examples/chunk/example5.c +52 -0
- data/examples/element/example1.c +46 -0
- data/examples/element/example10.c +50 -0
- data/examples/element/example11.c +47 -0
- data/examples/element/example12.c +56 -0
- data/examples/element/example2.c +46 -0
- data/examples/element/example3.c +58 -0
- data/examples/element/example4.c +49 -0
- data/examples/element/example5.c +56 -0
- data/examples/element/example6.c +46 -0
- data/examples/element/example7.c +54 -0
- data/examples/element/example8.c +45 -0
- data/examples/element/example9.c +48 -0
- data/examples/neighbourhood/example1.c +54 -0
- data/examples/neighbourhood/example2.c +55 -0
- data/examples/neighbourhood/example3.c +82 -0
- data/examples/neighbourhood/example4.c +52 -0
- data/examples/shared/example1.c +45 -0
- data/examples/shared/example2.c +51 -0
- data/examples/shared/example3.c +55 -0
- data/examples/shared/example4.c +52 -0
- data/examples/shared/example5.c +48 -0
- data/lib/bones.rb +266 -0
- data/lib/bones/algorithm.rb +541 -0
- data/lib/bones/engine.rb +386 -0
- data/lib/bones/preprocessor.rb +161 -0
- data/lib/bones/species.rb +196 -0
- data/lib/bones/structure.rb +94 -0
- data/lib/bones/variable.rb +169 -0
- data/lib/bones/variablelist.rb +72 -0
- data/lib/castaddon.rb +27 -0
- data/lib/castaddon/index.rb +40 -0
- data/lib/castaddon/node.rb +753 -0
- data/lib/castaddon/type.rb +37 -0
- data/skeletons/CPU-C/common/epilogue.c +0 -0
- data/skeletons/CPU-C/common/globals.c +17 -0
- data/skeletons/CPU-C/common/globals_kernel.c +1 -0
- data/skeletons/CPU-C/common/header.c +0 -0
- data/skeletons/CPU-C/common/mem_copy_D2H.c +0 -0
- data/skeletons/CPU-C/common/mem_copy_H2D.c +0 -0
- data/skeletons/CPU-C/common/mem_epilogue.c +0 -0
- data/skeletons/CPU-C/common/mem_prologue.c +3 -0
- data/skeletons/CPU-C/common/prologue.c +0 -0
- data/skeletons/CPU-C/common/timer_1_start.c +0 -0
- data/skeletons/CPU-C/common/timer_1_stop.c +0 -0
- data/skeletons/CPU-C/common/timer_2_start.c +20 -0
- data/skeletons/CPU-C/common/timer_2_stop.c +8 -0
- data/skeletons/CPU-C/kernel/default.host.c +3 -0
- data/skeletons/CPU-C/kernel/default.kernel.c +15 -0
- data/skeletons/CPU-C/skeletons.txt +24 -0
- data/skeletons/CPU-OPENCL-AMD/common/epilogue.c +6 -0
- data/skeletons/CPU-OPENCL-AMD/common/globals.c +155 -0
- data/skeletons/CPU-OPENCL-AMD/common/globals_kernel.c +4 -0
- data/skeletons/CPU-OPENCL-AMD/common/header.c +0 -0
- data/skeletons/CPU-OPENCL-AMD/common/mem_copy_D2H.c +8 -0
- data/skeletons/CPU-OPENCL-AMD/common/mem_copy_H2D.c +4 -0
- data/skeletons/CPU-OPENCL-AMD/common/mem_epilogue.c +3 -0
- data/skeletons/CPU-OPENCL-AMD/common/mem_prologue.c +6 -0
- data/skeletons/CPU-OPENCL-AMD/common/prologue.c +24 -0
- data/skeletons/CPU-OPENCL-AMD/common/timer_1_start.c +5 -0
- data/skeletons/CPU-OPENCL-AMD/common/timer_1_stop.c +9 -0
- data/skeletons/CPU-OPENCL-AMD/common/timer_2_start.c +16 -0
- data/skeletons/CPU-OPENCL-AMD/common/timer_2_stop.c +11 -0
- data/skeletons/CPU-OPENCL-AMD/kernel/D-element-to-1-shared.host.c +67 -0
- data/skeletons/CPU-OPENCL-AMD/kernel/D-element-to-1-shared.kernel.cl +72 -0
- data/skeletons/CPU-OPENCL-AMD/kernel/default.host.c +14 -0
- data/skeletons/CPU-OPENCL-AMD/kernel/default.kernel.cl +13 -0
- data/skeletons/CPU-OPENCL-AMD/skeletons.txt +26 -0
- data/skeletons/CPU-OPENCL-INTEL/common/epilogue.c +3 -0
- data/skeletons/CPU-OPENCL-INTEL/common/globals.c +154 -0
- data/skeletons/CPU-OPENCL-INTEL/common/globals_kernel.c +4 -0
- data/skeletons/CPU-OPENCL-INTEL/common/header.c +31 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_D2H.c +5 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_H2D.c +3 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_epilogue.c +3 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_prologue.c +4 -0
- data/skeletons/CPU-OPENCL-INTEL/common/prologue.c +24 -0
- data/skeletons/CPU-OPENCL-INTEL/common/timer_1_start.c +5 -0
- data/skeletons/CPU-OPENCL-INTEL/common/timer_1_stop.c +9 -0
- data/skeletons/CPU-OPENCL-INTEL/common/timer_2_start.c +16 -0
- data/skeletons/CPU-OPENCL-INTEL/common/timer_2_stop.c +11 -0
- data/skeletons/CPU-OPENCL-INTEL/kernel/D-element-to-1-shared.host.c +67 -0
- data/skeletons/CPU-OPENCL-INTEL/kernel/D-element-to-1-shared.kernel.cl +72 -0
- data/skeletons/CPU-OPENCL-INTEL/kernel/default.host.c +14 -0
- data/skeletons/CPU-OPENCL-INTEL/kernel/default.kernel.cl +13 -0
- data/skeletons/CPU-OPENCL-INTEL/skeletons.txt +26 -0
- data/skeletons/CPU-OPENMP/common/epilogue.c +0 -0
- data/skeletons/CPU-OPENMP/common/globals.c +37 -0
- data/skeletons/CPU-OPENMP/common/globals_kernel.c +6 -0
- data/skeletons/CPU-OPENMP/common/header.c +0 -0
- data/skeletons/CPU-OPENMP/common/mem_copy_D2H.c +0 -0
- data/skeletons/CPU-OPENMP/common/mem_copy_H2D.c +0 -0
- data/skeletons/CPU-OPENMP/common/mem_epilogue.c +0 -0
- data/skeletons/CPU-OPENMP/common/mem_prologue.c +3 -0
- data/skeletons/CPU-OPENMP/common/prologue.c +0 -0
- data/skeletons/CPU-OPENMP/common/timer_1_start.c +12 -0
- data/skeletons/CPU-OPENMP/common/timer_1_stop.c +0 -0
- data/skeletons/CPU-OPENMP/common/timer_2_start.c +18 -0
- data/skeletons/CPU-OPENMP/common/timer_2_stop.c +8 -0
- data/skeletons/CPU-OPENMP/kernel/D-element-to-1-shared.host.c +27 -0
- data/skeletons/CPU-OPENMP/kernel/D-element-to-1-shared.kernel.c +46 -0
- data/skeletons/CPU-OPENMP/kernel/default.host.c +11 -0
- data/skeletons/CPU-OPENMP/kernel/default.kernel.c +18 -0
- data/skeletons/CPU-OPENMP/skeletons.txt +26 -0
- data/skeletons/GPU-CUDA/common/epilogue.c +0 -0
- data/skeletons/GPU-CUDA/common/globals.c +31 -0
- data/skeletons/GPU-CUDA/common/globals_kernel.c +4 -0
- data/skeletons/GPU-CUDA/common/header.c +0 -0
- data/skeletons/GPU-CUDA/common/mem_copy_D2H.c +3 -0
- data/skeletons/GPU-CUDA/common/mem_copy_H2D.c +3 -0
- data/skeletons/GPU-CUDA/common/mem_epilogue.c +3 -0
- data/skeletons/GPU-CUDA/common/mem_prologue.c +5 -0
- data/skeletons/GPU-CUDA/common/prologue.c +6 -0
- data/skeletons/GPU-CUDA/common/timer_1_start.c +6 -0
- data/skeletons/GPU-CUDA/common/timer_1_stop.c +10 -0
- data/skeletons/GPU-CUDA/common/timer_2_start.c +6 -0
- data/skeletons/GPU-CUDA/common/timer_2_stop.c +10 -0
- data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.kernel.cu +105 -0
- data/skeletons/GPU-CUDA/kernel/D-element-to-1-shared.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/D-element-to-1-shared.kernel.cu +119 -0
- data/skeletons/GPU-CUDA/kernel/D-element-to-N-shared.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/D-element-to-N-shared.kernel.cu +166 -0
- data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.kernel.cu +69 -0
- data/skeletons/GPU-CUDA/kernel/N-neighbourhood-N-to-N-element.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/N-neighbourhood-N-to-N-element.kernel.cu +42 -0
- data/skeletons/GPU-CUDA/kernel/default.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/default.kernel.cu +28 -0
- data/skeletons/GPU-CUDA/skeletons.txt +30 -0
- data/skeletons/GPU-OPENCL-AMD/common/epilogue.c +3 -0
- data/skeletons/GPU-OPENCL-AMD/common/globals.c +155 -0
- data/skeletons/GPU-OPENCL-AMD/common/globals_kernel.c +4 -0
- data/skeletons/GPU-OPENCL-AMD/common/header.c +0 -0
- data/skeletons/GPU-OPENCL-AMD/common/mem_copy_D2H.c +4 -0
- data/skeletons/GPU-OPENCL-AMD/common/mem_copy_H2D.c +4 -0
- data/skeletons/GPU-OPENCL-AMD/common/mem_epilogue.c +3 -0
- data/skeletons/GPU-OPENCL-AMD/common/mem_prologue.c +3 -0
- data/skeletons/GPU-OPENCL-AMD/common/prologue.c +24 -0
- data/skeletons/GPU-OPENCL-AMD/common/timer_1_start.c +5 -0
- data/skeletons/GPU-OPENCL-AMD/common/timer_1_stop.c +9 -0
- data/skeletons/GPU-OPENCL-AMD/common/timer_2_start.c +4 -0
- data/skeletons/GPU-OPENCL-AMD/common/timer_2_stop.c +11 -0
- data/skeletons/GPU-OPENCL-AMD/kernel/D-element-to-1-shared.host.c +67 -0
- data/skeletons/GPU-OPENCL-AMD/kernel/D-element-to-1-shared.kernel.cl +72 -0
- data/skeletons/GPU-OPENCL-AMD/kernel/default.host.c +14 -0
- data/skeletons/GPU-OPENCL-AMD/kernel/default.kernel.cl +13 -0
- data/skeletons/GPU-OPENCL-AMD/skeletons.txt +26 -0
- data/skeletons/verification/header.c +2 -0
- data/skeletons/verification/timer_start.c +4 -0
- data/skeletons/verification/timer_stop.c +6 -0
- data/skeletons/verification/verify_results.c +23 -0
- data/test/bones/test_algorithm.rb +40 -0
- data/test/bones/test_common.rb +54 -0
- data/test/bones/test_preprocessor.rb +46 -0
- data/test/bones/test_species.rb +21 -0
- data/test/bones/test_variable.rb +84 -0
- data/test/test_helper.rb +106 -0
- metadata +303 -0
@@ -0,0 +1,72 @@
|
|
1
|
+
|
2
|
+
// Start of the <algorithm_name> kernel (main, not unrolled kernel)
|
3
|
+
__kernel void bones_kernel_<algorithm_name>_0(int bones_input_size, __global <in0_type><in0_devicepointer> <in0_name>, __global <out0_type><out0_devicepointer> <out0_name>, <argument_definition>) {

  // Work distribution: the input is divided over the work-groups, each
  // work-group's share is divided over (at most) its work-items, and what does
  // not fit in one pass is processed sequentially by every work-item.
  const int bones_threadblock_work = DIV_CEIL(bones_input_size,get_num_groups(0));
  const int bones_parallel_work = BONES_MIN(get_local_size(0),bones_threadblock_work);
  const int bones_sequential_work = DIV_CEIL(bones_threadblock_work,bones_parallel_work);
  const int bones_local_id = get_local_id(0);
  const int bones_global_id = get_global_id(0);
  <ids>
  int bones_iter_id = <in0_flatindex>;

  // Sequential phase: load the first element into private memory, then fold in
  // the remaining elements assigned to this work-item (skipping indices that
  // lie beyond the end of the input range).
  <in0_type> bones_temporary = <in0_name>[bones_iter_id];
  <in0_type> bones_private_memory = <algorithm_code3>;
  for(int c=1; c<bones_sequential_work; c++) {
    bones_iter_id = bones_iter_id + bones_parallel_work*get_num_groups(0)<factors>;
    if (bones_iter_id <= <in0_to>) {
      bones_temporary = <in0_name>[bones_iter_id];
      bones_private_memory = <algorithm_code1>;
    }
  }

  // Publish every work-item's partial result in local memory (256 slots, so
  // the work-group size must not exceed 256).
  volatile __local <in0_type> bones_local_memory[256];
  bones_local_memory[bones_local_id] = bones_private_memory;
  barrier(CLK_LOCAL_MEM_FENCE);

  // Parallel phase: reduction tree over the local memory. Each step halves the
  // number of active work-items; the barrier sits outside the conditionals so
  // that every work-item of the group reaches it.
  int bones_offset_id;
  for (int c=256; c>=2; c>>=1) {
    if ((2*bones_parallel_work > c) && (get_local_id(0) < c/2)) {
      bones_offset_id = get_local_id(0)+c/2;
      if (bones_offset_id < bones_parallel_work) {
        bones_local_memory[bones_local_id] = <algorithm_code2>;
      }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
  }

  // The first work-item writes this work-group's result to global memory
  if (get_local_id(0) == 0) { <out0_name>[get_group_id(0)] = bones_local_memory[0]; }
}
|
42
|
+
|
43
|
+
// Start of the <algorithm_name> kernel (secondary, not unrolled kernel)
|
44
|
+
__kernel void bones_kernel_<algorithm_name>_1(__global <in0_type><in0_devicepointer> <in0_name>, __global <out0_type><out0_devicepointer> <out0_name>) {
  const int bones_local_id = get_local_id(0);
  // NOTE(review): the "global" id is read from get_local_id(0); presumably this
  // kernel is only ever launched as a single work-group - confirm at the caller.
  const int bones_global_id = get_local_id(0);

  // Copy one input element per work-item into local memory (128 slots)
  volatile __local <in0_type> bones_local_memory[128];
  bones_local_memory[bones_local_id] = <in0_name>[bones_global_id];
  barrier(CLK_LOCAL_MEM_FENCE);

  // Reduction tree over the 128 local values; the barrier is reached by all
  // work-items in every step.
  int bones_offset_id;
  for (int c=128; c>=2; c>>=1) {
    if (get_local_id(0) < c/2) {
      bones_offset_id = get_local_id(0)+c/2;
      bones_local_memory[bones_local_id] = <algorithm_code2>;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
  }

  // The first work-item stores the single final value
  if (get_local_id(0) == 0) { <out0_name>[0] = bones_local_memory[0]; }
}
|
66
|
+
|
67
|
+
// Start of the <algorithm_name> kernel (final, initial value kernel)
|
68
|
+
__kernel void bones_kernel_<algorithm_name>_2(__global <out0_type><out0_devicepointer> bones_initial_value, __global <out0_type><out0_devicepointer> <out0_name>) {
  // Combine the reduced value (element 0 of the output) with the initial value
  // saved by the host and store the outcome back into element 0.
  <out0_type> bones_private_memory = <out0_name>[0];
  <out0_type> bones_temporary = bones_initial_value[0];
  <out0_name>[0] = <algorithm_code4>;
}
|
@@ -0,0 +1,14 @@
|
|
1
|
+
|
2
|
+
// Create the kernel
|
3
|
+
cl_kernel bones_kernel_<algorithm_name>_0 = clCreateKernel(bones_program, "bones_kernel_<algorithm_name>_0", &bones_errors); error_check(bones_errors);
|
4
|
+
|
5
|
+
// Set all the arguments to the kernel function
|
6
|
+
int bones_num_args = 0;
|
7
|
+
<kernel_argument_list>
|
8
|
+
// Start the kernel
|
9
|
+
size_t bones_global_worksize[] = {DIV_CEIL(<parallelism>,8)*8};
|
10
|
+
bones_errors = clEnqueueNDRangeKernel(bones_queue,bones_kernel_<algorithm_name>_0,1,NULL,bones_global_worksize,NULL,0,NULL,&bones_event); error_check(bones_errors);
|
11
|
+
|
12
|
+
// Synchronize and clean-up the kernel
|
13
|
+
clFinish(bones_queue);
|
14
|
+
clReleaseKernel(bones_kernel_<algorithm_name>_0);
|
@@ -0,0 +1,13 @@
|
|
1
|
+
|
2
|
+
// Start of the <algorithm_name> kernel
|
3
|
+
__kernel void bones_kernel_<algorithm_name>_0(<devicedefinitionsopencl>, <argument_definition>) {
  const int bones_global_id = get_global_id(0);

  // Work-items beyond the parallelism (the host may round the global size up)
  // do nothing.
  if (bones_global_id < (<parallelism>)) {

    // Derive the loop index/indices from the work-item id
    <ids>

    // Run the computation for this work-item
    <algorithm_code1>
  }
}
|
@@ -0,0 +1,26 @@
|
|
1
|
+
###################################################################
|
2
|
+
# Each line holds one mapping from species to skeleton
|
3
|
+
# The ordering is always ['chunk','neighbourhood','element','shared','void']
|
4
|
+
# The pattern 'full' is omitted from matching (will thus always match)
|
5
|
+
# 'D' denotes any number of ranges (e.g. D|element can be any dimension)
|
6
|
+
# 'N' denotes exactly one range (e.g. N,N|element must be 2D)
|
7
|
+
# '+' denotes one or more of these patterns
|
8
|
+
###################################################################
|
9
|
+
D|chunk(D)+ -> D|chunk(D)+ :default :00
|
10
|
+
D|chunk(D)+ -> D|chunk(D)+ ^ D|element+ :default :00
|
11
|
+
D|chunk(D)+ ^ D|element+ -> D|chunk(D)+ :default :00
|
12
|
+
D|chunk(D)+ ^ D|element+ -> D|chunk(D)+ ^ D|element+ :default :00
|
13
|
+
D|chunk(D)+ -> D|element+ :default :00
|
14
|
+
D|chunk(D)+ ^ D|neighbourhood(D)+ ^ D|element+ -> D|element+ :default :00
|
15
|
+
D|chunk(D)+ ^ D|element+ -> D|element+ :default :00
|
16
|
+
N|neighbourhood(N)+ -> N|element+ :default :00
|
17
|
+
D|neighbourhood(D)+ -> D|element+ :default :00
|
18
|
+
D|neighbourhood(D)+ ^ D|element+ -> D|element+ :default :00
|
19
|
+
D|element+ -> D|chunk(D)+ :default :00
|
20
|
+
D|element+ -> D|element+ :default :00
|
21
|
+
D|element -> 1|shared :D-element-to-1-shared :02 03 04 05
|
22
|
+
D|void -> D|element+ :default :00
|
23
|
+
|
24
|
+
#D|element+ -> D|shared+ :default :09
|
25
|
+
#D|element+ -> D|element+ ^ D|shared+ :default :09
|
26
|
+
|
@@ -0,0 +1,154 @@
|
|
1
|
+
#include <string.h>
|
2
|
+
#include <stdio.h>
|
3
|
+
#include <stdlib.h>
|
4
|
+
#include <math.h>
|
5
|
+
#include <sys/time.h>
|
6
|
+
#include <CL/cl.h>
|
7
|
+
|
8
|
+
// Helper macros. Every argument and the whole expansion are parenthesized so
// that expression arguments (e.g. DIV_CEIL(n, a+b)) group as intended.
// Arguments may still be evaluated more than once: avoid side effects.
#define BONES_MIN(a,b) (((a)<(b)) ? (a) : (b))
#define BONES_MAX(a,b) (((a)>(b)) ? (a) : (b))
// Integer division rounded up / rounded down (for non-negative operands)
#define DIV_CEIL(a,b) (((a)+(b)-1)/(b))
#define DIV_FLOOR(a,b) ((a)/(b))
|
12
|
+
|
13
|
+
// Multiple iterations for kernel measurements
|
14
|
+
#define ITERS 1
|
15
|
+
|
16
|
+
// Load the OpenCL kernel from file
|
17
|
+
// Read the complete contents of the file 'bones_filename' into memory.
// Returns a newly malloc'ed, NUL-terminated buffer which the caller must
// free(), or NULL (after printing a message to stderr) when the file cannot be
// opened, measured or buffered.
char * get_source(const char* bones_filename) {
  FILE* bones_fp = fopen(bones_filename,"r");
  if (bones_fp == NULL) {
    fprintf(stderr,"--- Error: Could not open file '%s'\n",bones_filename);
    return NULL;
  }

  char *bones_source = NULL;
  if (fseek(bones_fp,0,SEEK_END) == 0) {
    long bones_size = ftell(bones_fp);
    if (bones_size >= 0) {
      rewind(bones_fp);
      bones_source = (char *)malloc((size_t)bones_size+1);
      if (bones_source != NULL) {
        // Terminate after the bytes actually read: in text mode fread may
        // deliver fewer bytes than ftell reported.
        size_t bones_read = fread(bones_source,1,(size_t)bones_size,bones_fp);
        bones_source[bones_read] = '\0';
      }
    }
  }
  if (bones_source == NULL) {
    fprintf(stderr,"--- Error: Could not read file '%s'\n",bones_filename);
  }

  fclose(bones_fp);
  return bones_source;
}
|
28
|
+
|
29
|
+
// Print an error if it occurs
|
30
|
+
// Check an OpenCL status code. CL_SUCCESS returns immediately; any other value
// prints a description of the error to stdout and terminates the program with
// a non-zero exit status.
void error_check(cl_int bones_errors) {
  if (bones_errors == CL_SUCCESS) { return; }

  // Translate the status code into a message (NULL: code not in the list)
  const char *bones_message = NULL;
  switch (bones_errors) {
    case CL_DEVICE_NOT_FOUND:                bones_message = "Device not found."; break;
    case CL_DEVICE_NOT_AVAILABLE:            bones_message = "Device not available"; break;
    case CL_COMPILER_NOT_AVAILABLE:          bones_message = "Compiler not available"; break;
    case CL_MEM_OBJECT_ALLOCATION_FAILURE:   bones_message = "Memory object allocation failure"; break;
    case CL_OUT_OF_RESOURCES:                bones_message = "Out of resources"; break;
    case CL_OUT_OF_HOST_MEMORY:              bones_message = "Out of host memory"; break;
    case CL_PROFILING_INFO_NOT_AVAILABLE:    bones_message = "Profiling information not available"; break;
    case CL_MEM_COPY_OVERLAP:                bones_message = "Memory copy overlap"; break;
    case CL_IMAGE_FORMAT_MISMATCH:           bones_message = "Image format mismatch"; break;
    case CL_IMAGE_FORMAT_NOT_SUPPORTED:      bones_message = "Image format not supported"; break;
    case CL_BUILD_PROGRAM_FAILURE:           bones_message = "Program build failure"; break;
    case CL_MAP_FAILURE:                     bones_message = "Map failure"; break;
    case CL_INVALID_VALUE:                   bones_message = "Invalid value"; break;
    case CL_INVALID_DEVICE_TYPE:             bones_message = "Invalid device type"; break;
    case CL_INVALID_PLATFORM:                bones_message = "Invalid platform"; break;
    case CL_INVALID_DEVICE:                  bones_message = "Invalid device"; break;
    case CL_INVALID_CONTEXT:                 bones_message = "Invalid context"; break;
    case CL_INVALID_QUEUE_PROPERTIES:        bones_message = "Invalid queue properties"; break;
    case CL_INVALID_COMMAND_QUEUE:           bones_message = "Invalid command queue"; break;
    case CL_INVALID_HOST_PTR:                bones_message = "Invalid host pointer"; break;
    case CL_INVALID_MEM_OBJECT:              bones_message = "Invalid memory object"; break;
    case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: bones_message = "Invalid image format descriptor"; break;
    case CL_INVALID_IMAGE_SIZE:              bones_message = "Invalid image size"; break;
    case CL_INVALID_SAMPLER:                 bones_message = "Invalid sampler"; break;
    case CL_INVALID_BINARY:                  bones_message = "Invalid binary"; break;
    case CL_INVALID_BUILD_OPTIONS:           bones_message = "Invalid build options"; break;
    case CL_INVALID_PROGRAM:                 bones_message = "Invalid program"; break;
    case CL_INVALID_PROGRAM_EXECUTABLE:      bones_message = "Invalid program executable"; break;
    case CL_INVALID_KERNEL_NAME:             bones_message = "Invalid kernel name"; break;
    case CL_INVALID_KERNEL_DEFINITION:       bones_message = "Invalid kernel definition"; break;
    case CL_INVALID_KERNEL:                  bones_message = "Invalid kernel"; break;
    case CL_INVALID_ARG_INDEX:               bones_message = "Invalid argument index"; break;
    case CL_INVALID_ARG_VALUE:               bones_message = "Invalid argument value"; break;
    case CL_INVALID_ARG_SIZE:                bones_message = "Invalid argument size"; break;
    case CL_INVALID_KERNEL_ARGS:             bones_message = "Invalid kernel arguments"; break;
    case CL_INVALID_WORK_DIMENSION:          bones_message = "Invalid work dimension"; break;
    case CL_INVALID_WORK_GROUP_SIZE:         bones_message = "Invalid work group size"; break;
    case CL_INVALID_WORK_ITEM_SIZE:          bones_message = "Invalid work item size"; break;
    case CL_INVALID_GLOBAL_OFFSET:           bones_message = "Invalid global offset"; break;
    case CL_INVALID_EVENT_WAIT_LIST:         bones_message = "Invalid event wait list"; break;
    case CL_INVALID_EVENT:                   bones_message = "Invalid event"; break;
    case CL_INVALID_OPERATION:               bones_message = "Invalid operation"; break;
    case CL_INVALID_GL_OBJECT:               bones_message = "Invalid OpenGL object"; break;
    case CL_INVALID_BUFFER_SIZE:             bones_message = "Invalid buffer size"; break;
    case CL_INVALID_MIP_LEVEL:               bones_message = "Invalid mip-map level"; break;
    default: break;
  }

  if (bones_message != NULL) { printf("--- Error: %s\n", bones_message); }
  else { printf("--- Error: Unknown with code %d\n", bones_errors); }

  // Terminate with a failure status so that callers and scripts can detect the
  // error (matches the exit(1) used when no OpenCL platform is found).
  fflush(stdout);
  exit(1);
}
|
83
|
+
|
84
|
+
// Use a global variable for the device ID, context and command queue
|
85
|
+
cl_device_id bones_device;
|
86
|
+
cl_context bones_context;
|
87
|
+
cl_command_queue bones_queue;
|
88
|
+
|
89
|
+
// Use a global variable to store the name and the binary for the last program
|
90
|
+
char bones_last_program[1024];
|
91
|
+
cl_program bones_program;
|
92
|
+
|
93
|
+
// Function to initialize the OpenCL platform (everything is created up front to ensure fair measurements afterwards)
|
94
|
+
void bones_initialize_target(void) {
|
95
|
+
cl_int bones_errors;
|
96
|
+
|
97
|
+
// Get OpenCL platform count
|
98
|
+
cl_uint bones_num_platforms;
|
99
|
+
bones_errors = clGetPlatformIDs(0,NULL,&bones_num_platforms); error_check(bones_errors);
|
100
|
+
if (bones_num_platforms == 0) { printf("Error: No OpenCL platforms found.\n"); exit(1); }
|
101
|
+
|
102
|
+
// Get all OpenCL platform IDs
|
103
|
+
cl_platform_id bones_platform_ids[10];
|
104
|
+
bones_errors = clGetPlatformIDs(bones_num_platforms,bones_platform_ids,NULL); error_check(bones_errors);
|
105
|
+
|
106
|
+
// Select the AMD APP platform
|
107
|
+
char bones_buffer[1024];
|
108
|
+
cl_uint bones_platform;
|
109
|
+
for(cl_uint bones_platform_id=0; bones_platform_id<bones_num_platforms; bones_platform_id++) {
|
110
|
+
clGetPlatformInfo(bones_platform_ids[bones_platform_id], CL_PLATFORM_NAME, 1024, bones_buffer, NULL);
|
111
|
+
if(strstr(bones_buffer,"Intel") != NULL) { bones_platform = bones_platform_id; break; }
|
112
|
+
}
|
113
|
+
|
114
|
+
// Get a CPU device on the platform
|
115
|
+
bones_errors = clGetDeviceIDs(bones_platform_ids[bones_platform], CL_DEVICE_TYPE_CPU, 1, &bones_device, NULL); error_check(bones_errors);
|
116
|
+
bones_errors = clGetDeviceInfo(bones_device, CL_DEVICE_NAME, sizeof(bones_buffer), bones_buffer, NULL); error_check(bones_errors);
|
117
|
+
|
118
|
+
// Create a context
|
119
|
+
bones_context = clCreateContext(NULL,1,&bones_device,NULL,NULL,&bones_errors); error_check(bones_errors);
|
120
|
+
|
121
|
+
// Create a command queue
|
122
|
+
bones_queue = clCreateCommandQueue(bones_context,bones_device,CL_QUEUE_PROFILING_ENABLE,&bones_errors); error_check(bones_errors);
|
123
|
+
|
124
|
+
// Create space on the device
|
125
|
+
cl_mem bones_device_data = clCreateBuffer(bones_context,CL_MEM_READ_WRITE,4,NULL,&bones_errors); error_check(bones_errors);
|
126
|
+
|
127
|
+
// Copy something to the device
|
128
|
+
bones_device_data = clCreateBuffer(bones_context,CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,4,bones_buffer,NULL);
|
129
|
+
|
130
|
+
// Clean-up the OpenCL context
|
131
|
+
strcpy(bones_last_program,"");
|
132
|
+
clReleaseMemObject(bones_device_data);
|
133
|
+
fflush(stdout);
|
134
|
+
}
|
135
|
+
|
136
|
+
// Declaration of the original function
|
137
|
+
int bones_main(void);
|
138
|
+
|
139
|
+
// New main function for initialisation and clean-up
|
140
|
+
int main(void) {
|
141
|
+
|
142
|
+
// Initialisation
|
143
|
+
bones_initialize_target();
|
144
|
+
|
145
|
+
// Original main function
|
146
|
+
int bones_return = bones_main();
|
147
|
+
|
148
|
+
// Clean-up
|
149
|
+
clReleaseCommandQueue(bones_queue);
|
150
|
+
clReleaseProgram(bones_program);
|
151
|
+
clReleaseContext(bones_context);
|
152
|
+
return bones_return;
|
153
|
+
}
|
154
|
+
|
@@ -0,0 +1,31 @@
|
|
1
|
+
#include <stdlib.h>
|
2
|
+
|
3
|
+
// Allocate a 128-byte aligned pointer
|
4
|
+
// Allocate 'bones_size' bytes at a 128-byte aligned address.
// The distance from the aligned address back to the start of the underlying
// malloc block is stored as an int directly in front of the returned pointer.
// Returns NULL when the allocation fails or the padded size would overflow.
void *bones_malloc_128(size_t bones_size) {

  // Extra room: up to 128 bytes of alignment slack plus the stored offset
  const size_t bones_overhead = 128 + sizeof(int);
  if (bones_size > (size_t)-1 - bones_overhead) { return(NULL); }

  // Allocate the memory plus a little bit extra
  char *bones_pointer = (char *)malloc(bones_size + bones_overhead);
  if(bones_pointer==NULL) { return(NULL); }

  // Skip the room for the offset, then advance to the next 128-byte boundary
  // (always advances by 1..128 bytes, so the int in front fits in the block)
  char *bones_pointer2 = bones_pointer + sizeof(int);
  char *bones_aligned_pointer = bones_pointer2 + (128 - ((size_t)bones_pointer2 & 127));

  // Store the padding size just in front of the aligned pointer
  bones_pointer2 = bones_aligned_pointer - sizeof(int);
  *((int *)bones_pointer2) = (int)(bones_aligned_pointer - bones_pointer);

  // Return the 128-byte aligned pointer
  return (bones_aligned_pointer);
}
|
24
|
+
|
25
|
+
// Free the 128-byte aligned pointer
|
26
|
+
// Free a pointer obtained from bones_malloc_128. The int stored directly in
// front of the aligned pointer gives the distance back to the start of the
// underlying malloc block. Like free(), a NULL pointer is ignored.
void bones_free_128(void *bones_pointer) {
  if (bones_pointer == NULL) { return; }
  const int bones_padding = *((int *)bones_pointer - 1);
  free((char *)bones_pointer - bones_padding);
}
|
31
|
+
|
@@ -0,0 +1,5 @@
|
|
1
|
+
|
2
|
+
// Perform a zero-copy of <array> from device to host
|
3
|
+
void* bones_pointer_to_<array> = clEnqueueMapBuffer(bones_queue,device_<array>,CL_TRUE,CL_MAP_READ,<offset>,<variable_dimensions>*sizeof(<type>),0,NULL,NULL,&bones_errors); error_check(bones_errors);
|
4
|
+
clEnqueueUnmapMemObject(bones_queue,device_<array>,bones_pointer_to_<array>,0,NULL,NULL);
|
5
|
+
clFinish(bones_queue);
|
@@ -0,0 +1,4 @@
|
|
1
|
+
|
2
|
+
// Create a device pointer for <array> (zero-copy)
|
3
|
+
cl_mem device_<array> = clCreateBuffer(
  bones_context,
  CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, // the buffer uses the host array as its storage
  <variable_dimensions>*sizeof(<type>),
  <array><flatten>,
  &bones_errors);
error_check(bones_errors);
// Alternative kept for reference: a separate device allocation without a host pointer
//cl_mem device_<array> = clCreateBuffer(bones_context, CL_MEM_READ_WRITE, <variable_dimensions>*sizeof(<type>), NULL, &bones_errors); error_check(bones_errors);
|
@@ -0,0 +1,24 @@
|
|
1
|
+
fflush(stdout);
|
2
|
+
cl_int bones_errors;
|
3
|
+
cl_event bones_event;
|
4
|
+
|
5
|
+
// Only compile if this program is different from the last one
|
6
|
+
if (strcmp(bones_last_program,"<algorithm_filename>") != 0) {
|
7
|
+
strcpy(bones_last_program,"<algorithm_filename>");
|
8
|
+
|
9
|
+
// Load and compile the kernel
|
10
|
+
char *bones_source = get_source("<algorithm_filename>_device.cl");
|
11
|
+
bones_program = clCreateProgramWithSource(bones_context,1,(const char **)&bones_source,NULL,&bones_errors); error_check(bones_errors);
|
12
|
+
bones_errors = clBuildProgram(bones_program,0,NULL,"",NULL,NULL);
|
13
|
+
|
14
|
+
// Get and print the compiler log
|
15
|
+
char* bones_log;
|
16
|
+
size_t bones_log_size;
|
17
|
+
clGetProgramBuildInfo(bones_program,bones_device,CL_PROGRAM_BUILD_LOG,0,NULL,&bones_log_size);
|
18
|
+
bones_log = (char*)malloc((bones_log_size+1)*sizeof(char));
|
19
|
+
clGetProgramBuildInfo(bones_program,bones_device,CL_PROGRAM_BUILD_LOG,bones_log_size,bones_log, NULL);
|
20
|
+
bones_log[bones_log_size] = '\0';
|
21
|
+
if (strcmp(bones_log,"\n") != 0 && strcmp(bones_log,"") != 0) { printf("--------- \n--- Compilation log:\n--------- \n%s\n",bones_log); }
|
22
|
+
free(bones_log);
|
23
|
+
error_check(bones_errors);
|
24
|
+
}
|
@@ -0,0 +1,9 @@
|
|
1
|
+
|
2
|
+
// End the timer for the measurement of the kernel and memory copy execution time
|
3
|
+
#if (ITERS == 1)
  // Drain the queue so that all device work is included, then take the end time
  clFinish(bones_queue);
  struct timeval bones_end_time1;
  gettimeofday(&bones_end_time1, NULL);

  // Elapsed wall-clock time: microseconds since the start time, scaled to ms
  float bones_timer1 = 0.001 * (1000000*(bones_end_time1.tv_sec-bones_start_time1.tv_sec)+bones_end_time1.tv_usec-bones_start_time1.tv_usec);
  printf(">>>\t\t (<algorithm_basename>): Execution time [kernel+memcpy]: %.3lf ms \n", bones_timer1);
#endif
|
@@ -0,0 +1,16 @@
|
|
1
|
+
|
2
|
+
// Start the timer for the measurement of the kernel execution time
|
3
|
+
clFinish(bones_queue);
|
4
|
+
for (int bones_iter=0; bones_iter<ITERS; bones_iter++) {
|
5
|
+
|
6
|
+
// Flush the CPU cache (for measurement purposes only)
|
7
|
+
const int bones_flush_size = 4*1024*1024; // (16MB)
|
8
|
+
int bones_flush_i;
|
9
|
+
int bones_flush_j;
|
10
|
+
char *bones_flush_c = (char *)malloc(bones_flush_size);
|
11
|
+
for (bones_flush_i=0; bones_flush_i<10; bones_flush_i++) {
|
12
|
+
for (bones_flush_j=0; bones_flush_j<bones_flush_size; bones_flush_j++) {
|
13
|
+
bones_flush_c[bones_flush_j] = bones_flush_i*bones_flush_j;
|
14
|
+
}
|
15
|
+
}
|
16
|
+
free(bones_flush_c);
|
@@ -0,0 +1,11 @@
|
|
1
|
+
|
2
|
+
}
|
3
|
+
|
4
|
+
// Stop the timer for the measurement of the kernel execution time
|
5
|
+
clFinish(bones_queue);
|
6
|
+
cl_ulong end2, start2;
|
7
|
+
bones_errors = clWaitForEvents(1, &bones_event); error_check(bones_errors);
|
8
|
+
bones_errors = clGetEventProfilingInfo(bones_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end2, 0); error_check(bones_errors);
|
9
|
+
bones_errors = clGetEventProfilingInfo(bones_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start2, 0); error_check(bones_errors);
|
10
|
+
float bones_timer2 = 0.000001 * (end2-start2);
|
11
|
+
printf(">>>\t\t (<algorithm_basename>): Execution time [kernel ]: %.3lf ms \n", bones_timer2);
|
@@ -0,0 +1,67 @@
|
|
1
|
+
|
2
|
+
// Store the initial value
|
3
|
+
cl_mem bones_initial_value = clCreateBuffer(bones_context,CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,sizeof(<out0_type>),<out0_name>,&bones_errors); error_check(bones_errors);
|
4
|
+
|
5
|
+
// Create the kernels
|
6
|
+
cl_kernel bones_kernel_<algorithm_name>_0 = clCreateKernel(bones_program, "bones_kernel_<algorithm_name>_0", &bones_errors); error_check(bones_errors);
|
7
|
+
cl_kernel bones_kernel_<algorithm_name>_1 = clCreateKernel(bones_program, "bones_kernel_<algorithm_name>_1", &bones_errors); error_check(bones_errors);
|
8
|
+
cl_kernel bones_kernel_<algorithm_name>_2 = clCreateKernel(bones_program, "bones_kernel_<algorithm_name>_2", &bones_errors); error_check(bones_errors);
|
9
|
+
|
10
|
+
// Run either one kernel or multiple kernels
|
11
|
+
if (<in0_dimensions> <= 512) {
|
12
|
+
|
13
|
+
// Set all the arguments to the kernel function
|
14
|
+
int bones_num_args = 3;
|
15
|
+
int bones_dimensions = <in0_dimensions>;
|
16
|
+
clSetKernelArg(bones_kernel_<algorithm_name>_0,0,sizeof(bones_dimensions),(void*)&bones_dimensions);
|
17
|
+
clSetKernelArg(bones_kernel_<algorithm_name>_0,1,sizeof(<in0_devicename>),(void*)&<in0_devicename>);
|
18
|
+
clSetKernelArg(bones_kernel_<algorithm_name>_0,2,sizeof(<out0_devicename>),(void*)&<out0_devicename>);
|
19
|
+
<kernel_argument_list_constants>
|
20
|
+
// Start only one kernel
|
21
|
+
const int bones_num_threads = DIV_CEIL(<in0_dimensions>,2);
|
22
|
+
size_t bones_local_worksize1[] = {bones_num_threads};
|
23
|
+
size_t bones_global_worksize1[] = {bones_num_threads};
|
24
|
+
bones_errors = clEnqueueNDRangeKernel(bones_queue,bones_kernel_<algorithm_name>_0,1,NULL,bones_global_worksize1,bones_local_worksize1,0,NULL,&bones_event); error_check(bones_errors);
|
25
|
+
|
26
|
+
}
|
27
|
+
else {
|
28
|
+
|
29
|
+
// Allocate space for an intermediate array
|
30
|
+
cl_mem bones_device_temp = clCreateBuffer(bones_context,CL_MEM_READ_WRITE,128*sizeof(<out0_type>),NULL,&bones_errors); error_check(bones_errors);
|
31
|
+
|
32
|
+
// Set all the arguments to the kernel function
|
33
|
+
int bones_num_args = 3;
|
34
|
+
int bones_dimensions = <in0_dimensions>;
|
35
|
+
clSetKernelArg(bones_kernel_<algorithm_name>_0,0,sizeof(bones_dimensions),(void*)&bones_dimensions);
|
36
|
+
clSetKernelArg(bones_kernel_<algorithm_name>_0,1,sizeof(<in0_devicename>),(void*)&<in0_devicename>);
|
37
|
+
clSetKernelArg(bones_kernel_<algorithm_name>_0,2,sizeof(bones_device_temp),(void*)&bones_device_temp);
|
38
|
+
<kernel_argument_list_constants>
|
39
|
+
// Start the first kernel
|
40
|
+
size_t bones_local_worksize1[] = {256};
|
41
|
+
size_t bones_global_worksize1[] = {256*128};
|
42
|
+
bones_errors = clEnqueueNDRangeKernel(bones_queue,bones_kernel_<algorithm_name>_0,1,NULL,bones_global_worksize1,bones_local_worksize1,0,NULL,&bones_event); error_check(bones_errors);
|
43
|
+
|
44
|
+
// Set all the arguments to the kernel function
|
45
|
+
clSetKernelArg(bones_kernel_<algorithm_name>_1,0,sizeof(bones_device_temp),(void*)&bones_device_temp);
|
46
|
+
clSetKernelArg(bones_kernel_<algorithm_name>_1,1,sizeof(<out0_devicename>),(void*)&<out0_devicename>);
|
47
|
+
// Start the second kernel
|
48
|
+
size_t bones_local_worksize2[] = {128};
|
49
|
+
size_t bones_global_worksize2[] = {128};
|
50
|
+
bones_errors = clEnqueueNDRangeKernel(bones_queue,bones_kernel_<algorithm_name>_1,1,NULL,bones_global_worksize2,bones_local_worksize2,0,NULL,&bones_event); error_check(bones_errors);
|
51
|
+
clReleaseMemObject(bones_device_temp);
|
52
|
+
}
|
53
|
+
|
54
|
+
// Set all the arguments to the kernel function
|
55
|
+
clSetKernelArg(bones_kernel_<algorithm_name>_2,0,sizeof(bones_initial_value),(void*)&bones_initial_value);
|
56
|
+
clSetKernelArg(bones_kernel_<algorithm_name>_2,1,sizeof(<out0_devicename>),(void*)&<out0_devicename>);
|
57
|
+
// Perform the last computation (only needed if there is an initial value)
|
58
|
+
size_t bones_local_worksize3[] = {1};
|
59
|
+
size_t bones_global_worksize3[] = {1};
|
60
|
+
bones_errors = clEnqueueNDRangeKernel(bones_queue,bones_kernel_<algorithm_name>_2,1,NULL,bones_global_worksize3,bones_local_worksize3,0,NULL,&bones_event); error_check(bones_errors);
|
61
|
+
clReleaseMemObject(bones_initial_value);
|
62
|
+
|
63
|
+
// Synchronize and clean-up the kernels
|
64
|
+
clFinish(bones_queue);
|
65
|
+
clReleaseKernel(bones_kernel_<algorithm_name>_0);
|
66
|
+
clReleaseKernel(bones_kernel_<algorithm_name>_1);
|
67
|
+
clReleaseKernel(bones_kernel_<algorithm_name>_2);
|