bones-compiler 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +117 -0
- data/LICENSE +9 -0
- data/README.rdoc +126 -0
- data/Rakefile +107 -0
- data/VERSION +1 -0
- data/bin/bones +20 -0
- data/examples/applications/ffos.c +552 -0
- data/examples/benchmarks/2mm.c +70 -0
- data/examples/benchmarks/3mm.c +81 -0
- data/examples/benchmarks/adi.c +81 -0
- data/examples/benchmarks/atax.c +65 -0
- data/examples/benchmarks/bicg.c +67 -0
- data/examples/benchmarks/cholesky.c +64 -0
- data/examples/benchmarks/common.h +168 -0
- data/examples/benchmarks/correlation.c +97 -0
- data/examples/benchmarks/covariance.c +77 -0
- data/examples/benchmarks/doitgen.c +63 -0
- data/examples/benchmarks/durbin.c +76 -0
- data/examples/benchmarks/dynprog.c +67 -0
- data/examples/benchmarks/fdtd-2d-apml.c +114 -0
- data/examples/benchmarks/fdtd-2d.c +74 -0
- data/examples/benchmarks/floyd-warshall.c +50 -0
- data/examples/benchmarks/gemm.c +69 -0
- data/examples/benchmarks/gemver.c +89 -0
- data/examples/benchmarks/gesummv.c +64 -0
- data/examples/benchmarks/gramschmidt.c +84 -0
- data/examples/benchmarks/jacobi-1d-imper.c +55 -0
- data/examples/benchmarks/jacobi-2d-imper.c +61 -0
- data/examples/benchmarks/lu.c +57 -0
- data/examples/benchmarks/ludcmp.c +91 -0
- data/examples/benchmarks/mvt.c +65 -0
- data/examples/benchmarks/overview.txt +38 -0
- data/examples/benchmarks/reg_detect.c +82 -0
- data/examples/benchmarks/saxpy.c +45 -0
- data/examples/benchmarks/seidel-2d.c +51 -0
- data/examples/benchmarks/symm.c +74 -0
- data/examples/benchmarks/syr2k.c +65 -0
- data/examples/benchmarks/syrk.c +62 -0
- data/examples/benchmarks/trisolv.c +57 -0
- data/examples/benchmarks/trmm.c +57 -0
- data/examples/chunk/example1.c +54 -0
- data/examples/chunk/example2.c +44 -0
- data/examples/chunk/example3.c +59 -0
- data/examples/chunk/example4.c +55 -0
- data/examples/chunk/example5.c +52 -0
- data/examples/element/example1.c +46 -0
- data/examples/element/example10.c +50 -0
- data/examples/element/example11.c +47 -0
- data/examples/element/example12.c +56 -0
- data/examples/element/example2.c +46 -0
- data/examples/element/example3.c +58 -0
- data/examples/element/example4.c +49 -0
- data/examples/element/example5.c +56 -0
- data/examples/element/example6.c +46 -0
- data/examples/element/example7.c +54 -0
- data/examples/element/example8.c +45 -0
- data/examples/element/example9.c +48 -0
- data/examples/neighbourhood/example1.c +54 -0
- data/examples/neighbourhood/example2.c +55 -0
- data/examples/neighbourhood/example3.c +82 -0
- data/examples/neighbourhood/example4.c +52 -0
- data/examples/shared/example1.c +45 -0
- data/examples/shared/example2.c +51 -0
- data/examples/shared/example3.c +55 -0
- data/examples/shared/example4.c +52 -0
- data/examples/shared/example5.c +48 -0
- data/lib/bones.rb +266 -0
- data/lib/bones/algorithm.rb +541 -0
- data/lib/bones/engine.rb +386 -0
- data/lib/bones/preprocessor.rb +161 -0
- data/lib/bones/species.rb +196 -0
- data/lib/bones/structure.rb +94 -0
- data/lib/bones/variable.rb +169 -0
- data/lib/bones/variablelist.rb +72 -0
- data/lib/castaddon.rb +27 -0
- data/lib/castaddon/index.rb +40 -0
- data/lib/castaddon/node.rb +753 -0
- data/lib/castaddon/type.rb +37 -0
- data/skeletons/CPU-C/common/epilogue.c +0 -0
- data/skeletons/CPU-C/common/globals.c +17 -0
- data/skeletons/CPU-C/common/globals_kernel.c +1 -0
- data/skeletons/CPU-C/common/header.c +0 -0
- data/skeletons/CPU-C/common/mem_copy_D2H.c +0 -0
- data/skeletons/CPU-C/common/mem_copy_H2D.c +0 -0
- data/skeletons/CPU-C/common/mem_epilogue.c +0 -0
- data/skeletons/CPU-C/common/mem_prologue.c +3 -0
- data/skeletons/CPU-C/common/prologue.c +0 -0
- data/skeletons/CPU-C/common/timer_1_start.c +0 -0
- data/skeletons/CPU-C/common/timer_1_stop.c +0 -0
- data/skeletons/CPU-C/common/timer_2_start.c +20 -0
- data/skeletons/CPU-C/common/timer_2_stop.c +8 -0
- data/skeletons/CPU-C/kernel/default.host.c +3 -0
- data/skeletons/CPU-C/kernel/default.kernel.c +15 -0
- data/skeletons/CPU-C/skeletons.txt +24 -0
- data/skeletons/CPU-OPENCL-AMD/common/epilogue.c +6 -0
- data/skeletons/CPU-OPENCL-AMD/common/globals.c +155 -0
- data/skeletons/CPU-OPENCL-AMD/common/globals_kernel.c +4 -0
- data/skeletons/CPU-OPENCL-AMD/common/header.c +0 -0
- data/skeletons/CPU-OPENCL-AMD/common/mem_copy_D2H.c +8 -0
- data/skeletons/CPU-OPENCL-AMD/common/mem_copy_H2D.c +4 -0
- data/skeletons/CPU-OPENCL-AMD/common/mem_epilogue.c +3 -0
- data/skeletons/CPU-OPENCL-AMD/common/mem_prologue.c +6 -0
- data/skeletons/CPU-OPENCL-AMD/common/prologue.c +24 -0
- data/skeletons/CPU-OPENCL-AMD/common/timer_1_start.c +5 -0
- data/skeletons/CPU-OPENCL-AMD/common/timer_1_stop.c +9 -0
- data/skeletons/CPU-OPENCL-AMD/common/timer_2_start.c +16 -0
- data/skeletons/CPU-OPENCL-AMD/common/timer_2_stop.c +11 -0
- data/skeletons/CPU-OPENCL-AMD/kernel/D-element-to-1-shared.host.c +67 -0
- data/skeletons/CPU-OPENCL-AMD/kernel/D-element-to-1-shared.kernel.cl +72 -0
- data/skeletons/CPU-OPENCL-AMD/kernel/default.host.c +14 -0
- data/skeletons/CPU-OPENCL-AMD/kernel/default.kernel.cl +13 -0
- data/skeletons/CPU-OPENCL-AMD/skeletons.txt +26 -0
- data/skeletons/CPU-OPENCL-INTEL/common/epilogue.c +3 -0
- data/skeletons/CPU-OPENCL-INTEL/common/globals.c +154 -0
- data/skeletons/CPU-OPENCL-INTEL/common/globals_kernel.c +4 -0
- data/skeletons/CPU-OPENCL-INTEL/common/header.c +31 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_D2H.c +5 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_H2D.c +3 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_epilogue.c +3 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_prologue.c +4 -0
- data/skeletons/CPU-OPENCL-INTEL/common/prologue.c +24 -0
- data/skeletons/CPU-OPENCL-INTEL/common/timer_1_start.c +5 -0
- data/skeletons/CPU-OPENCL-INTEL/common/timer_1_stop.c +9 -0
- data/skeletons/CPU-OPENCL-INTEL/common/timer_2_start.c +16 -0
- data/skeletons/CPU-OPENCL-INTEL/common/timer_2_stop.c +11 -0
- data/skeletons/CPU-OPENCL-INTEL/kernel/D-element-to-1-shared.host.c +67 -0
- data/skeletons/CPU-OPENCL-INTEL/kernel/D-element-to-1-shared.kernel.cl +72 -0
- data/skeletons/CPU-OPENCL-INTEL/kernel/default.host.c +14 -0
- data/skeletons/CPU-OPENCL-INTEL/kernel/default.kernel.cl +13 -0
- data/skeletons/CPU-OPENCL-INTEL/skeletons.txt +26 -0
- data/skeletons/CPU-OPENMP/common/epilogue.c +0 -0
- data/skeletons/CPU-OPENMP/common/globals.c +37 -0
- data/skeletons/CPU-OPENMP/common/globals_kernel.c +6 -0
- data/skeletons/CPU-OPENMP/common/header.c +0 -0
- data/skeletons/CPU-OPENMP/common/mem_copy_D2H.c +0 -0
- data/skeletons/CPU-OPENMP/common/mem_copy_H2D.c +0 -0
- data/skeletons/CPU-OPENMP/common/mem_epilogue.c +0 -0
- data/skeletons/CPU-OPENMP/common/mem_prologue.c +3 -0
- data/skeletons/CPU-OPENMP/common/prologue.c +0 -0
- data/skeletons/CPU-OPENMP/common/timer_1_start.c +12 -0
- data/skeletons/CPU-OPENMP/common/timer_1_stop.c +0 -0
- data/skeletons/CPU-OPENMP/common/timer_2_start.c +18 -0
- data/skeletons/CPU-OPENMP/common/timer_2_stop.c +8 -0
- data/skeletons/CPU-OPENMP/kernel/D-element-to-1-shared.host.c +27 -0
- data/skeletons/CPU-OPENMP/kernel/D-element-to-1-shared.kernel.c +46 -0
- data/skeletons/CPU-OPENMP/kernel/default.host.c +11 -0
- data/skeletons/CPU-OPENMP/kernel/default.kernel.c +18 -0
- data/skeletons/CPU-OPENMP/skeletons.txt +26 -0
- data/skeletons/GPU-CUDA/common/epilogue.c +0 -0
- data/skeletons/GPU-CUDA/common/globals.c +31 -0
- data/skeletons/GPU-CUDA/common/globals_kernel.c +4 -0
- data/skeletons/GPU-CUDA/common/header.c +0 -0
- data/skeletons/GPU-CUDA/common/mem_copy_D2H.c +3 -0
- data/skeletons/GPU-CUDA/common/mem_copy_H2D.c +3 -0
- data/skeletons/GPU-CUDA/common/mem_epilogue.c +3 -0
- data/skeletons/GPU-CUDA/common/mem_prologue.c +5 -0
- data/skeletons/GPU-CUDA/common/prologue.c +6 -0
- data/skeletons/GPU-CUDA/common/timer_1_start.c +6 -0
- data/skeletons/GPU-CUDA/common/timer_1_stop.c +10 -0
- data/skeletons/GPU-CUDA/common/timer_2_start.c +6 -0
- data/skeletons/GPU-CUDA/common/timer_2_stop.c +10 -0
- data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.kernel.cu +105 -0
- data/skeletons/GPU-CUDA/kernel/D-element-to-1-shared.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/D-element-to-1-shared.kernel.cu +119 -0
- data/skeletons/GPU-CUDA/kernel/D-element-to-N-shared.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/D-element-to-N-shared.kernel.cu +166 -0
- data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.kernel.cu +69 -0
- data/skeletons/GPU-CUDA/kernel/N-neighbourhood-N-to-N-element.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/N-neighbourhood-N-to-N-element.kernel.cu +42 -0
- data/skeletons/GPU-CUDA/kernel/default.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/default.kernel.cu +28 -0
- data/skeletons/GPU-CUDA/skeletons.txt +30 -0
- data/skeletons/GPU-OPENCL-AMD/common/epilogue.c +3 -0
- data/skeletons/GPU-OPENCL-AMD/common/globals.c +155 -0
- data/skeletons/GPU-OPENCL-AMD/common/globals_kernel.c +4 -0
- data/skeletons/GPU-OPENCL-AMD/common/header.c +0 -0
- data/skeletons/GPU-OPENCL-AMD/common/mem_copy_D2H.c +4 -0
- data/skeletons/GPU-OPENCL-AMD/common/mem_copy_H2D.c +4 -0
- data/skeletons/GPU-OPENCL-AMD/common/mem_epilogue.c +3 -0
- data/skeletons/GPU-OPENCL-AMD/common/mem_prologue.c +3 -0
- data/skeletons/GPU-OPENCL-AMD/common/prologue.c +24 -0
- data/skeletons/GPU-OPENCL-AMD/common/timer_1_start.c +5 -0
- data/skeletons/GPU-OPENCL-AMD/common/timer_1_stop.c +9 -0
- data/skeletons/GPU-OPENCL-AMD/common/timer_2_start.c +4 -0
- data/skeletons/GPU-OPENCL-AMD/common/timer_2_stop.c +11 -0
- data/skeletons/GPU-OPENCL-AMD/kernel/D-element-to-1-shared.host.c +67 -0
- data/skeletons/GPU-OPENCL-AMD/kernel/D-element-to-1-shared.kernel.cl +72 -0
- data/skeletons/GPU-OPENCL-AMD/kernel/default.host.c +14 -0
- data/skeletons/GPU-OPENCL-AMD/kernel/default.kernel.cl +13 -0
- data/skeletons/GPU-OPENCL-AMD/skeletons.txt +26 -0
- data/skeletons/verification/header.c +2 -0
- data/skeletons/verification/timer_start.c +4 -0
- data/skeletons/verification/timer_stop.c +6 -0
- data/skeletons/verification/verify_results.c +23 -0
- data/test/bones/test_algorithm.rb +40 -0
- data/test/bones/test_common.rb +54 -0
- data/test/bones/test_preprocessor.rb +46 -0
- data/test/bones/test_species.rb +21 -0
- data/test/bones/test_variable.rb +84 -0
- data/test/test_helper.rb +106 -0
- metadata +303 -0
@@ -0,0 +1,69 @@
|
|
1
|
+
/* STARTDEF
|
2
|
+
void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>);
|
3
|
+
ENDDEF */
|
4
|
+
#define SHUFFLE_X 16
|
5
|
+
#define SHUFFLE_Y 16
|
6
|
+
|
7
|
+
// Start of the <algorithm_name> kernel
|
8
|
+
__global__ void bones_kernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
|
9
|
+
const int bones_global_id = blockIdx.x*blockDim.x + threadIdx.x;
|
10
|
+
if (bones_global_id < (<parallelism>)) {
|
11
|
+
|
12
|
+
// Calculate the global ID(s) based on the thread id
|
13
|
+
<ids>
|
14
|
+
|
15
|
+
// Start the computation
|
16
|
+
<algorithm_code1>
|
17
|
+
}
|
18
|
+
}
|
19
|
+
|
20
|
+
// Start of the <algorithm_name> kernel (pre-kernel for shuffling)
// Transposes <in0_name>, viewed as a 2D array with (<in0_parameters>) rows of
// ((<in0_dimensions>)/(<in0_parameters>)) elements each, into shuffled_<in0_name>.
// Each SHUFFLE_X x SHUFFLE_Y tile is staged in shared memory in between.
__global__ void bones_kernel_<algorithm_name>_1(<in0_type><in0_devicepointer> <in0_name>, <in0_type><in0_devicepointer> shuffled_<in0_name>, <argument_definition>) {
  const int bones_global_id_0 = blockIdx.x*blockDim.x + threadIdx.x;
  const int bones_global_id_1 = blockIdx.y*blockDim.y + threadIdx.y;

  // Set-up the local memory for shuffling
  __shared__ <in0_type> buffer[SHUFFLE_X][SHUFFLE_Y];

  // Swap the x and y coordinates to perform the rotation (coalesced)
  if (bones_global_id_0 < ((<in0_dimensions>)/(<in0_parameters>)) && bones_global_id_1 < (<in0_parameters>)) {
    buffer[threadIdx.y][threadIdx.x] = <in0_name>[bones_global_id_0 + bones_global_id_1 * ((<in0_dimensions>)/(<in0_parameters>))];
  }

  // Synchronize all threads in the threadblock
  __syncthreads();

  // We don't have to swap the x and y thread indices here, because that's already done in the local memory
  // Only the block indices are exchanged: bones_global_id_0_new now runs along the
  // (<in0_parameters>) axis and bones_global_id_1_new along the other axis.
  const int bones_global_id_0_new = blockIdx.y*blockDim.y + threadIdx.x;
  const int bones_global_id_1_new = blockIdx.x*blockDim.x + threadIdx.y;

  // Store the shuffled result (coalesced)
  // The bounds follow the exchanged axes, so exactly those threads store whose
  // buffer element was loaded above (this also holds for non-square inputs).
  if (bones_global_id_0_new < (<in0_parameters>) && bones_global_id_1_new < ((<in0_dimensions>)/(<in0_parameters>))) {
    shuffled_<in0_name>[bones_global_id_0_new + bones_global_id_1_new * (<in0_parameters>)] = buffer[threadIdx.x][threadIdx.y];
  }
}
|
45
|
+
|
46
|
+
// Function to start the kernel
|
47
|
+
extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
|
48
|
+
int bones_block_size;
|
49
|
+
if (<parallelism> >= 64*512) { bones_block_size = 512;}
|
50
|
+
else if (<parallelism> >= 64*256) { bones_block_size = 256;}
|
51
|
+
else if (<parallelism> >= 64*128) { bones_block_size = 128;}
|
52
|
+
else if (<parallelism> >= 64*64 ) { bones_block_size = 64; }
|
53
|
+
else { bones_block_size = 32; }
|
54
|
+
|
55
|
+
// First perform some pre-shuffling
|
56
|
+
<in0_type>* shuffled_<in0_name> = 0;
|
57
|
+
cudaMalloc((void**)&shuffled_<in0_name>, <in0_dimensions>*sizeof(<in0_type>));
|
58
|
+
dim3 bones_threads1(SHUFFLE_X,SHUFFLE_Y);
|
59
|
+
dim3 bones_grid1(DIV_CEIL(((<in0_dimensions>)/(<in0_parameters>)),SHUFFLE_X),DIV_CEIL(<in0_parameters>,SHUFFLE_Y));
|
60
|
+
bones_kernel_<algorithm_name>_1<<< bones_grid1, bones_threads1 >>>(<in0_name>, shuffled_<in0_name>, <argument_name>);
|
61
|
+
<in0_type>* temp_<in0_name> = <in0_name>;
|
62
|
+
<in0_name> = shuffled_<in0_name>;
|
63
|
+
cudaFree(temp_<in0_name>);
|
64
|
+
|
65
|
+
// Then run the original kernel
|
66
|
+
dim3 bones_threads0(bones_block_size);
|
67
|
+
dim3 bones_grid0(DIV_CEIL(<parallelism>,bones_block_size));
|
68
|
+
bones_kernel_<algorithm_name>_0<<< bones_grid0, bones_threads0 >>>(<names>, <argument_name>);
|
69
|
+
}
|
@@ -0,0 +1,42 @@
|
|
1
|
+
/* STARTDEF
|
2
|
+
void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>);
|
3
|
+
ENDDEF */
|
4
|
+
// Start of the <algorithm_name> kernel
|
5
|
+
__global__ void bones_kernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
|
6
|
+
const int bones_global_id = blockIdx.x*blockDim.x + threadIdx.x;
|
7
|
+
int bones_local_id = threadIdx.x;
|
8
|
+
if (bones_global_id < <in0_dimensions>) {
|
9
|
+
|
10
|
+
// Calculate the local and global ID(s) based on the thread id
|
11
|
+
int bones_local_id_0 = bones_local_id;
|
12
|
+
<out0_ids>
|
13
|
+
|
14
|
+
// Load the input data into local memory
|
15
|
+
__shared__ <in0_type> bones_local_memory_<in0_name>[512+<in0_parameter0_sum>];
|
16
|
+
bones_local_id_0 = bones_local_id_0-(<in0_parameter0_from>);
|
17
|
+
bones_local_memory_<in0_name>[bones_local_id_0] = <in0_name>[bones_global_id_0];
|
18
|
+
|
19
|
+
// Load the left border into local memory
|
20
|
+
if (threadIdx.x < -(<in0_parameter0_from>)) {
|
21
|
+
bones_local_memory_<in0_name>[bones_local_id_0+<in0_parameter0_from>] = <in0_name>[bones_global_id_0+<in0_parameter0_from>];
|
22
|
+
}
|
23
|
+
|
24
|
+
// Load the right border into local memory
|
25
|
+
if ((threadIdx.x >= 512-<in0_parameter0_to>) || (bones_global_id_0 >= <in0_dimensions>-<in0_parameter0_to>)) {
|
26
|
+
bones_local_memory_<in0_name>[bones_local_id_0+<in0_parameter0_to>] = <in0_name>[bones_global_id_0+<in0_parameter0_to>];
|
27
|
+
}
|
28
|
+
|
29
|
+
// Synchronize all the threads in a threadblock
|
30
|
+
__syncthreads();
|
31
|
+
|
32
|
+
// Perform the main computation
|
33
|
+
<algorithm_code1>
|
34
|
+
}
|
35
|
+
}
|
36
|
+
|
37
|
+
// Function to start the kernel
|
38
|
+
extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
|
39
|
+
dim3 bones_threads(512);
|
40
|
+
dim3 bones_grid(DIV_CEIL(<in0_dimensions>,512));
|
41
|
+
bones_kernel_<algorithm_name>_0<<< bones_grid, bones_threads >>>(<names>, <argument_name>);
|
42
|
+
}
|
@@ -0,0 +1,28 @@
|
|
1
|
+
/* STARTDEF
|
2
|
+
void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>);
|
3
|
+
ENDDEF */
|
4
|
+
// Start of the <algorithm_name> kernel
|
5
|
+
__global__ void bones_kernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
|
6
|
+
const int bones_global_id = blockIdx.x*blockDim.x + threadIdx.x;
|
7
|
+
if (bones_global_id < (<parallelism>)) {
|
8
|
+
|
9
|
+
// Calculate the global ID(s) based on the thread id
|
10
|
+
<ids>
|
11
|
+
|
12
|
+
// Start the computation
|
13
|
+
<algorithm_code1>
|
14
|
+
}
|
15
|
+
}
|
16
|
+
|
17
|
+
// Function to start the kernel
|
18
|
+
extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
|
19
|
+
int bones_block_size;
|
20
|
+
if (<parallelism> >= 64*512) { bones_block_size = 512;}
|
21
|
+
else if (<parallelism> >= 64*256) { bones_block_size = 256;}
|
22
|
+
else if (<parallelism> >= 64*128) { bones_block_size = 128;}
|
23
|
+
else if (<parallelism> >= 64*64 ) { bones_block_size = 64; }
|
24
|
+
else { bones_block_size = 32; }
|
25
|
+
dim3 bones_threads(bones_block_size);
|
26
|
+
dim3 bones_grid(DIV_CEIL(<parallelism>,bones_block_size));
|
27
|
+
bones_kernel_<algorithm_name>_0<<< bones_grid, bones_threads >>>(<names>, <argument_name>);
|
28
|
+
}
|
@@ -0,0 +1,30 @@
|
|
1
|
+
###################################################################
|
2
|
+
# Each line holds one mapping from species to skeleton
|
3
|
+
# The ordering is always ['chunk','neighbourhood','element','shared','void']
|
4
|
+
# The pattern 'full' is omitted from matching (will thus always match)
|
5
|
+
# 'D' denotes any number of ranges (e.g. D|element can be any dimension)
|
6
|
+
# 'N' denotes exactly one range of any size (e.g. N,N|element must be 2D)
|
7
|
+
# '+' denotes one or more of these patterns
|
8
|
+
###################################################################
|
9
|
+
D|chunk(D)+ -> D|chunk(D)+ :default :00
|
10
|
+
D|chunk(D)+ -> D|chunk(D)+ ^ D|element+ :default :00
|
11
|
+
D|chunk(D)+ ^ D|element+ -> D|chunk(D)+ :default :00
|
12
|
+
D|chunk(D)+ ^ D|element+ -> D|chunk(D)+ ^ D|element+ :default :00
|
13
|
+
N,N|chunk(1,N) ^ N,N|chunk(1,N)+ -> D|element+ :2xN-N-chunk-1-N-to-D-element :30
|
14
|
+
N,N|chunk(1,N) ^ N,N|chunk(1,N)+ ^ D|element+ -> D|element+ :2xN-N-chunk-1-N-to-D-element :30
|
15
|
+
N,N|chunk(1,N)+ -> D|element+ :N-N-chunk-1-N-to-D-element :20
|
16
|
+
N,N|chunk(1,N)+ ^ D|element+ -> D|element+ :N-N-chunk-1-N-to-D-element :20
|
17
|
+
N,N|chunk(D)+ -> N,N|element+ :default :40
|
18
|
+
N,N|chunk(D)+ ^ N,N|element+ -> N,N|element+ :default :40
|
19
|
+
D|chunk(D)+ -> D|element+ :default :00
|
20
|
+
D|chunk(D)+ ^ D|neighbourhood(D)+ ^ D|element+ -> D|element+ :default :00
|
21
|
+
D|chunk(D)+ ^ D|element+ -> D|element+ :default :00
|
22
|
+
N|neighbourhood(N)+ -> N|element+ :N-neighbourhood-N-to-N-element :10
|
23
|
+
D|neighbourhood(D)+ -> D|element+ :default :00
|
24
|
+
D|neighbourhood(D)+ ^ D|element+ -> D|element+ :default :00
|
25
|
+
D|element+ -> D|chunk(D)+ :default :00
|
26
|
+
D|element+ -> D|element+ :default :00
|
27
|
+
D|element -> 1|shared :D-element-to-1-shared :02 03 04 05
|
28
|
+
D|element+ -> D|shared+ :default :08
|
29
|
+
D|element+ -> D|element+ ^ D|shared+ :default :08
|
30
|
+
D|void -> D|element+ :default :00
|
@@ -0,0 +1,155 @@
|
|
1
|
+
#include <string.h>
|
2
|
+
#include <stdio.h>
|
3
|
+
#include <stdlib.h>
|
4
|
+
#include <math.h>
|
5
|
+
#include <sys/time.h>
|
6
|
+
#include <CL/cl.h>
|
7
|
+
|
8
|
+
// Helper macros. Every argument and every expansion is fully parenthesised so
// that expression arguments (e.g. DIV_CEIL(n, x+y)) expand with the intended
// precedence. Arguments can still be evaluated more than once, so do not pass
// expressions with side effects.
#define BONES_MIN(a,b) (((a) < (b)) ? (a) : (b))
#define BONES_MAX(a,b) (((a) > (b)) ? (a) : (b))
#define DIV_CEIL(a,b) (((a) + (b) - 1) / (b))
#define DIV_FLOOR(a,b) ((a) / (b))
|
12
|
+
|
13
|
+
// Multiple iterations for kernel measurements
|
14
|
+
#define ITERS 1
|
15
|
+
|
16
|
+
// Load the OpenCL kernel source from the file 'bones_filename'.
// Returns a newly allocated, NUL-terminated buffer with the file contents; the
// caller owns the buffer and releases it with free(). Returns NULL when the
// file cannot be opened, its size cannot be determined, or the allocation fails.
char * get_source(const char* bones_filename) {
  FILE* bones_fp = fopen(bones_filename,"r");
  if (bones_fp == NULL) {
    printf("--- Error: Could not open file '%s'\n", bones_filename);
    return NULL;
  }

  // Determine the file size by seeking to the end of the file
  long bones_size = -1;
  if (fseek(bones_fp,0,SEEK_END) == 0) { bones_size = ftell(bones_fp); }
  if (bones_size < 0) { fclose(bones_fp); return NULL; }
  rewind(bones_fp);

  // One extra byte for the terminating NUL
  char *bones_source = (char *)malloc(sizeof(char)*((size_t)bones_size+1));
  if (bones_source == NULL) { fclose(bones_fp); return NULL; }

  // Terminate directly after the bytes actually read, so that a short read
  // never leaves uninitialised bytes inside the returned string
  size_t bones_read = fread(bones_source,sizeof(char),(size_t)bones_size,bones_fp);
  bones_source[bones_read] = '\0';

  fclose(bones_fp);
  return bones_source;
}
|
28
|
+
|
29
|
+
// Print an error if it occurs
|
30
|
+
void error_check(cl_int bones_errors) {
|
31
|
+
if(bones_errors != CL_SUCCESS) {
|
32
|
+
switch (bones_errors) {
|
33
|
+
case CL_DEVICE_NOT_FOUND: printf("--- Error: Device not found.\n"); break;
|
34
|
+
case CL_DEVICE_NOT_AVAILABLE: printf("--- Error: Device not available\n"); break;
|
35
|
+
case CL_COMPILER_NOT_AVAILABLE: printf("--- Error: Compiler not available\n"); break;
|
36
|
+
case CL_MEM_OBJECT_ALLOCATION_FAILURE: printf("--- Error: Memory object allocation failure\n"); break;
|
37
|
+
case CL_OUT_OF_RESOURCES: printf("--- Error: Out of resources\n"); break;
|
38
|
+
case CL_OUT_OF_HOST_MEMORY: printf("--- Error: Out of host memory\n"); break;
|
39
|
+
case CL_PROFILING_INFO_NOT_AVAILABLE: printf("--- Error: Profiling information not available\n"); break;
|
40
|
+
case CL_MEM_COPY_OVERLAP: printf("--- Error: Memory copy overlap\n"); break;
|
41
|
+
case CL_IMAGE_FORMAT_MISMATCH: printf("--- Error: Image format mismatch\n"); break;
|
42
|
+
case CL_IMAGE_FORMAT_NOT_SUPPORTED: printf("--- Error: Image format not supported\n"); break;
|
43
|
+
case CL_BUILD_PROGRAM_FAILURE: printf("--- Error: Program build failure\n"); break;
|
44
|
+
case CL_MAP_FAILURE: printf("--- Error: Map failure\n"); break;
|
45
|
+
case CL_INVALID_VALUE: printf("--- Error: Invalid value\n"); break;
|
46
|
+
case CL_INVALID_DEVICE_TYPE: printf("--- Error: Invalid device type\n"); break;
|
47
|
+
case CL_INVALID_PLATFORM: printf("--- Error: Invalid platform\n"); break;
|
48
|
+
case CL_INVALID_DEVICE: printf("--- Error: Invalid device\n"); break;
|
49
|
+
case CL_INVALID_CONTEXT: printf("--- Error: Invalid context\n"); break;
|
50
|
+
case CL_INVALID_QUEUE_PROPERTIES: printf("--- Error: Invalid queue properties\n"); break;
|
51
|
+
case CL_INVALID_COMMAND_QUEUE: printf("--- Error: Invalid command queue\n"); break;
|
52
|
+
case CL_INVALID_HOST_PTR: printf("--- Error: Invalid host pointer\n"); break;
|
53
|
+
case CL_INVALID_MEM_OBJECT: printf("--- Error: Invalid memory object\n"); break;
|
54
|
+
case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: printf("--- Error: Invalid image format descriptor\n"); break;
|
55
|
+
case CL_INVALID_IMAGE_SIZE: printf("--- Error: Invalid image size\n"); break;
|
56
|
+
case CL_INVALID_SAMPLER: printf("--- Error: Invalid sampler\n"); break;
|
57
|
+
case CL_INVALID_BINARY: printf("--- Error: Invalid binary\n"); break;
|
58
|
+
case CL_INVALID_BUILD_OPTIONS: printf("--- Error: Invalid build options\n"); break;
|
59
|
+
case CL_INVALID_PROGRAM: printf("--- Error: Invalid program\n"); break;
|
60
|
+
case CL_INVALID_PROGRAM_EXECUTABLE: printf("--- Error: Invalid program executable\n"); break;
|
61
|
+
case CL_INVALID_KERNEL_NAME: printf("--- Error: Invalid kernel name\n"); break;
|
62
|
+
case CL_INVALID_KERNEL_DEFINITION: printf("--- Error: Invalid kernel definition\n"); break;
|
63
|
+
case CL_INVALID_KERNEL: printf("--- Error: Invalid kernel\n"); break;
|
64
|
+
case CL_INVALID_ARG_INDEX: printf("--- Error: Invalid argument index\n"); break;
|
65
|
+
case CL_INVALID_ARG_VALUE: printf("--- Error: Invalid argument value\n"); break;
|
66
|
+
case CL_INVALID_ARG_SIZE: printf("--- Error: Invalid argument size\n"); break;
|
67
|
+
case CL_INVALID_KERNEL_ARGS: printf("--- Error: Invalid kernel arguments\n"); break;
|
68
|
+
case CL_INVALID_WORK_DIMENSION: printf("--- Error: Invalid work dimensionsension\n"); break;
|
69
|
+
case CL_INVALID_WORK_GROUP_SIZE: printf("--- Error: Invalid work group size\n"); break;
|
70
|
+
case CL_INVALID_WORK_ITEM_SIZE: printf("--- Error: Invalid work item size\n"); break;
|
71
|
+
case CL_INVALID_GLOBAL_OFFSET: printf("--- Error: Invalid global offset\n"); break;
|
72
|
+
case CL_INVALID_EVENT_WAIT_LIST: printf("--- Error: Invalid event wait list\n"); break;
|
73
|
+
case CL_INVALID_EVENT: printf("--- Error: Invalid event\n"); break;
|
74
|
+
case CL_INVALID_OPERATION: printf("--- Error: Invalid operation\n"); break;
|
75
|
+
case CL_INVALID_GL_OBJECT: printf("--- Error: Invalid OpenGL object\n"); break;
|
76
|
+
case CL_INVALID_BUFFER_SIZE: printf("--- Error: Invalid buffer size\n"); break;
|
77
|
+
case CL_INVALID_MIP_LEVEL: printf("--- Error: Invalid mip-map level\n"); break;
|
78
|
+
default: printf("--- Error: Unknown with code %d\n", bones_errors);
|
79
|
+
}
|
80
|
+
fflush(stdout); exit(0);
|
81
|
+
}
|
82
|
+
}
|
83
|
+
|
84
|
+
// Use a global variable for the device ID, context and command queue
|
85
|
+
cl_device_id bones_device;
|
86
|
+
cl_context bones_context;
|
87
|
+
cl_command_queue bones_queue;
|
88
|
+
|
89
|
+
// Use a global variable to store the name and the binary for the last program
|
90
|
+
char bones_last_program[1024];
|
91
|
+
cl_program bones_program;
|
92
|
+
|
93
|
+
// Function to initialize the OpenCL platform (create to ensure fair measurements afterwards)
|
94
|
+
void bones_initialize_target(void) {
|
95
|
+
cl_int bones_errors;
|
96
|
+
|
97
|
+
// Get OpenCL platform count
|
98
|
+
cl_uint bones_num_platforms;
|
99
|
+
bones_errors = clGetPlatformIDs(0,NULL,&bones_num_platforms); error_check(bones_errors);
|
100
|
+
if (bones_num_platforms == 0) { printf("Error: No OpenCL platforms found.\n"); exit(1); }
|
101
|
+
|
102
|
+
// Get all OpenCL platform IDs
|
103
|
+
cl_platform_id bones_platform_ids[10];
|
104
|
+
bones_errors = clGetPlatformIDs(bones_num_platforms,bones_platform_ids,NULL); error_check(bones_errors);
|
105
|
+
|
106
|
+
// Select the AMD APP platform
|
107
|
+
char bones_buffer[1024];
|
108
|
+
cl_uint bones_platform;
|
109
|
+
for(cl_uint bones_platform_id=0; bones_platform_id<bones_num_platforms; bones_platform_id++) {
|
110
|
+
clGetPlatformInfo(bones_platform_ids[bones_platform_id], CL_PLATFORM_NAME, 1024, bones_buffer, NULL);
|
111
|
+
if(strstr(bones_buffer,"AMD") != NULL) { bones_platform = bones_platform_id; break; }
|
112
|
+
}
|
113
|
+
|
114
|
+
// Get a CPU device on the platform
|
115
|
+
bones_errors = clGetDeviceIDs(bones_platform_ids[bones_platform], CL_DEVICE_TYPE_GPU, 1, &bones_device, NULL); error_check(bones_errors);
|
116
|
+
bones_errors = clGetDeviceInfo(bones_device, CL_DEVICE_NAME, sizeof(bones_buffer), bones_buffer, NULL); error_check(bones_errors);
|
117
|
+
|
118
|
+
// Create a context
|
119
|
+
bones_context = clCreateContext(0,1,&bones_device,NULL,NULL,&bones_errors); error_check(bones_errors);
|
120
|
+
|
121
|
+
// Create a command queue
|
122
|
+
bones_queue = clCreateCommandQueue(bones_context,bones_device,CL_QUEUE_PROFILING_ENABLE,&bones_errors); error_check(bones_errors);
|
123
|
+
|
124
|
+
// Create space on the device
|
125
|
+
cl_mem bones_device_data = clCreateBuffer(bones_context,CL_MEM_READ_WRITE,4,NULL,&bones_errors); error_check(bones_errors);
|
126
|
+
|
127
|
+
// Copy something to the device
|
128
|
+
bones_device_data = clCreateBuffer(bones_context,CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,4,bones_buffer,NULL);
|
129
|
+
|
130
|
+
// Clean-up the OpenCL context
|
131
|
+
strcpy(bones_last_program,"");
|
132
|
+
clReleaseMemObject(bones_device_data);
|
133
|
+
clReleaseContext(bones_context);
|
134
|
+
fflush(stdout);
|
135
|
+
}
|
136
|
+
|
137
|
+
// Declaration of the original function
|
138
|
+
int bones_main(void);
|
139
|
+
|
140
|
+
// New main function for initialisation and clean-up
|
141
|
+
int main(void) {
|
142
|
+
|
143
|
+
// Initialisation
|
144
|
+
bones_initialize_target();
|
145
|
+
|
146
|
+
// Original main function
|
147
|
+
int bones_return = bones_main();
|
148
|
+
|
149
|
+
// Clean-up
|
150
|
+
clReleaseCommandQueue(bones_queue);
|
151
|
+
clReleaseProgram(bones_program);
|
152
|
+
clReleaseContext(bones_context);
|
153
|
+
return bones_return;
|
154
|
+
}
|
155
|
+
|
File without changes
|
@@ -0,0 +1,24 @@
|
|
1
|
+
fflush(stdout);
|
2
|
+
cl_int bones_errors;
|
3
|
+
cl_event bones_event;
|
4
|
+
|
5
|
+
// Only compile if this program is different from the last one
|
6
|
+
if (strcmp(bones_last_program,"<algorithm_filename>") != 0) {
|
7
|
+
strcpy(bones_last_program,"<algorithm_filename>");
|
8
|
+
|
9
|
+
// Load and compile the kernel
|
10
|
+
char *bones_source = get_source("<algorithm_filename>_device.cl");
|
11
|
+
bones_program = clCreateProgramWithSource(bones_context,1,(const char **)&bones_source,NULL,&bones_errors); error_check(bones_errors);
|
12
|
+
bones_errors = clBuildProgram(bones_program,0,NULL,"-cl-single-precision-constant",NULL,NULL);
|
13
|
+
|
14
|
+
// Get and print the compiler log
|
15
|
+
char* bones_log;
|
16
|
+
size_t bones_log_size;
|
17
|
+
clGetProgramBuildInfo(bones_program,bones_device,CL_PROGRAM_BUILD_LOG,0,NULL,&bones_log_size);
|
18
|
+
bones_log = (char*)malloc((bones_log_size+1)*sizeof(char));
|
19
|
+
clGetProgramBuildInfo(bones_program,bones_device,CL_PROGRAM_BUILD_LOG,bones_log_size,bones_log, NULL);
|
20
|
+
bones_log[bones_log_size] = '\0';
|
21
|
+
//if (strcmp(bones_log,"\n") != 0 && strcmp(bones_log,"") != 0) { printf("--------- \n--- Compilation log:\n--------- \n%s\n",bones_log); }
|
22
|
+
free(bones_log);
|
23
|
+
error_check(bones_errors);
|
24
|
+
}
|
@@ -0,0 +1,9 @@
|
|
1
|
+
|
2
|
+
// End the timer for the measurement of the kernel and memory copy execution time
|
3
|
+
#if (ITERS == 1)
|
4
|
+
clFinish(bones_queue);
|
5
|
+
struct timeval bones_end_time1;
|
6
|
+
gettimeofday(&bones_end_time1, NULL);
|
7
|
+
float bones_timer1 = 0.001 * (1000000*(bones_end_time1.tv_sec-bones_start_time1.tv_sec)+bones_end_time1.tv_usec-bones_start_time1.tv_usec);
|
8
|
+
printf(">>>\t\t (<algorithm_basename>): Execution time [kernel+memcpy]: %.3lf ms \n", bones_timer1);
|
9
|
+
#endif
|
@@ -0,0 +1,11 @@
|
|
1
|
+
|
2
|
+
}
|
3
|
+
|
4
|
+
// Stop the timer for the measurement of the kernel execution time
|
5
|
+
clFinish(bones_queue);
|
6
|
+
cl_ulong end2, start2;
|
7
|
+
bones_errors = clWaitForEvents(1, &bones_event); error_check(bones_errors);
|
8
|
+
bones_errors = clGetEventProfilingInfo(bones_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end2, 0); error_check(bones_errors);
|
9
|
+
bones_errors = clGetEventProfilingInfo(bones_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start2, 0); error_check(bones_errors);
|
10
|
+
float bones_timer2 = 0.000001 * (end2-start2);
|
11
|
+
printf(">>>\t\t (<algorithm_basename>): Execution time [kernel ]: %.3lf ms \n", bones_timer2);
|