bones-compiler 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +117 -0
- data/LICENSE +9 -0
- data/README.rdoc +126 -0
- data/Rakefile +107 -0
- data/VERSION +1 -0
- data/bin/bones +20 -0
- data/examples/applications/ffos.c +552 -0
- data/examples/benchmarks/2mm.c +70 -0
- data/examples/benchmarks/3mm.c +81 -0
- data/examples/benchmarks/adi.c +81 -0
- data/examples/benchmarks/atax.c +65 -0
- data/examples/benchmarks/bicg.c +67 -0
- data/examples/benchmarks/cholesky.c +64 -0
- data/examples/benchmarks/common.h +168 -0
- data/examples/benchmarks/correlation.c +97 -0
- data/examples/benchmarks/covariance.c +77 -0
- data/examples/benchmarks/doitgen.c +63 -0
- data/examples/benchmarks/durbin.c +76 -0
- data/examples/benchmarks/dynprog.c +67 -0
- data/examples/benchmarks/fdtd-2d-apml.c +114 -0
- data/examples/benchmarks/fdtd-2d.c +74 -0
- data/examples/benchmarks/floyd-warshall.c +50 -0
- data/examples/benchmarks/gemm.c +69 -0
- data/examples/benchmarks/gemver.c +89 -0
- data/examples/benchmarks/gesummv.c +64 -0
- data/examples/benchmarks/gramschmidt.c +84 -0
- data/examples/benchmarks/jacobi-1d-imper.c +55 -0
- data/examples/benchmarks/jacobi-2d-imper.c +61 -0
- data/examples/benchmarks/lu.c +57 -0
- data/examples/benchmarks/ludcmp.c +91 -0
- data/examples/benchmarks/mvt.c +65 -0
- data/examples/benchmarks/overview.txt +38 -0
- data/examples/benchmarks/reg_detect.c +82 -0
- data/examples/benchmarks/saxpy.c +45 -0
- data/examples/benchmarks/seidel-2d.c +51 -0
- data/examples/benchmarks/symm.c +74 -0
- data/examples/benchmarks/syr2k.c +65 -0
- data/examples/benchmarks/syrk.c +62 -0
- data/examples/benchmarks/trisolv.c +57 -0
- data/examples/benchmarks/trmm.c +57 -0
- data/examples/chunk/example1.c +54 -0
- data/examples/chunk/example2.c +44 -0
- data/examples/chunk/example3.c +59 -0
- data/examples/chunk/example4.c +55 -0
- data/examples/chunk/example5.c +52 -0
- data/examples/element/example1.c +46 -0
- data/examples/element/example10.c +50 -0
- data/examples/element/example11.c +47 -0
- data/examples/element/example12.c +56 -0
- data/examples/element/example2.c +46 -0
- data/examples/element/example3.c +58 -0
- data/examples/element/example4.c +49 -0
- data/examples/element/example5.c +56 -0
- data/examples/element/example6.c +46 -0
- data/examples/element/example7.c +54 -0
- data/examples/element/example8.c +45 -0
- data/examples/element/example9.c +48 -0
- data/examples/neighbourhood/example1.c +54 -0
- data/examples/neighbourhood/example2.c +55 -0
- data/examples/neighbourhood/example3.c +82 -0
- data/examples/neighbourhood/example4.c +52 -0
- data/examples/shared/example1.c +45 -0
- data/examples/shared/example2.c +51 -0
- data/examples/shared/example3.c +55 -0
- data/examples/shared/example4.c +52 -0
- data/examples/shared/example5.c +48 -0
- data/lib/bones.rb +266 -0
- data/lib/bones/algorithm.rb +541 -0
- data/lib/bones/engine.rb +386 -0
- data/lib/bones/preprocessor.rb +161 -0
- data/lib/bones/species.rb +196 -0
- data/lib/bones/structure.rb +94 -0
- data/lib/bones/variable.rb +169 -0
- data/lib/bones/variablelist.rb +72 -0
- data/lib/castaddon.rb +27 -0
- data/lib/castaddon/index.rb +40 -0
- data/lib/castaddon/node.rb +753 -0
- data/lib/castaddon/type.rb +37 -0
- data/skeletons/CPU-C/common/epilogue.c +0 -0
- data/skeletons/CPU-C/common/globals.c +17 -0
- data/skeletons/CPU-C/common/globals_kernel.c +1 -0
- data/skeletons/CPU-C/common/header.c +0 -0
- data/skeletons/CPU-C/common/mem_copy_D2H.c +0 -0
- data/skeletons/CPU-C/common/mem_copy_H2D.c +0 -0
- data/skeletons/CPU-C/common/mem_epilogue.c +0 -0
- data/skeletons/CPU-C/common/mem_prologue.c +3 -0
- data/skeletons/CPU-C/common/prologue.c +0 -0
- data/skeletons/CPU-C/common/timer_1_start.c +0 -0
- data/skeletons/CPU-C/common/timer_1_stop.c +0 -0
- data/skeletons/CPU-C/common/timer_2_start.c +20 -0
- data/skeletons/CPU-C/common/timer_2_stop.c +8 -0
- data/skeletons/CPU-C/kernel/default.host.c +3 -0
- data/skeletons/CPU-C/kernel/default.kernel.c +15 -0
- data/skeletons/CPU-C/skeletons.txt +24 -0
- data/skeletons/CPU-OPENCL-AMD/common/epilogue.c +6 -0
- data/skeletons/CPU-OPENCL-AMD/common/globals.c +155 -0
- data/skeletons/CPU-OPENCL-AMD/common/globals_kernel.c +4 -0
- data/skeletons/CPU-OPENCL-AMD/common/header.c +0 -0
- data/skeletons/CPU-OPENCL-AMD/common/mem_copy_D2H.c +8 -0
- data/skeletons/CPU-OPENCL-AMD/common/mem_copy_H2D.c +4 -0
- data/skeletons/CPU-OPENCL-AMD/common/mem_epilogue.c +3 -0
- data/skeletons/CPU-OPENCL-AMD/common/mem_prologue.c +6 -0
- data/skeletons/CPU-OPENCL-AMD/common/prologue.c +24 -0
- data/skeletons/CPU-OPENCL-AMD/common/timer_1_start.c +5 -0
- data/skeletons/CPU-OPENCL-AMD/common/timer_1_stop.c +9 -0
- data/skeletons/CPU-OPENCL-AMD/common/timer_2_start.c +16 -0
- data/skeletons/CPU-OPENCL-AMD/common/timer_2_stop.c +11 -0
- data/skeletons/CPU-OPENCL-AMD/kernel/D-element-to-1-shared.host.c +67 -0
- data/skeletons/CPU-OPENCL-AMD/kernel/D-element-to-1-shared.kernel.cl +72 -0
- data/skeletons/CPU-OPENCL-AMD/kernel/default.host.c +14 -0
- data/skeletons/CPU-OPENCL-AMD/kernel/default.kernel.cl +13 -0
- data/skeletons/CPU-OPENCL-AMD/skeletons.txt +26 -0
- data/skeletons/CPU-OPENCL-INTEL/common/epilogue.c +3 -0
- data/skeletons/CPU-OPENCL-INTEL/common/globals.c +154 -0
- data/skeletons/CPU-OPENCL-INTEL/common/globals_kernel.c +4 -0
- data/skeletons/CPU-OPENCL-INTEL/common/header.c +31 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_D2H.c +5 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_H2D.c +3 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_epilogue.c +3 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_prologue.c +4 -0
- data/skeletons/CPU-OPENCL-INTEL/common/prologue.c +24 -0
- data/skeletons/CPU-OPENCL-INTEL/common/timer_1_start.c +5 -0
- data/skeletons/CPU-OPENCL-INTEL/common/timer_1_stop.c +9 -0
- data/skeletons/CPU-OPENCL-INTEL/common/timer_2_start.c +16 -0
- data/skeletons/CPU-OPENCL-INTEL/common/timer_2_stop.c +11 -0
- data/skeletons/CPU-OPENCL-INTEL/kernel/D-element-to-1-shared.host.c +67 -0
- data/skeletons/CPU-OPENCL-INTEL/kernel/D-element-to-1-shared.kernel.cl +72 -0
- data/skeletons/CPU-OPENCL-INTEL/kernel/default.host.c +14 -0
- data/skeletons/CPU-OPENCL-INTEL/kernel/default.kernel.cl +13 -0
- data/skeletons/CPU-OPENCL-INTEL/skeletons.txt +26 -0
- data/skeletons/CPU-OPENMP/common/epilogue.c +0 -0
- data/skeletons/CPU-OPENMP/common/globals.c +37 -0
- data/skeletons/CPU-OPENMP/common/globals_kernel.c +6 -0
- data/skeletons/CPU-OPENMP/common/header.c +0 -0
- data/skeletons/CPU-OPENMP/common/mem_copy_D2H.c +0 -0
- data/skeletons/CPU-OPENMP/common/mem_copy_H2D.c +0 -0
- data/skeletons/CPU-OPENMP/common/mem_epilogue.c +0 -0
- data/skeletons/CPU-OPENMP/common/mem_prologue.c +3 -0
- data/skeletons/CPU-OPENMP/common/prologue.c +0 -0
- data/skeletons/CPU-OPENMP/common/timer_1_start.c +12 -0
- data/skeletons/CPU-OPENMP/common/timer_1_stop.c +0 -0
- data/skeletons/CPU-OPENMP/common/timer_2_start.c +18 -0
- data/skeletons/CPU-OPENMP/common/timer_2_stop.c +8 -0
- data/skeletons/CPU-OPENMP/kernel/D-element-to-1-shared.host.c +27 -0
- data/skeletons/CPU-OPENMP/kernel/D-element-to-1-shared.kernel.c +46 -0
- data/skeletons/CPU-OPENMP/kernel/default.host.c +11 -0
- data/skeletons/CPU-OPENMP/kernel/default.kernel.c +18 -0
- data/skeletons/CPU-OPENMP/skeletons.txt +26 -0
- data/skeletons/GPU-CUDA/common/epilogue.c +0 -0
- data/skeletons/GPU-CUDA/common/globals.c +31 -0
- data/skeletons/GPU-CUDA/common/globals_kernel.c +4 -0
- data/skeletons/GPU-CUDA/common/header.c +0 -0
- data/skeletons/GPU-CUDA/common/mem_copy_D2H.c +3 -0
- data/skeletons/GPU-CUDA/common/mem_copy_H2D.c +3 -0
- data/skeletons/GPU-CUDA/common/mem_epilogue.c +3 -0
- data/skeletons/GPU-CUDA/common/mem_prologue.c +5 -0
- data/skeletons/GPU-CUDA/common/prologue.c +6 -0
- data/skeletons/GPU-CUDA/common/timer_1_start.c +6 -0
- data/skeletons/GPU-CUDA/common/timer_1_stop.c +10 -0
- data/skeletons/GPU-CUDA/common/timer_2_start.c +6 -0
- data/skeletons/GPU-CUDA/common/timer_2_stop.c +10 -0
- data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.kernel.cu +105 -0
- data/skeletons/GPU-CUDA/kernel/D-element-to-1-shared.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/D-element-to-1-shared.kernel.cu +119 -0
- data/skeletons/GPU-CUDA/kernel/D-element-to-N-shared.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/D-element-to-N-shared.kernel.cu +166 -0
- data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.kernel.cu +69 -0
- data/skeletons/GPU-CUDA/kernel/N-neighbourhood-N-to-N-element.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/N-neighbourhood-N-to-N-element.kernel.cu +42 -0
- data/skeletons/GPU-CUDA/kernel/default.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/default.kernel.cu +28 -0
- data/skeletons/GPU-CUDA/skeletons.txt +30 -0
- data/skeletons/GPU-OPENCL-AMD/common/epilogue.c +3 -0
- data/skeletons/GPU-OPENCL-AMD/common/globals.c +155 -0
- data/skeletons/GPU-OPENCL-AMD/common/globals_kernel.c +4 -0
- data/skeletons/GPU-OPENCL-AMD/common/header.c +0 -0
- data/skeletons/GPU-OPENCL-AMD/common/mem_copy_D2H.c +4 -0
- data/skeletons/GPU-OPENCL-AMD/common/mem_copy_H2D.c +4 -0
- data/skeletons/GPU-OPENCL-AMD/common/mem_epilogue.c +3 -0
- data/skeletons/GPU-OPENCL-AMD/common/mem_prologue.c +3 -0
- data/skeletons/GPU-OPENCL-AMD/common/prologue.c +24 -0
- data/skeletons/GPU-OPENCL-AMD/common/timer_1_start.c +5 -0
- data/skeletons/GPU-OPENCL-AMD/common/timer_1_stop.c +9 -0
- data/skeletons/GPU-OPENCL-AMD/common/timer_2_start.c +4 -0
- data/skeletons/GPU-OPENCL-AMD/common/timer_2_stop.c +11 -0
- data/skeletons/GPU-OPENCL-AMD/kernel/D-element-to-1-shared.host.c +67 -0
- data/skeletons/GPU-OPENCL-AMD/kernel/D-element-to-1-shared.kernel.cl +72 -0
- data/skeletons/GPU-OPENCL-AMD/kernel/default.host.c +14 -0
- data/skeletons/GPU-OPENCL-AMD/kernel/default.kernel.cl +13 -0
- data/skeletons/GPU-OPENCL-AMD/skeletons.txt +26 -0
- data/skeletons/verification/header.c +2 -0
- data/skeletons/verification/timer_start.c +4 -0
- data/skeletons/verification/timer_stop.c +6 -0
- data/skeletons/verification/verify_results.c +23 -0
- data/test/bones/test_algorithm.rb +40 -0
- data/test/bones/test_common.rb +54 -0
- data/test/bones/test_preprocessor.rb +46 -0
- data/test/bones/test_species.rb +21 -0
- data/test/bones/test_variable.rb +84 -0
- data/test/test_helper.rb +106 -0
- metadata +303 -0
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
/* STARTDEF
|
|
2
|
+
void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>);
|
|
3
|
+
ENDDEF */
|
|
4
|
+
#define SHUFFLE_X 16
|
|
5
|
+
#define SHUFFLE_Y 16
|
|
6
|
+
|
|
7
|
+
// Start of the <algorithm_name> kernel
|
|
8
|
+
// Main kernel of this skeleton: every thread processes the work-item selected by its flat 1D thread index.
__global__ void bones_kernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
|
|
9
|
+
// Flat thread index across the whole 1D grid
const int bones_global_id = blockIdx.x*blockDim.x + threadIdx.x;
|
|
10
|
+
// Threads whose index lies beyond the amount of parallelism do no work
if (bones_global_id < (<parallelism>)) {
|
|
11
|
+
|
|
12
|
+
// Calculate the global ID(s) based on the thread id
|
|
13
|
+
<ids>
|
|
14
|
+
|
|
15
|
+
// Start the computation
|
|
16
|
+
<algorithm_code1>
|
|
17
|
+
}
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
// Start of the <algorithm_name> kernel (pre-kernel for shuffling)
// Transposes the first input into the shuffled output buffer through a SHUFFLE_X x SHUFFLE_Y
// shared-memory tile. Each thread loads one element into the tile and stores the element that sits at
// the mirrored tile position. Because the tile is indexed with swapped thread indices, the kernel relies
// on a square thread block (blockDim.x == blockDim.y).
__global__ void bones_kernel_<algorithm_name>_1(<in0_type><in0_devicepointer> <in0_name>, <in0_type><in0_devicepointer> shuffled_<in0_name>, <argument_definition>) {
  const int bones_global_id_0 = blockIdx.x*blockDim.x + threadIdx.x;
  const int bones_global_id_1 = blockIdx.y*blockDim.y + threadIdx.y;

  // Set-up the local memory for shuffling
  __shared__ <in0_type> buffer[SHUFFLE_X][SHUFFLE_Y];

  // Swap the x and y coordinates to perform the rotation (coalesced)
  if (bones_global_id_0 < ((<in0_dimensions>)/(<in0_parameters>)) && bones_global_id_1 < (<in0_parameters>)) {
    buffer[threadIdx.y][threadIdx.x] = <in0_name>[bones_global_id_0 + bones_global_id_1 * ((<in0_dimensions>)/(<in0_parameters>))];
  }

  // Synchronize all threads in the threadblock
  __syncthreads();

  // We don't have to swap the x and y thread indices here, because that's already done in the local memory
  const int bones_global_id_0_new = blockIdx.y*blockDim.y + threadIdx.x;
  const int bones_global_id_1_new = blockIdx.x*blockDim.x + threadIdx.y;

  // Store the shuffled result (coalesced).
  // The first new index runs along an output row and the second one selects the row, so the bounds are
  // the mirror image of the load guard above: the element stored here was loaded by the thread with
  // swapped thread indices, and it is valid exactly when that thread passed the load guard.
  if (bones_global_id_0_new < (<in0_parameters>) && bones_global_id_1_new < ((<in0_dimensions>)/(<in0_parameters>))) {
    shuffled_<in0_name>[bones_global_id_0_new + bones_global_id_1_new * (<in0_parameters>)] = buffer[threadIdx.x][threadIdx.y];
  }
}
|
|
45
|
+
|
|
46
|
+
// Function to start the kernel.
// Picks a 1D block size from the amount of parallelism, runs the shuffling pre-kernel into a freshly
// allocated device buffer, redirects the input name to that buffer and then launches the main kernel.
// Placeholders are wrapped in parentheses so that expression-valued substitutions keep their meaning
// inside comparisons, products and the DIV_CEIL macro.
extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
  int bones_block_size;
  if ((<parallelism>) >= 64*512) { bones_block_size = 512;}
  else if ((<parallelism>) >= 64*256) { bones_block_size = 256;}
  else if ((<parallelism>) >= 64*128) { bones_block_size = 128;}
  else if ((<parallelism>) >= 64*64 ) { bones_block_size = 64; }
  else { bones_block_size = 32; }

  // First perform some pre-shuffling
  <in0_type>* shuffled_<in0_name> = 0;
  cudaMalloc((void**)&shuffled_<in0_name>, (<in0_dimensions>)*sizeof(<in0_type>));
  dim3 bones_threads1(SHUFFLE_X,SHUFFLE_Y);
  dim3 bones_grid1(DIV_CEIL(((<in0_dimensions>)/(<in0_parameters>)),SHUFFLE_X),DIV_CEIL((<in0_parameters>),SHUFFLE_Y));
  bones_kernel_<algorithm_name>_1<<< bones_grid1, bones_threads1 >>>(<in0_name>, shuffled_<in0_name>, <argument_name>);

  // NOTE(review): the original input buffer is freed here and the shuffled buffer is never freed in
  // this function. If the device pointer is passed by value, the caller keeps a pointer to freed
  // memory -- confirm against the code that releases device memory.
  <in0_type>* temp_<in0_name> = <in0_name>;
  <in0_name> = shuffled_<in0_name>;
  cudaFree(temp_<in0_name>);

  // Then run the original kernel
  dim3 bones_threads0(bones_block_size);
  dim3 bones_grid0(DIV_CEIL((<parallelism>),bones_block_size));
  bones_kernel_<algorithm_name>_0<<< bones_grid0, bones_threads0 >>>(<names>, <argument_name>);
}
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
/* STARTDEF
|
|
2
|
+
void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>);
|
|
3
|
+
ENDDEF */
|
|
4
|
+
// Start of the <algorithm_name> kernel
// 1D neighbourhood skeleton: every thread copies its own input element into shared memory, threads at
// the block edges additionally copy the left and right border elements, and the computation starts
// after a block-wide barrier. The block size of 512 is hard-coded here and in the host function.
// Placeholders used in arithmetic are wrapped in parentheses so that expression-valued substitutions
// keep their meaning.
__global__ void bones_kernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
  const int bones_global_id = blockIdx.x*blockDim.x + threadIdx.x;
  int bones_local_id = threadIdx.x;

  // NOTE(review): the barrier further down sits inside this guard, so the surplus threads of a
  // partially filled last block never reach it while the other threads of that block wait on it --
  // confirm this is acceptable on the targeted devices.
  if (bones_global_id < (<in0_dimensions>)) {

    // Calculate the local and global ID(s) based on the thread id
    int bones_local_id_0 = bones_local_id;
    <out0_ids>

    // Load the input data into local memory (the local index is shifted to leave room for the left border)
    __shared__ <in0_type> bones_local_memory_<in0_name>[512+(<in0_parameter0_sum>)];
    bones_local_id_0 = bones_local_id_0-(<in0_parameter0_from>);
    bones_local_memory_<in0_name>[bones_local_id_0] = <in0_name>[bones_global_id_0];

    // Load the left border into local memory
    // NOTE(review): in the first block this may read before the start of the input, depending on how
    // the global id is defined by the ids placeholder -- confirm.
    if (threadIdx.x < -(<in0_parameter0_from>)) {
      bones_local_memory_<in0_name>[bones_local_id_0+(<in0_parameter0_from>)] = <in0_name>[bones_global_id_0+(<in0_parameter0_from>)];
    }

    // Load the right border into local memory
    // NOTE(review): the second condition selects the last elements of the input, for which the read
    // below may address elements past the end of the input -- confirm.
    if ((threadIdx.x >= 512-(<in0_parameter0_to>)) || (bones_global_id_0 >= (<in0_dimensions>)-(<in0_parameter0_to>))) {
      bones_local_memory_<in0_name>[bones_local_id_0+(<in0_parameter0_to>)] = <in0_name>[bones_global_id_0+(<in0_parameter0_to>)];
    }

    // Synchronize all the threads in a threadblock
    __syncthreads();

    // Perform the main computation
    <algorithm_code1>
  }
}
|
|
36
|
+
|
|
37
|
+
// Host-side launcher: starts the kernel on a 1D grid of 512-thread blocks that covers the whole input
extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
  // Fixed block size; the kernel uses the same value for its shared-memory staging
  const int bones_block_size = 512;

  dim3 bones_threads(bones_block_size);
  dim3 bones_grid(DIV_CEIL(<in0_dimensions>,bones_block_size));
  bones_kernel_<algorithm_name>_0<<< bones_grid, bones_threads >>>(<names>, <argument_name>);
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/* STARTDEF
|
|
2
|
+
void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>);
|
|
3
|
+
ENDDEF */
|
|
4
|
+
// Start of the <algorithm_name> kernel
|
|
5
|
+
// Default kernel: every thread processes the work-item selected by its flat 1D thread index.
__global__ void bones_kernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
|
|
6
|
+
// Flat thread index across the whole 1D grid
const int bones_global_id = blockIdx.x*blockDim.x + threadIdx.x;
|
|
7
|
+
// Threads whose index lies beyond the amount of parallelism do no work
if (bones_global_id < (<parallelism>)) {
|
|
8
|
+
|
|
9
|
+
// Calculate the global ID(s) based on the thread id
|
|
10
|
+
<ids>
|
|
11
|
+
|
|
12
|
+
// Start the computation
|
|
13
|
+
<algorithm_code1>
|
|
14
|
+
}
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
// Function to start the kernel.
// Picks a 1D block size from the amount of parallelism (larger problems get larger blocks, from 32 up
// to 512 threads) and launches the kernel on enough blocks to cover all work-items.
// The parallelism placeholder is wrapped in parentheses so that an expression-valued substitution
// keeps its meaning inside the comparisons and the DIV_CEIL macro.
extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
  int bones_block_size;
  if ((<parallelism>) >= 64*512) { bones_block_size = 512;}
  else if ((<parallelism>) >= 64*256) { bones_block_size = 256;}
  else if ((<parallelism>) >= 64*128) { bones_block_size = 128;}
  else if ((<parallelism>) >= 64*64 ) { bones_block_size = 64; }
  else { bones_block_size = 32; }

  dim3 bones_threads(bones_block_size);
  dim3 bones_grid(DIV_CEIL((<parallelism>),bones_block_size));
  bones_kernel_<algorithm_name>_0<<< bones_grid, bones_threads >>>(<names>, <argument_name>);
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
###################################################################
|
|
2
|
+
# Each line holds one mapping from species to skeleton
|
|
3
|
+
# The ordering is always ['chunk','neighbourhood','element','shared','void']
|
|
4
|
+
# The pattern 'full' is omitted from matching (will thus always match)
|
|
5
|
+
# 'D' denotes any number of ranges (e.g. D|element can have any number of dimensions)
|
|
6
|
+
# 'N' denotes exactly one range of any size (e.g. N,N|element must be 2D)
|
|
7
|
+
# '+' denotes one or more of these patterns
|
|
8
|
+
###################################################################
|
|
9
|
+
D|chunk(D)+ -> D|chunk(D)+ :default :00
|
|
10
|
+
D|chunk(D)+ -> D|chunk(D)+ ^ D|element+ :default :00
|
|
11
|
+
D|chunk(D)+ ^ D|element+ -> D|chunk(D)+ :default :00
|
|
12
|
+
D|chunk(D)+ ^ D|element+ -> D|chunk(D)+ ^ D|element+ :default :00
|
|
13
|
+
N,N|chunk(1,N) ^ N,N|chunk(1,N)+ -> D|element+ :2xN-N-chunk-1-N-to-D-element :30
|
|
14
|
+
N,N|chunk(1,N) ^ N,N|chunk(1,N)+ ^ D|element+ -> D|element+ :2xN-N-chunk-1-N-to-D-element :30
|
|
15
|
+
N,N|chunk(1,N)+ -> D|element+ :N-N-chunk-1-N-to-D-element :20
|
|
16
|
+
N,N|chunk(1,N)+ ^ D|element+ -> D|element+ :N-N-chunk-1-N-to-D-element :20
|
|
17
|
+
N,N|chunk(D)+ -> N,N|element+ :default :40
|
|
18
|
+
N,N|chunk(D)+ ^ N,N|element+ -> N,N|element+ :default :40
|
|
19
|
+
D|chunk(D)+ -> D|element+ :default :00
|
|
20
|
+
D|chunk(D)+ ^ D|neighbourhood(D)+ ^ D|element+ -> D|element+ :default :00
|
|
21
|
+
D|chunk(D)+ ^ D|element+ -> D|element+ :default :00
|
|
22
|
+
N|neighbourhood(N)+ -> N|element+ :N-neighbourhood-N-to-N-element :10
|
|
23
|
+
D|neighbourhood(D)+ -> D|element+ :default :00
|
|
24
|
+
D|neighbourhood(D)+ ^ D|element+ -> D|element+ :default :00
|
|
25
|
+
D|element+ -> D|chunk(D)+ :default :00
|
|
26
|
+
D|element+ -> D|element+ :default :00
|
|
27
|
+
D|element -> 1|shared :D-element-to-1-shared :02 03 04 05
|
|
28
|
+
D|element+ -> D|shared+ :default :08
|
|
29
|
+
D|element+ -> D|element+ ^ D|shared+ :default :08
|
|
30
|
+
D|void -> D|element+ :default :00
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
#include <string.h>
|
|
2
|
+
#include <stdio.h>
|
|
3
|
+
#include <stdlib.h>
|
|
4
|
+
#include <math.h>
|
|
5
|
+
#include <sys/time.h>
|
|
6
|
+
#include <CL/cl.h>
|
|
7
|
+
|
|
8
|
+
// Helper macros. Every argument and every result is parenthesised so that expression arguments
// (e.g. DIV_CEIL(n, a+b)) expand with the intended precedence. Arguments may still be evaluated more
// than once, so do not pass expressions with side effects.
#define BONES_MIN(a,b) (((a)<(b)) ? (a) : (b))
#define BONES_MAX(a,b) (((a)>(b)) ? (a) : (b))
#define DIV_CEIL(a,b) (((a)+(b)-1)/(b))
#define DIV_FLOOR(a,b) ((a)/(b))
|
|
12
|
+
|
|
13
|
+
// Multiple iterations for kernel measurements
|
|
14
|
+
#define ITERS 1
|
|
15
|
+
|
|
16
|
+
// Load the OpenCL kernel from file.
// Returns a newly allocated, NUL-terminated copy of the file contents; the caller owns the buffer and
// must free() it. Returns NULL (after printing a message) when the file cannot be opened, its size
// cannot be determined, the allocation fails or reading fails.
char * get_source(const char* bones_filename) {
  FILE* bones_fp = fopen(bones_filename,"r");
  if (bones_fp == NULL) {
    printf("--- Error: Could not open kernel file '%s'\n", bones_filename);
    return NULL;
  }

  // Determine the file size by seeking to the end
  long bones_size = -1;
  if (fseek(bones_fp,0,SEEK_END) == 0) { bones_size = ftell(bones_fp); }
  if (bones_size < 0) {
    printf("--- Error: Could not determine the size of kernel file '%s'\n", bones_filename);
    fclose(bones_fp);
    return NULL;
  }
  rewind(bones_fp);

  // One extra byte for the terminator
  char *bones_source = (char *)malloc(sizeof(char)*((size_t)bones_size+1));
  if (bones_source == NULL) {
    printf("--- Error: Out of host memory while loading kernel file '%s'\n", bones_filename);
    fclose(bones_fp);
    return NULL;
  }

  // Terminate after the bytes actually read: a text-mode read may deliver fewer bytes than the file size
  size_t bones_read = fread(bones_source,1,(size_t)bones_size,bones_fp);
  if (ferror(bones_fp)) {
    printf("--- Error: Could not read kernel file '%s'\n", bones_filename);
    free(bones_source);
    fclose(bones_fp);
    return NULL;
  }
  bones_source[bones_read] = '\0';

  fclose(bones_fp);
  return bones_source;
}
|
|
28
|
+
|
|
29
|
+
// Print an error if it occurs
|
|
30
|
+
void error_check(cl_int bones_errors) {
|
|
31
|
+
if(bones_errors != CL_SUCCESS) {
|
|
32
|
+
switch (bones_errors) {
|
|
33
|
+
case CL_DEVICE_NOT_FOUND: printf("--- Error: Device not found.\n"); break;
|
|
34
|
+
case CL_DEVICE_NOT_AVAILABLE: printf("--- Error: Device not available\n"); break;
|
|
35
|
+
case CL_COMPILER_NOT_AVAILABLE: printf("--- Error: Compiler not available\n"); break;
|
|
36
|
+
case CL_MEM_OBJECT_ALLOCATION_FAILURE: printf("--- Error: Memory object allocation failure\n"); break;
|
|
37
|
+
case CL_OUT_OF_RESOURCES: printf("--- Error: Out of resources\n"); break;
|
|
38
|
+
case CL_OUT_OF_HOST_MEMORY: printf("--- Error: Out of host memory\n"); break;
|
|
39
|
+
case CL_PROFILING_INFO_NOT_AVAILABLE: printf("--- Error: Profiling information not available\n"); break;
|
|
40
|
+
case CL_MEM_COPY_OVERLAP: printf("--- Error: Memory copy overlap\n"); break;
|
|
41
|
+
case CL_IMAGE_FORMAT_MISMATCH: printf("--- Error: Image format mismatch\n"); break;
|
|
42
|
+
case CL_IMAGE_FORMAT_NOT_SUPPORTED: printf("--- Error: Image format not supported\n"); break;
|
|
43
|
+
case CL_BUILD_PROGRAM_FAILURE: printf("--- Error: Program build failure\n"); break;
|
|
44
|
+
case CL_MAP_FAILURE: printf("--- Error: Map failure\n"); break;
|
|
45
|
+
case CL_INVALID_VALUE: printf("--- Error: Invalid value\n"); break;
|
|
46
|
+
case CL_INVALID_DEVICE_TYPE: printf("--- Error: Invalid device type\n"); break;
|
|
47
|
+
case CL_INVALID_PLATFORM: printf("--- Error: Invalid platform\n"); break;
|
|
48
|
+
case CL_INVALID_DEVICE: printf("--- Error: Invalid device\n"); break;
|
|
49
|
+
case CL_INVALID_CONTEXT: printf("--- Error: Invalid context\n"); break;
|
|
50
|
+
case CL_INVALID_QUEUE_PROPERTIES: printf("--- Error: Invalid queue properties\n"); break;
|
|
51
|
+
case CL_INVALID_COMMAND_QUEUE: printf("--- Error: Invalid command queue\n"); break;
|
|
52
|
+
case CL_INVALID_HOST_PTR: printf("--- Error: Invalid host pointer\n"); break;
|
|
53
|
+
case CL_INVALID_MEM_OBJECT: printf("--- Error: Invalid memory object\n"); break;
|
|
54
|
+
case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: printf("--- Error: Invalid image format descriptor\n"); break;
|
|
55
|
+
case CL_INVALID_IMAGE_SIZE: printf("--- Error: Invalid image size\n"); break;
|
|
56
|
+
case CL_INVALID_SAMPLER: printf("--- Error: Invalid sampler\n"); break;
|
|
57
|
+
case CL_INVALID_BINARY: printf("--- Error: Invalid binary\n"); break;
|
|
58
|
+
case CL_INVALID_BUILD_OPTIONS: printf("--- Error: Invalid build options\n"); break;
|
|
59
|
+
case CL_INVALID_PROGRAM: printf("--- Error: Invalid program\n"); break;
|
|
60
|
+
case CL_INVALID_PROGRAM_EXECUTABLE: printf("--- Error: Invalid program executable\n"); break;
|
|
61
|
+
case CL_INVALID_KERNEL_NAME: printf("--- Error: Invalid kernel name\n"); break;
|
|
62
|
+
case CL_INVALID_KERNEL_DEFINITION: printf("--- Error: Invalid kernel definition\n"); break;
|
|
63
|
+
case CL_INVALID_KERNEL: printf("--- Error: Invalid kernel\n"); break;
|
|
64
|
+
case CL_INVALID_ARG_INDEX: printf("--- Error: Invalid argument index\n"); break;
|
|
65
|
+
case CL_INVALID_ARG_VALUE: printf("--- Error: Invalid argument value\n"); break;
|
|
66
|
+
case CL_INVALID_ARG_SIZE: printf("--- Error: Invalid argument size\n"); break;
|
|
67
|
+
case CL_INVALID_KERNEL_ARGS: printf("--- Error: Invalid kernel arguments\n"); break;
|
|
68
|
+
case CL_INVALID_WORK_DIMENSION: printf("--- Error: Invalid work dimensionsension\n"); break;
|
|
69
|
+
case CL_INVALID_WORK_GROUP_SIZE: printf("--- Error: Invalid work group size\n"); break;
|
|
70
|
+
case CL_INVALID_WORK_ITEM_SIZE: printf("--- Error: Invalid work item size\n"); break;
|
|
71
|
+
case CL_INVALID_GLOBAL_OFFSET: printf("--- Error: Invalid global offset\n"); break;
|
|
72
|
+
case CL_INVALID_EVENT_WAIT_LIST: printf("--- Error: Invalid event wait list\n"); break;
|
|
73
|
+
case CL_INVALID_EVENT: printf("--- Error: Invalid event\n"); break;
|
|
74
|
+
case CL_INVALID_OPERATION: printf("--- Error: Invalid operation\n"); break;
|
|
75
|
+
case CL_INVALID_GL_OBJECT: printf("--- Error: Invalid OpenGL object\n"); break;
|
|
76
|
+
case CL_INVALID_BUFFER_SIZE: printf("--- Error: Invalid buffer size\n"); break;
|
|
77
|
+
case CL_INVALID_MIP_LEVEL: printf("--- Error: Invalid mip-map level\n"); break;
|
|
78
|
+
default: printf("--- Error: Unknown with code %d\n", bones_errors);
|
|
79
|
+
}
|
|
80
|
+
fflush(stdout); exit(0);
|
|
81
|
+
}
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
// Use a global variable for the device ID, context and command queue
|
|
85
|
+
cl_device_id bones_device;
|
|
86
|
+
cl_context bones_context;
|
|
87
|
+
cl_command_queue bones_queue;
|
|
88
|
+
|
|
89
|
+
// Use a global variable to store the name and the binary for the last program
|
|
90
|
+
char bones_last_program[1024];
|
|
91
|
+
cl_program bones_program;
|
|
92
|
+
|
|
93
|
+
// Function to initialize the OpenCL platform (create to ensure fair measurements afterwards)
|
|
94
|
+
void bones_initialize_target(void) {
|
|
95
|
+
cl_int bones_errors;
|
|
96
|
+
|
|
97
|
+
// Get OpenCL platform count
|
|
98
|
+
cl_uint bones_num_platforms;
|
|
99
|
+
bones_errors = clGetPlatformIDs(0,NULL,&bones_num_platforms); error_check(bones_errors);
|
|
100
|
+
if (bones_num_platforms == 0) { printf("Error: No OpenCL platforms found.\n"); exit(1); }
|
|
101
|
+
|
|
102
|
+
// Get all OpenCL platform IDs
|
|
103
|
+
cl_platform_id bones_platform_ids[10];
|
|
104
|
+
bones_errors = clGetPlatformIDs(bones_num_platforms,bones_platform_ids,NULL); error_check(bones_errors);
|
|
105
|
+
|
|
106
|
+
// Select the AMD APP platform
|
|
107
|
+
char bones_buffer[1024];
|
|
108
|
+
cl_uint bones_platform;
|
|
109
|
+
for(cl_uint bones_platform_id=0; bones_platform_id<bones_num_platforms; bones_platform_id++) {
|
|
110
|
+
clGetPlatformInfo(bones_platform_ids[bones_platform_id], CL_PLATFORM_NAME, 1024, bones_buffer, NULL);
|
|
111
|
+
if(strstr(bones_buffer,"AMD") != NULL) { bones_platform = bones_platform_id; break; }
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
// Get a CPU device on the platform
|
|
115
|
+
bones_errors = clGetDeviceIDs(bones_platform_ids[bones_platform], CL_DEVICE_TYPE_GPU, 1, &bones_device, NULL); error_check(bones_errors);
|
|
116
|
+
bones_errors = clGetDeviceInfo(bones_device, CL_DEVICE_NAME, sizeof(bones_buffer), bones_buffer, NULL); error_check(bones_errors);
|
|
117
|
+
|
|
118
|
+
// Create a context
|
|
119
|
+
bones_context = clCreateContext(0,1,&bones_device,NULL,NULL,&bones_errors); error_check(bones_errors);
|
|
120
|
+
|
|
121
|
+
// Create a command queue
|
|
122
|
+
bones_queue = clCreateCommandQueue(bones_context,bones_device,CL_QUEUE_PROFILING_ENABLE,&bones_errors); error_check(bones_errors);
|
|
123
|
+
|
|
124
|
+
// Create space on the device
|
|
125
|
+
cl_mem bones_device_data = clCreateBuffer(bones_context,CL_MEM_READ_WRITE,4,NULL,&bones_errors); error_check(bones_errors);
|
|
126
|
+
|
|
127
|
+
// Copy something to the device
|
|
128
|
+
bones_device_data = clCreateBuffer(bones_context,CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,4,bones_buffer,NULL);
|
|
129
|
+
|
|
130
|
+
// Clean-up the OpenCL context
|
|
131
|
+
strcpy(bones_last_program,"");
|
|
132
|
+
clReleaseMemObject(bones_device_data);
|
|
133
|
+
clReleaseContext(bones_context);
|
|
134
|
+
fflush(stdout);
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
// Declaration of the original function
|
|
138
|
+
int bones_main(void);
|
|
139
|
+
|
|
140
|
+
// New main function for initialisation and clean-up
|
|
141
|
+
int main(void) {
|
|
142
|
+
|
|
143
|
+
// Initialisation
|
|
144
|
+
bones_initialize_target();
|
|
145
|
+
|
|
146
|
+
// Original main function
|
|
147
|
+
int bones_return = bones_main();
|
|
148
|
+
|
|
149
|
+
// Clean-up
|
|
150
|
+
clReleaseCommandQueue(bones_queue);
|
|
151
|
+
clReleaseProgram(bones_program);
|
|
152
|
+
clReleaseContext(bones_context);
|
|
153
|
+
return bones_return;
|
|
154
|
+
}
|
|
155
|
+
|
|
File without changes
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
fflush(stdout);
|
|
2
|
+
cl_int bones_errors;
|
|
3
|
+
cl_event bones_event;
|
|
4
|
+
|
|
5
|
+
// Only compile if this program is different from the last one
|
|
6
|
+
if (strcmp(bones_last_program,"<algorithm_filename>") != 0) {
|
|
7
|
+
strcpy(bones_last_program,"<algorithm_filename>");
|
|
8
|
+
|
|
9
|
+
// Load and compile the kernel
|
|
10
|
+
char *bones_source = get_source("<algorithm_filename>_device.cl");
|
|
11
|
+
bones_program = clCreateProgramWithSource(bones_context,1,(const char **)&bones_source,NULL,&bones_errors); error_check(bones_errors);
|
|
12
|
+
bones_errors = clBuildProgram(bones_program,0,NULL,"-cl-single-precision-constant",NULL,NULL);
|
|
13
|
+
|
|
14
|
+
// Get and print the compiler log
|
|
15
|
+
char* bones_log;
|
|
16
|
+
size_t bones_log_size;
|
|
17
|
+
clGetProgramBuildInfo(bones_program,bones_device,CL_PROGRAM_BUILD_LOG,0,NULL,&bones_log_size);
|
|
18
|
+
bones_log = (char*)malloc((bones_log_size+1)*sizeof(char));
|
|
19
|
+
clGetProgramBuildInfo(bones_program,bones_device,CL_PROGRAM_BUILD_LOG,bones_log_size,bones_log, NULL);
|
|
20
|
+
bones_log[bones_log_size] = '\0';
|
|
21
|
+
//if (strcmp(bones_log,"\n") != 0 && strcmp(bones_log,"") != 0) { printf("--------- \n--- Compilation log:\n--------- \n%s\n",bones_log); }
|
|
22
|
+
free(bones_log);
|
|
23
|
+
error_check(bones_errors);
|
|
24
|
+
}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
|
|
2
|
+
// Stop the wall-clock timer that covers the kernel together with the memory copies.
// Only active for single-iteration runs; the queue is drained first so that all queued work is
// included in the measurement.
#if (ITERS == 1)
  clFinish(bones_queue);

  struct timeval bones_end_time1;
  gettimeofday(&bones_end_time1, NULL);

  // Elapsed time in microseconds since the start timestamp, reported in milliseconds
  const long long bones_elapsed_us1 = 1000000*(bones_end_time1.tv_sec-bones_start_time1.tv_sec)+bones_end_time1.tv_usec-bones_start_time1.tv_usec;
  float bones_timer1 = 0.001 * bones_elapsed_us1;
  printf(">>>\t\t (<algorithm_basename>): Execution time [kernel+memcpy]: %.3lf ms \n", bones_timer1);
#endif
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
|
|
2
|
+
}
|
|
3
|
+
|
|
4
|
+
// Stop the timer for the measurement of the kernel execution time
|
|
5
|
+
clFinish(bones_queue);
|
|
6
|
+
cl_ulong end2, start2;
|
|
7
|
+
bones_errors = clWaitForEvents(1, &bones_event); error_check(bones_errors);
|
|
8
|
+
bones_errors = clGetEventProfilingInfo(bones_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end2, 0); error_check(bones_errors);
|
|
9
|
+
bones_errors = clGetEventProfilingInfo(bones_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start2, 0); error_check(bones_errors);
|
|
10
|
+
float bones_timer2 = 0.000001 * (end2-start2);
|
|
11
|
+
printf(">>>\t\t (<algorithm_basename>): Execution time [kernel ]: %.3lf ms \n", bones_timer2);
|