bones-compiler 1.1.0
- data/CHANGELOG +117 -0
- data/LICENSE +9 -0
- data/README.rdoc +126 -0
- data/Rakefile +107 -0
- data/VERSION +1 -0
- data/bin/bones +20 -0
- data/examples/applications/ffos.c +552 -0
- data/examples/benchmarks/2mm.c +70 -0
- data/examples/benchmarks/3mm.c +81 -0
- data/examples/benchmarks/adi.c +81 -0
- data/examples/benchmarks/atax.c +65 -0
- data/examples/benchmarks/bicg.c +67 -0
- data/examples/benchmarks/cholesky.c +64 -0
- data/examples/benchmarks/common.h +168 -0
- data/examples/benchmarks/correlation.c +97 -0
- data/examples/benchmarks/covariance.c +77 -0
- data/examples/benchmarks/doitgen.c +63 -0
- data/examples/benchmarks/durbin.c +76 -0
- data/examples/benchmarks/dynprog.c +67 -0
- data/examples/benchmarks/fdtd-2d-apml.c +114 -0
- data/examples/benchmarks/fdtd-2d.c +74 -0
- data/examples/benchmarks/floyd-warshall.c +50 -0
- data/examples/benchmarks/gemm.c +69 -0
- data/examples/benchmarks/gemver.c +89 -0
- data/examples/benchmarks/gesummv.c +64 -0
- data/examples/benchmarks/gramschmidt.c +84 -0
- data/examples/benchmarks/jacobi-1d-imper.c +55 -0
- data/examples/benchmarks/jacobi-2d-imper.c +61 -0
- data/examples/benchmarks/lu.c +57 -0
- data/examples/benchmarks/ludcmp.c +91 -0
- data/examples/benchmarks/mvt.c +65 -0
- data/examples/benchmarks/overview.txt +38 -0
- data/examples/benchmarks/reg_detect.c +82 -0
- data/examples/benchmarks/saxpy.c +45 -0
- data/examples/benchmarks/seidel-2d.c +51 -0
- data/examples/benchmarks/symm.c +74 -0
- data/examples/benchmarks/syr2k.c +65 -0
- data/examples/benchmarks/syrk.c +62 -0
- data/examples/benchmarks/trisolv.c +57 -0
- data/examples/benchmarks/trmm.c +57 -0
- data/examples/chunk/example1.c +54 -0
- data/examples/chunk/example2.c +44 -0
- data/examples/chunk/example3.c +59 -0
- data/examples/chunk/example4.c +55 -0
- data/examples/chunk/example5.c +52 -0
- data/examples/element/example1.c +46 -0
- data/examples/element/example10.c +50 -0
- data/examples/element/example11.c +47 -0
- data/examples/element/example12.c +56 -0
- data/examples/element/example2.c +46 -0
- data/examples/element/example3.c +58 -0
- data/examples/element/example4.c +49 -0
- data/examples/element/example5.c +56 -0
- data/examples/element/example6.c +46 -0
- data/examples/element/example7.c +54 -0
- data/examples/element/example8.c +45 -0
- data/examples/element/example9.c +48 -0
- data/examples/neighbourhood/example1.c +54 -0
- data/examples/neighbourhood/example2.c +55 -0
- data/examples/neighbourhood/example3.c +82 -0
- data/examples/neighbourhood/example4.c +52 -0
- data/examples/shared/example1.c +45 -0
- data/examples/shared/example2.c +51 -0
- data/examples/shared/example3.c +55 -0
- data/examples/shared/example4.c +52 -0
- data/examples/shared/example5.c +48 -0
- data/lib/bones.rb +266 -0
- data/lib/bones/algorithm.rb +541 -0
- data/lib/bones/engine.rb +386 -0
- data/lib/bones/preprocessor.rb +161 -0
- data/lib/bones/species.rb +196 -0
- data/lib/bones/structure.rb +94 -0
- data/lib/bones/variable.rb +169 -0
- data/lib/bones/variablelist.rb +72 -0
- data/lib/castaddon.rb +27 -0
- data/lib/castaddon/index.rb +40 -0
- data/lib/castaddon/node.rb +753 -0
- data/lib/castaddon/type.rb +37 -0
- data/skeletons/CPU-C/common/epilogue.c +0 -0
- data/skeletons/CPU-C/common/globals.c +17 -0
- data/skeletons/CPU-C/common/globals_kernel.c +1 -0
- data/skeletons/CPU-C/common/header.c +0 -0
- data/skeletons/CPU-C/common/mem_copy_D2H.c +0 -0
- data/skeletons/CPU-C/common/mem_copy_H2D.c +0 -0
- data/skeletons/CPU-C/common/mem_epilogue.c +0 -0
- data/skeletons/CPU-C/common/mem_prologue.c +3 -0
- data/skeletons/CPU-C/common/prologue.c +0 -0
- data/skeletons/CPU-C/common/timer_1_start.c +0 -0
- data/skeletons/CPU-C/common/timer_1_stop.c +0 -0
- data/skeletons/CPU-C/common/timer_2_start.c +20 -0
- data/skeletons/CPU-C/common/timer_2_stop.c +8 -0
- data/skeletons/CPU-C/kernel/default.host.c +3 -0
- data/skeletons/CPU-C/kernel/default.kernel.c +15 -0
- data/skeletons/CPU-C/skeletons.txt +24 -0
- data/skeletons/CPU-OPENCL-AMD/common/epilogue.c +6 -0
- data/skeletons/CPU-OPENCL-AMD/common/globals.c +155 -0
- data/skeletons/CPU-OPENCL-AMD/common/globals_kernel.c +4 -0
- data/skeletons/CPU-OPENCL-AMD/common/header.c +0 -0
- data/skeletons/CPU-OPENCL-AMD/common/mem_copy_D2H.c +8 -0
- data/skeletons/CPU-OPENCL-AMD/common/mem_copy_H2D.c +4 -0
- data/skeletons/CPU-OPENCL-AMD/common/mem_epilogue.c +3 -0
- data/skeletons/CPU-OPENCL-AMD/common/mem_prologue.c +6 -0
- data/skeletons/CPU-OPENCL-AMD/common/prologue.c +24 -0
- data/skeletons/CPU-OPENCL-AMD/common/timer_1_start.c +5 -0
- data/skeletons/CPU-OPENCL-AMD/common/timer_1_stop.c +9 -0
- data/skeletons/CPU-OPENCL-AMD/common/timer_2_start.c +16 -0
- data/skeletons/CPU-OPENCL-AMD/common/timer_2_stop.c +11 -0
- data/skeletons/CPU-OPENCL-AMD/kernel/D-element-to-1-shared.host.c +67 -0
- data/skeletons/CPU-OPENCL-AMD/kernel/D-element-to-1-shared.kernel.cl +72 -0
- data/skeletons/CPU-OPENCL-AMD/kernel/default.host.c +14 -0
- data/skeletons/CPU-OPENCL-AMD/kernel/default.kernel.cl +13 -0
- data/skeletons/CPU-OPENCL-AMD/skeletons.txt +26 -0
- data/skeletons/CPU-OPENCL-INTEL/common/epilogue.c +3 -0
- data/skeletons/CPU-OPENCL-INTEL/common/globals.c +154 -0
- data/skeletons/CPU-OPENCL-INTEL/common/globals_kernel.c +4 -0
- data/skeletons/CPU-OPENCL-INTEL/common/header.c +31 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_D2H.c +5 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_H2D.c +3 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_epilogue.c +3 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_prologue.c +4 -0
- data/skeletons/CPU-OPENCL-INTEL/common/prologue.c +24 -0
- data/skeletons/CPU-OPENCL-INTEL/common/timer_1_start.c +5 -0
- data/skeletons/CPU-OPENCL-INTEL/common/timer_1_stop.c +9 -0
- data/skeletons/CPU-OPENCL-INTEL/common/timer_2_start.c +16 -0
- data/skeletons/CPU-OPENCL-INTEL/common/timer_2_stop.c +11 -0
- data/skeletons/CPU-OPENCL-INTEL/kernel/D-element-to-1-shared.host.c +67 -0
- data/skeletons/CPU-OPENCL-INTEL/kernel/D-element-to-1-shared.kernel.cl +72 -0
- data/skeletons/CPU-OPENCL-INTEL/kernel/default.host.c +14 -0
- data/skeletons/CPU-OPENCL-INTEL/kernel/default.kernel.cl +13 -0
- data/skeletons/CPU-OPENCL-INTEL/skeletons.txt +26 -0
- data/skeletons/CPU-OPENMP/common/epilogue.c +0 -0
- data/skeletons/CPU-OPENMP/common/globals.c +37 -0
- data/skeletons/CPU-OPENMP/common/globals_kernel.c +6 -0
- data/skeletons/CPU-OPENMP/common/header.c +0 -0
- data/skeletons/CPU-OPENMP/common/mem_copy_D2H.c +0 -0
- data/skeletons/CPU-OPENMP/common/mem_copy_H2D.c +0 -0
- data/skeletons/CPU-OPENMP/common/mem_epilogue.c +0 -0
- data/skeletons/CPU-OPENMP/common/mem_prologue.c +3 -0
- data/skeletons/CPU-OPENMP/common/prologue.c +0 -0
- data/skeletons/CPU-OPENMP/common/timer_1_start.c +12 -0
- data/skeletons/CPU-OPENMP/common/timer_1_stop.c +0 -0
- data/skeletons/CPU-OPENMP/common/timer_2_start.c +18 -0
- data/skeletons/CPU-OPENMP/common/timer_2_stop.c +8 -0
- data/skeletons/CPU-OPENMP/kernel/D-element-to-1-shared.host.c +27 -0
- data/skeletons/CPU-OPENMP/kernel/D-element-to-1-shared.kernel.c +46 -0
- data/skeletons/CPU-OPENMP/kernel/default.host.c +11 -0
- data/skeletons/CPU-OPENMP/kernel/default.kernel.c +18 -0
- data/skeletons/CPU-OPENMP/skeletons.txt +26 -0
- data/skeletons/GPU-CUDA/common/epilogue.c +0 -0
- data/skeletons/GPU-CUDA/common/globals.c +31 -0
- data/skeletons/GPU-CUDA/common/globals_kernel.c +4 -0
- data/skeletons/GPU-CUDA/common/header.c +0 -0
- data/skeletons/GPU-CUDA/common/mem_copy_D2H.c +3 -0
- data/skeletons/GPU-CUDA/common/mem_copy_H2D.c +3 -0
- data/skeletons/GPU-CUDA/common/mem_epilogue.c +3 -0
- data/skeletons/GPU-CUDA/common/mem_prologue.c +5 -0
- data/skeletons/GPU-CUDA/common/prologue.c +6 -0
- data/skeletons/GPU-CUDA/common/timer_1_start.c +6 -0
- data/skeletons/GPU-CUDA/common/timer_1_stop.c +10 -0
- data/skeletons/GPU-CUDA/common/timer_2_start.c +6 -0
- data/skeletons/GPU-CUDA/common/timer_2_stop.c +10 -0
- data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.kernel.cu +105 -0
- data/skeletons/GPU-CUDA/kernel/D-element-to-1-shared.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/D-element-to-1-shared.kernel.cu +119 -0
- data/skeletons/GPU-CUDA/kernel/D-element-to-N-shared.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/D-element-to-N-shared.kernel.cu +166 -0
- data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.kernel.cu +69 -0
- data/skeletons/GPU-CUDA/kernel/N-neighbourhood-N-to-N-element.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/N-neighbourhood-N-to-N-element.kernel.cu +42 -0
- data/skeletons/GPU-CUDA/kernel/default.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/default.kernel.cu +28 -0
- data/skeletons/GPU-CUDA/skeletons.txt +30 -0
- data/skeletons/GPU-OPENCL-AMD/common/epilogue.c +3 -0
- data/skeletons/GPU-OPENCL-AMD/common/globals.c +155 -0
- data/skeletons/GPU-OPENCL-AMD/common/globals_kernel.c +4 -0
- data/skeletons/GPU-OPENCL-AMD/common/header.c +0 -0
- data/skeletons/GPU-OPENCL-AMD/common/mem_copy_D2H.c +4 -0
- data/skeletons/GPU-OPENCL-AMD/common/mem_copy_H2D.c +4 -0
- data/skeletons/GPU-OPENCL-AMD/common/mem_epilogue.c +3 -0
- data/skeletons/GPU-OPENCL-AMD/common/mem_prologue.c +3 -0
- data/skeletons/GPU-OPENCL-AMD/common/prologue.c +24 -0
- data/skeletons/GPU-OPENCL-AMD/common/timer_1_start.c +5 -0
- data/skeletons/GPU-OPENCL-AMD/common/timer_1_stop.c +9 -0
- data/skeletons/GPU-OPENCL-AMD/common/timer_2_start.c +4 -0
- data/skeletons/GPU-OPENCL-AMD/common/timer_2_stop.c +11 -0
- data/skeletons/GPU-OPENCL-AMD/kernel/D-element-to-1-shared.host.c +67 -0
- data/skeletons/GPU-OPENCL-AMD/kernel/D-element-to-1-shared.kernel.cl +72 -0
- data/skeletons/GPU-OPENCL-AMD/kernel/default.host.c +14 -0
- data/skeletons/GPU-OPENCL-AMD/kernel/default.kernel.cl +13 -0
- data/skeletons/GPU-OPENCL-AMD/skeletons.txt +26 -0
- data/skeletons/verification/header.c +2 -0
- data/skeletons/verification/timer_start.c +4 -0
- data/skeletons/verification/timer_stop.c +6 -0
- data/skeletons/verification/verify_results.c +23 -0
- data/test/bones/test_algorithm.rb +40 -0
- data/test/bones/test_common.rb +54 -0
- data/test/bones/test_preprocessor.rb +46 -0
- data/test/bones/test_species.rb +21 -0
- data/test/bones/test_variable.rb +84 -0
- data/test/test_helper.rb +106 -0
- metadata +303 -0
data/skeletons/GPU-CUDA/common/timer_1_stop.c
@@ -0,0 +1,10 @@
+
+// End the timer for the measurement of the kernel and memory copy execution time
+cudaThreadSynchronize();
+cudaEvent_t bones_stop1;
+cudaEventCreate(&bones_stop1);
+cudaEventRecord(bones_stop1,0);
+cudaEventSynchronize(bones_stop1);
+float bones_timer1 = 0;
+cudaEventElapsedTime(&bones_timer1,bones_start1,bones_stop1);
+printf(">>>\t\t (<algorithm_basename>): Execution time [kernel+memcpy]: %.3lf ms \n", bones_timer1);
data/skeletons/GPU-CUDA/common/timer_2_stop.c
@@ -0,0 +1,10 @@
+
+// Stop the timer for the measurement of the kernel execution time
+cudaThreadSynchronize();
+cudaEvent_t bones_stop2;
+cudaEventCreate(&bones_stop2);
+cudaEventRecord(bones_stop2,0);
+cudaEventSynchronize(bones_stop2);
+float bones_timer2 = 0;
+cudaEventElapsedTime(&bones_timer2,bones_start2,bones_stop2);
+printf(">>>\t\t (<algorithm_basename>): Execution time [kernel ]: %.3lf ms \n", bones_timer2);
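
The two timer-stop snippets above read bones_start1 and bones_start2, which are created in the matching timer_1_start.c / timer_2_start.c skeleton files not shown in this excerpt. As orientation only, a minimal sketch of what such a start snippet plausibly looks like (an assumption on our part, not the gem's actual file):

    // Hypothetical counterpart to the stop snippet above (assumed, not taken from the gem):
    // create and record the event that cudaEventElapsedTime() later uses as the start time.
    cudaEvent_t bones_start2;
    cudaEventCreate(&bones_start2);
    cudaEventRecord(bones_start2,0);
    cudaEventSynchronize(bones_start2);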
data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.kernel.cu
@@ -0,0 +1,105 @@
+/* STARTDEF
+void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>);
+ENDDEF */
+#define SHUFFLE_X 16
+#define SHUFFLE_Y 16
+
+// Start of the <algorithm_name> kernel
+__global__ void bones_kernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
+  const int bones_global_id = blockIdx.x*blockDim.x + threadIdx.x;
+  if (bones_global_id < (<parallelism>)) {
+
+    // Calculate the global ID(s) based on the thread id
+    <ids>
+
+    // Start the computation
+    <algorithm_code1>
+  }
+}
+
+// Start of the <algorithm_name> kernel (pre-kernel for shuffling) - for first input
+__global__ void bones_kernel_<algorithm_name>_1(<in0_type><in0_devicepointer> <in0_name>, <in0_type><in0_devicepointer> shuffled_<in0_name>, <argument_definition>) {
+  const int bones_global_id_0 = blockIdx.x*blockDim.x + threadIdx.x;
+  const int bones_global_id_1 = blockIdx.y*blockDim.y + threadIdx.y;
+
+  // Set-up the local memory for shuffling
+  __shared__ <in0_type> buffer[SHUFFLE_X][SHUFFLE_Y];
+
+  // Swap the x and y coordinates to perform the rotation (coalesced)
+  if (bones_global_id_0 < ((<in0_dimensions>)/(<in0_parameters>)) && bones_global_id_1 < (<in0_parameters>)) {
+    buffer[threadIdx.y][threadIdx.x] = <in0_name>[bones_global_id_0 + bones_global_id_1 * ((<in0_dimensions>)/(<in0_parameters>))];
+  }
+
+  // Synchronize all threads in the threadblock
+  __syncthreads();
+
+  // We don't have to swap the x and y thread indices here, because that's already done in the local memory
+  const int bones_global_id_0_new = blockIdx.y*blockDim.y + threadIdx.x;
+  const int bones_global_id_1_new = blockIdx.x*blockDim.x + threadIdx.y;
+
+  // Store the shuffled result (coalesced)
+  if (bones_global_id_0_new < ((<in0_dimensions>)/(<in0_parameters>)) && bones_global_id_1_new < (<in0_parameters>)) {
+    shuffled_<in0_name>[bones_global_id_0_new + bones_global_id_1_new * <in0_parameters>] = buffer[threadIdx.x][threadIdx.y];
+  }
+}
+
+// Start of the <algorithm_name> kernel (pre-kernel for shuffling) - for second input
+__global__ void bones_kernel_<algorithm_name>_2(<in1_type><in1_devicepointer> <in1_name>, <in1_type><in1_devicepointer> shuffled_<in1_name>, <argument_definition>) {
+  const int bones_global_id_0 = blockIdx.x*blockDim.x + threadIdx.x;
+  const int bones_global_id_1 = blockIdx.y*blockDim.y + threadIdx.y;
+
+  // Set-up the local memory for shuffling
+  __shared__ <in1_type> buffer[SHUFFLE_X][SHUFFLE_Y];
+
+  // Swap the x and y coordinates to perform the rotation (coalesced)
+  if (bones_global_id_0 < ((<in1_dimensions>)/(<in1_parameters>)) && bones_global_id_1 < (<in1_parameters>)) {
+    buffer[threadIdx.y][threadIdx.x] = <in1_name>[bones_global_id_0 + bones_global_id_1 * ((<in1_dimensions>)/(<in1_parameters>))];
+  }
+
+  // Synchronize all threads in the threadblock
+  __syncthreads();
+
+  // We don't have to swap the x and y thread indices here, because that's already done in the local memory
+  const int bones_global_id_0_new = blockIdx.y*blockDim.y + threadIdx.x;
+  const int bones_global_id_1_new = blockIdx.x*blockDim.x + threadIdx.y;
+
+  // Store the shuffled result (coalesced)
+  if (bones_global_id_0_new < ((<in1_dimensions>)/(<in1_parameters>)) && bones_global_id_1_new < (<in1_parameters>)) {
+    shuffled_<in1_name>[bones_global_id_0_new + bones_global_id_1_new * <in1_parameters>] = buffer[threadIdx.x][threadIdx.y];
+  }
+}
+
+// Function to start the kernel
+extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
+  int bones_block_size;
+  if (<parallelism> >= 64*512) { bones_block_size = 512; }
+  else if (<parallelism> >= 64*256) { bones_block_size = 256; }
+  else if (<parallelism> >= 64*128) { bones_block_size = 128; }
+  else if (<parallelism> >= 64*64 ) { bones_block_size = 64; }
+  else { bones_block_size = 32; }
+
+  // First perform some pre-shuffling (for the first input)
+  <in0_type>* shuffled_<in0_name> = 0;
+  cudaMalloc((void**)&shuffled_<in0_name>, <in0_dimensions>*sizeof(<in0_type>));
+  dim3 bones_threads1(SHUFFLE_X,SHUFFLE_Y);
+  dim3 bones_grid1(DIV_CEIL(((<in0_dimensions>)/(<in0_parameters>)),SHUFFLE_X),DIV_CEIL(<in0_parameters>,SHUFFLE_Y));
+  bones_kernel_<algorithm_name>_1<<< bones_grid1, bones_threads1 >>>(<in0_name>, shuffled_<in0_name>, <argument_name>);
+  <in0_type>* temp_<in0_name> = <in0_name>;
+  <in0_name> = shuffled_<in0_name>;
+  cudaFree(temp_<in0_name>);
+
+  // First perform some pre-shuffling (for the second input)
+  <in0_type>* shuffled_<in1_name> = 0;
+  cudaMalloc((void**)&shuffled_<in1_name>, <in1_dimensions>*sizeof(<in1_type>));
+  dim3 bones_threads2(SHUFFLE_X,SHUFFLE_Y);
+  dim3 bones_grid2(DIV_CEIL(((<in1_dimensions>)/(<in1_parameters>)),SHUFFLE_X),DIV_CEIL(<in1_parameters>,SHUFFLE_Y));
+  bones_kernel_<algorithm_name>_2<<< bones_grid2, bones_threads2 >>>(<in1_name>, shuffled_<in1_name>, <argument_name>);
+  <in1_type>* temp_<in1_name> = <in1_name>;
+  <in1_name> = shuffled_<in1_name>;
+  cudaFree(temp_<in1_name>);
+
+  // Then run the original kernel
+  dim3 bones_threads0(bones_block_size);
+  dim3 bones_grid0(DIV_CEIL(<parallelism>,bones_block_size));
+  bones_kernel_<algorithm_name>_0<<< bones_grid0, bones_threads0 >>>(<names>, <argument_name>);
+}
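
The host function above sizes its grids with a DIV_CEIL helper that is not defined in this excerpt (it presumably lives in the skeleton's common code, e.g. globals.c). A minimal sketch of the rounding-up division macro it plausibly corresponds to (an assumption, not the gem's actual definition):

    // Assumed helper: integer division rounded up, used to size grids from problem dimensions.
    #define DIV_CEIL(a,b) (((a) + (b) - 1) / (b))

    // Example: DIV_CEIL(1000, 256) == 4, so four blocks of 256 threads cover 1000 elements.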
data/skeletons/GPU-CUDA/kernel/D-element-to-1-shared.kernel.cu
@@ -0,0 +1,119 @@
+/* STARTDEF
+void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>);
+ENDDEF */
+// Start of the <algorithm_name> kernel (main, not unrolled kernel)
+__global__ void bones_kernel_<algorithm_name>_0(int bones_input_size, <in0_type><in0_devicepointer> <in0_name>, <out0_type><out0_devicepointer> <out0_name>, <argument_definition>) {
+  const int bones_threadblock_work = DIV_CEIL(bones_input_size,gridDim.x);
+  const int bones_parallel_work = BONES_MIN(blockDim.x,bones_threadblock_work);
+  const int bones_sequential_work = DIV_CEIL(bones_threadblock_work,bones_parallel_work);
+  const int bones_local_id = threadIdx.x;
+  const int bones_global_id = blockIdx.x*bones_parallel_work + threadIdx.x;
+  <ids>
+  int bones_iter_id = <in0_flatindex>;
+
+  // Load data into thread private memory and perform the first computation(s) sequentially
+  <in0_type> bones_temporary = <in0_name>[bones_iter_id];
+  <in0_type> bones_private_memory = <algorithm_code3>;
+  for(int c=1; c<bones_sequential_work; c++) {
+    bones_iter_id = bones_iter_id + bones_parallel_work*gridDim.x<factors>;
+    if (bones_iter_id <= <in0_to>) {
+      bones_temporary = <in0_name>[bones_iter_id];
+      bones_private_memory = <algorithm_code1>;
+    }
+  }
+
+  // Initialize the local memory
+  volatile __shared__ <in0_type> bones_local_memory[512];
+  bones_local_memory[bones_local_id] = bones_private_memory;
+  __syncthreads();
+
+  // Perform the remainder of the computations in parallel using a parallel reduction tree
+  int bones_offset_id;
+  for (int c=512; c>=2; c=c>>1) {
+    if ((2*bones_parallel_work > c) && (threadIdx.x < c/2)) {
+      bones_offset_id = threadIdx.x+c/2;
+      if (bones_offset_id < bones_parallel_work) {
+        __syncthreads();
+        bones_local_memory[bones_local_id] = <algorithm_code2>;
+      }
+    }
+    __syncthreads();
+  }
+
+  // Write the final result back to the global memory
+  if (threadIdx.x == 0) { <out0_name>[blockIdx.x] = bones_local_memory[0]; }
+}
+
+// Start of the <algorithm_name> kernel (secondary, not unrolled kernel)
+__global__ void bones_kernel_<algorithm_name>_1(<in0_type><in0_devicepointer> <in0_name>, <out0_type><out0_devicepointer> <out0_name>, <argument_definition>) {
+  const int bones_local_id = threadIdx.x;
+  const int bones_global_id = threadIdx.x;
+
+  // Initialize the local memory
+  volatile __shared__ <in0_type> bones_local_memory[512];
+  bones_local_memory[bones_local_id] = <in0_name>[bones_global_id];
+  __syncthreads();
+
+  // Perform reduction using a parallel reduction tree
+  int bones_offset_id;
+  for (int c=128; c>=2; c=c>>1) {
+    if (threadIdx.x < c/2) {
+      bones_offset_id = threadIdx.x+c/2;
+      bones_local_memory[bones_local_id] = <algorithm_code2>;
+      __syncthreads();
+    }
+  }
+
+  // Write the final result back to the global memory
+  if (threadIdx.x == 0) { <out0_name>[0] = bones_local_memory[0]; }
+}
+
+// Start of the <algorithm_name> kernel (final, initial value kernel)
+__global__ void bones_kernel_<algorithm_name>_2(<out0_type><out0_devicepointer> bones_initial_value, <out0_type><out0_devicepointer> <out0_name>, <argument_definition>) {
+  <out0_type> bones_private_memory = <out0_name>[0];
+  <out0_type> bones_temporary = bones_initial_value[0];
+  <out0_name>[0] = <algorithm_code4>;
+}
+
+// Function to start the kernel
+extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
+
+  // Store the initial value
+  <out0_type>* bones_initial_value = 0;
+  cudaMalloc(&bones_initial_value, sizeof(<out0_type>));
+  cudaMemcpy(bones_initial_value, <out0_name>, sizeof(<out0_type>), cudaMemcpyDeviceToDevice);
+
+  // Run either one kernel or multiple kernels
+  if (<in0_dimensions> <= 1024) {
+
+    // Start only one kernel
+    const int bones_num_threads = DIV_CEIL(<in0_dimensions>,2);
+    dim3 bones_threads(bones_num_threads);
+    dim3 bones_grid(1);
+    bones_kernel_<algorithm_name>_0<<< bones_grid, bones_threads >>>(<in0_dimensions>,<in0_name>,<out0_name>,<argument_name>);
+  }
+  else {
+
+    // Allocate space for an intermediate array
+    <out0_type>* bones_device_temp = 0;
+    cudaMalloc(&bones_device_temp, 128*sizeof(<out0_type>));
+
+    // Start the first kernel
+    dim3 bones_threads1(512);
+    dim3 bones_grid1(128);
+    bones_kernel_<algorithm_name>_0<<< bones_grid1, bones_threads1 >>>(<in0_dimensions>,<in0_name>,bones_device_temp,<argument_name>);
+
+    // Start the second kernel
+    dim3 bones_threads2(128);
+    dim3 bones_grid2(1);
+    bones_kernel_<algorithm_name>_1<<< bones_grid2, bones_threads2 >>>(bones_device_temp,<out0_name>,<argument_name>);
+
+    cudaFree(bones_device_temp);
+  }
+
+  // Perform the last computation (only needed if there is an initial value)
+  dim3 bones_threads3(1);
+  dim3 bones_grid3(1);
+  bones_kernel_<algorithm_name>_2<<< bones_grid3, bones_threads3 >>>(bones_initial_value,<out0_name>,<argument_name>);
+  cudaFree(bones_initial_value);
+}
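
Besides DIV_CEIL, the reduction skeleton above uses a BONES_MIN helper and several <algorithm_code*> placeholders that the Bones compiler fills in per algorithm species. The sketch below shows a plausible helper definition and, purely as an illustration, how the placeholders might expand for a simple float sum reduction; none of these expansions are taken from the gem itself:

    // Assumed helper (presumably defined alongside DIV_CEIL in the common skeleton code):
    #define BONES_MIN(a,b) (((a) < (b)) ? (a) : (b))

    // Hypothetical placeholder expansions for a float sum reduction (illustrative only):
    //   <in0_type>        -> float
    //   <out0_type>       -> float
    //   <algorithm_code1> -> bones_private_memory + bones_temporary
    //   <algorithm_code2> -> bones_local_memory[bones_local_id] + bones_local_memory[bones_offset_id]
    //   <algorithm_code3> -> bones_temporary
    //   <algorithm_code4> -> bones_private_memory + bones_temporary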
data/skeletons/GPU-CUDA/kernel/D-element-to-N-shared.kernel.cu
@@ -0,0 +1,166 @@
+/* STARTDEF
+void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>);
+ENDDEF */
+
+template<int SCALE>
+__global__ void bones_kernel_<algorithm_name>_0(int *<in0_name>_index, <in0_type> *<in0_name>_value, int *<out0_name>, int const votecount)
+{
+  int nbins = <out0_dimension0_sum>;
+  int nbins_part = ceilf((float)nbins / gridDim.y);
+  int part_offset = blockIdx.y * nbins_part;
+
+  // init temp. vote line in shared memory
+  extern __shared__ int votespace_line[];
+  for(int i=threadIdx.x; i<nbins_part*SCALE; i+=1024)
+    votespace_line[i] = 0;
+  __syncthreads();
+
+  // calculate start and stop index of input for sub-vote spaces
+  int start_index = blockIdx.z *votecount/gridDim.z + threadIdx.x;
+  int stop_index = min( (blockIdx.z+1)*votecount/gridDim.z , votecount);
+
+  for(int i=start_index; i<stop_index; i+=1024)
+  {
+    //int arr_val_index = <in0_name>_index[i];
+    <in0_type> arr_val_value = <in0_name>_value[i];
+    int vote_index = (int)((arr_val_value & 0x00FF) * (nbins / 256.0f));
+    vote_index = SCALE*vote_index + (threadIdx.x & (SCALE-1)) - part_offset;
+    int vote_value = 1; // Vote value
+    if(vote_index<(nbins_part*SCALE) && vote_index>=0)
+      atomicAdd(&votespace_line[vote_index], vote_value);
+  }
+  __syncthreads();
+
+  for(int i=threadIdx.x; i<nbins_part; i+=1024)
+  {
+    int value=0;
+    #pragma unroll
+    for(int j=0; j<SCALE; j++)
+      value += votespace_line[SCALE*i+j];
+
+    <out0_name>[blockIdx.z*nbins*gridDim.x +
+                blockIdx.x*nbins +
+                blockIdx.y*nbins_part + i] = value;
+  }
+}
+
+__global__ void bones_kernel_<algorithm_name>_1(int *in, int *out, int const num_subvotespaces, int const nbins)
+{
+  // Identify the thread
+  int p = blockIdx.x*blockDim.x + threadIdx.x;
+  if(p>nbins)
+    return;
+
+  // Sum the sub-votespaces
+  int result = 0;
+  #pragma unroll
+  for (int i=0;i<num_subvotespaces;i++) {
+    result += in[blockIdx.y*num_subvotespaces*nbins + i*nbins + p];
+  }
+
+  // Write the results to off-chip memory
+  out[blockIdx.y*nbins + p] = result;
+}
+
+// Function to start the kernel
+extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
+  int * gpu_array_index = 0;
+  <in0_type> *gpu_array_value = <in0_name>;
+  int cpu_votecount = <in0_dimensions>;
+  int *gpu_votespace = (int*)<out0_name>;
+  int *gpu_temp = 0;
+
+  int nbins = <out0_dimension0_sum>;
+  int number_multiprocessors = 14;
+  int nbingroups = 1;
+
+  int scaling = 8192/nbins;
+  int split_in_parts = 1;
+  int subvotespaces = 1;
+  int *gpu_out;
+
+  // calculate the scaling factor, and limit it to the values 1, 2, 4, 8, 16, 32, 64 and 128
+  if(scaling < 1) {
+    // too many bins requested, no scaling but splitting
+    scaling = 1;
+    split_in_parts = ceil(nbins / 8192.0f);
+  }
+  else if (scaling > 256) {
+    scaling = 256;
+  }
+  else {
+    int mask = 8192;
+    while(0 == (mask & scaling))
+      mask >>= 1;
+    scaling = mask;
+  }
+
+  if( (nbingroups*split_in_parts) < number_multiprocessors) {
+    int const maxsub = ceil((float)(<in0_dimensions>) / (float)(32*250));
+    cudaMalloc((void**)&gpu_temp, maxsub*nbingroups*nbins*sizeof(int));
+    if (gpu_temp != NULL) {
+      subvotespaces = number_multiprocessors / (nbingroups*split_in_parts);
+      gpu_out = gpu_temp;
+    }
+    else {
+      gpu_out = gpu_votespace;
+    }
+  }
+  else
+  {
+    gpu_out = gpu_votespace;
+  }
+
+  //scaling = 256;
+  //printf("%d %d %d %d %d\n", nbins, scaling, nbingroups, split_in_parts, subvotespaces);
+
+  dim3 dimensionsBlock1(1024);
+  dim3 dimensionsGrid1(nbingroups, split_in_parts, subvotespaces);
+  int const nbins_part = ceilf((float)nbins / split_in_parts);
+
+  switch(scaling) {
+    case 256:
+      bones_kernel_<algorithm_name>_0<256><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
+        (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
+      break;
+    case 128:
+      bones_kernel_<algorithm_name>_0<128><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
+        (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
+      break;
+    case 64:
+      bones_kernel_<algorithm_name>_0< 64><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
+        (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
+      break;
+    case 32:
+      bones_kernel_<algorithm_name>_0< 32><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
+        (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
+      break;
+    case 16:
+      bones_kernel_<algorithm_name>_0< 16><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
+        (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
+      break;
+    case 8:
+      bones_kernel_<algorithm_name>_0< 8><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
+        (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
+      break;
+    case 4:
+      bones_kernel_<algorithm_name>_0< 4><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
+        (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
+      break;
+    case 2:
+      bones_kernel_<algorithm_name>_0< 2><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
+        (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
+      break;
+    default:
+      bones_kernel_<algorithm_name>_0< 1><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
+        (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
+      break;
+  }
+
+  if(subvotespaces > 1) {
+    dim3 dimensionsBlock2(min(nbins,1024));
+    dim3 dimensionsGrid2(ceil((float)nbins/(float)1024), nbingroups);
+    bones_kernel_<algorithm_name>_1<<<dimensionsGrid2, dimensionsBlock2>>>(gpu_out, gpu_votespace, subvotespaces, nbins);
+    cudaFree(gpu_temp);
+  }
+}
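
A note on the constant 8192 in the scaling heuristic above: with split_in_parts == 1 the voting kernel is launched with roughly scaling*nbins integers of dynamic shared memory, so choosing scaling near 8192/nbins keeps that request around 8192 ints (32 KB) per block regardless of the bin count. The check below is our own reasoning about the intent, not something stated in the gem; the arithmetic itself is exact.

    #include <stdio.h>

    // Worked check of the shared-memory budget implied by the scaling heuristic.
    int main(void) {
      int nbins = 256;                               // example bin count
      int scaling = 8192 / nbins;                    // 32, within the 1..256 clamp
      size_t bytes = (size_t)scaling * nbins * sizeof(int);
      printf("%zu bytes\n", bytes);                  // prints 32768, i.e. 32 KB per block
      return 0;
    }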