bones-compiler 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +117 -0
- data/LICENSE +9 -0
- data/README.rdoc +126 -0
- data/Rakefile +107 -0
- data/VERSION +1 -0
- data/bin/bones +20 -0
- data/examples/applications/ffos.c +552 -0
- data/examples/benchmarks/2mm.c +70 -0
- data/examples/benchmarks/3mm.c +81 -0
- data/examples/benchmarks/adi.c +81 -0
- data/examples/benchmarks/atax.c +65 -0
- data/examples/benchmarks/bicg.c +67 -0
- data/examples/benchmarks/cholesky.c +64 -0
- data/examples/benchmarks/common.h +168 -0
- data/examples/benchmarks/correlation.c +97 -0
- data/examples/benchmarks/covariance.c +77 -0
- data/examples/benchmarks/doitgen.c +63 -0
- data/examples/benchmarks/durbin.c +76 -0
- data/examples/benchmarks/dynprog.c +67 -0
- data/examples/benchmarks/fdtd-2d-apml.c +114 -0
- data/examples/benchmarks/fdtd-2d.c +74 -0
- data/examples/benchmarks/floyd-warshall.c +50 -0
- data/examples/benchmarks/gemm.c +69 -0
- data/examples/benchmarks/gemver.c +89 -0
- data/examples/benchmarks/gesummv.c +64 -0
- data/examples/benchmarks/gramschmidt.c +84 -0
- data/examples/benchmarks/jacobi-1d-imper.c +55 -0
- data/examples/benchmarks/jacobi-2d-imper.c +61 -0
- data/examples/benchmarks/lu.c +57 -0
- data/examples/benchmarks/ludcmp.c +91 -0
- data/examples/benchmarks/mvt.c +65 -0
- data/examples/benchmarks/overview.txt +38 -0
- data/examples/benchmarks/reg_detect.c +82 -0
- data/examples/benchmarks/saxpy.c +45 -0
- data/examples/benchmarks/seidel-2d.c +51 -0
- data/examples/benchmarks/symm.c +74 -0
- data/examples/benchmarks/syr2k.c +65 -0
- data/examples/benchmarks/syrk.c +62 -0
- data/examples/benchmarks/trisolv.c +57 -0
- data/examples/benchmarks/trmm.c +57 -0
- data/examples/chunk/example1.c +54 -0
- data/examples/chunk/example2.c +44 -0
- data/examples/chunk/example3.c +59 -0
- data/examples/chunk/example4.c +55 -0
- data/examples/chunk/example5.c +52 -0
- data/examples/element/example1.c +46 -0
- data/examples/element/example10.c +50 -0
- data/examples/element/example11.c +47 -0
- data/examples/element/example12.c +56 -0
- data/examples/element/example2.c +46 -0
- data/examples/element/example3.c +58 -0
- data/examples/element/example4.c +49 -0
- data/examples/element/example5.c +56 -0
- data/examples/element/example6.c +46 -0
- data/examples/element/example7.c +54 -0
- data/examples/element/example8.c +45 -0
- data/examples/element/example9.c +48 -0
- data/examples/neighbourhood/example1.c +54 -0
- data/examples/neighbourhood/example2.c +55 -0
- data/examples/neighbourhood/example3.c +82 -0
- data/examples/neighbourhood/example4.c +52 -0
- data/examples/shared/example1.c +45 -0
- data/examples/shared/example2.c +51 -0
- data/examples/shared/example3.c +55 -0
- data/examples/shared/example4.c +52 -0
- data/examples/shared/example5.c +48 -0
- data/lib/bones.rb +266 -0
- data/lib/bones/algorithm.rb +541 -0
- data/lib/bones/engine.rb +386 -0
- data/lib/bones/preprocessor.rb +161 -0
- data/lib/bones/species.rb +196 -0
- data/lib/bones/structure.rb +94 -0
- data/lib/bones/variable.rb +169 -0
- data/lib/bones/variablelist.rb +72 -0
- data/lib/castaddon.rb +27 -0
- data/lib/castaddon/index.rb +40 -0
- data/lib/castaddon/node.rb +753 -0
- data/lib/castaddon/type.rb +37 -0
- data/skeletons/CPU-C/common/epilogue.c +0 -0
- data/skeletons/CPU-C/common/globals.c +17 -0
- data/skeletons/CPU-C/common/globals_kernel.c +1 -0
- data/skeletons/CPU-C/common/header.c +0 -0
- data/skeletons/CPU-C/common/mem_copy_D2H.c +0 -0
- data/skeletons/CPU-C/common/mem_copy_H2D.c +0 -0
- data/skeletons/CPU-C/common/mem_epilogue.c +0 -0
- data/skeletons/CPU-C/common/mem_prologue.c +3 -0
- data/skeletons/CPU-C/common/prologue.c +0 -0
- data/skeletons/CPU-C/common/timer_1_start.c +0 -0
- data/skeletons/CPU-C/common/timer_1_stop.c +0 -0
- data/skeletons/CPU-C/common/timer_2_start.c +20 -0
- data/skeletons/CPU-C/common/timer_2_stop.c +8 -0
- data/skeletons/CPU-C/kernel/default.host.c +3 -0
- data/skeletons/CPU-C/kernel/default.kernel.c +15 -0
- data/skeletons/CPU-C/skeletons.txt +24 -0
- data/skeletons/CPU-OPENCL-AMD/common/epilogue.c +6 -0
- data/skeletons/CPU-OPENCL-AMD/common/globals.c +155 -0
- data/skeletons/CPU-OPENCL-AMD/common/globals_kernel.c +4 -0
- data/skeletons/CPU-OPENCL-AMD/common/header.c +0 -0
- data/skeletons/CPU-OPENCL-AMD/common/mem_copy_D2H.c +8 -0
- data/skeletons/CPU-OPENCL-AMD/common/mem_copy_H2D.c +4 -0
- data/skeletons/CPU-OPENCL-AMD/common/mem_epilogue.c +3 -0
- data/skeletons/CPU-OPENCL-AMD/common/mem_prologue.c +6 -0
- data/skeletons/CPU-OPENCL-AMD/common/prologue.c +24 -0
- data/skeletons/CPU-OPENCL-AMD/common/timer_1_start.c +5 -0
- data/skeletons/CPU-OPENCL-AMD/common/timer_1_stop.c +9 -0
- data/skeletons/CPU-OPENCL-AMD/common/timer_2_start.c +16 -0
- data/skeletons/CPU-OPENCL-AMD/common/timer_2_stop.c +11 -0
- data/skeletons/CPU-OPENCL-AMD/kernel/D-element-to-1-shared.host.c +67 -0
- data/skeletons/CPU-OPENCL-AMD/kernel/D-element-to-1-shared.kernel.cl +72 -0
- data/skeletons/CPU-OPENCL-AMD/kernel/default.host.c +14 -0
- data/skeletons/CPU-OPENCL-AMD/kernel/default.kernel.cl +13 -0
- data/skeletons/CPU-OPENCL-AMD/skeletons.txt +26 -0
- data/skeletons/CPU-OPENCL-INTEL/common/epilogue.c +3 -0
- data/skeletons/CPU-OPENCL-INTEL/common/globals.c +154 -0
- data/skeletons/CPU-OPENCL-INTEL/common/globals_kernel.c +4 -0
- data/skeletons/CPU-OPENCL-INTEL/common/header.c +31 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_D2H.c +5 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_H2D.c +3 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_epilogue.c +3 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_prologue.c +4 -0
- data/skeletons/CPU-OPENCL-INTEL/common/prologue.c +24 -0
- data/skeletons/CPU-OPENCL-INTEL/common/timer_1_start.c +5 -0
- data/skeletons/CPU-OPENCL-INTEL/common/timer_1_stop.c +9 -0
- data/skeletons/CPU-OPENCL-INTEL/common/timer_2_start.c +16 -0
- data/skeletons/CPU-OPENCL-INTEL/common/timer_2_stop.c +11 -0
- data/skeletons/CPU-OPENCL-INTEL/kernel/D-element-to-1-shared.host.c +67 -0
- data/skeletons/CPU-OPENCL-INTEL/kernel/D-element-to-1-shared.kernel.cl +72 -0
- data/skeletons/CPU-OPENCL-INTEL/kernel/default.host.c +14 -0
- data/skeletons/CPU-OPENCL-INTEL/kernel/default.kernel.cl +13 -0
- data/skeletons/CPU-OPENCL-INTEL/skeletons.txt +26 -0
- data/skeletons/CPU-OPENMP/common/epilogue.c +0 -0
- data/skeletons/CPU-OPENMP/common/globals.c +37 -0
- data/skeletons/CPU-OPENMP/common/globals_kernel.c +6 -0
- data/skeletons/CPU-OPENMP/common/header.c +0 -0
- data/skeletons/CPU-OPENMP/common/mem_copy_D2H.c +0 -0
- data/skeletons/CPU-OPENMP/common/mem_copy_H2D.c +0 -0
- data/skeletons/CPU-OPENMP/common/mem_epilogue.c +0 -0
- data/skeletons/CPU-OPENMP/common/mem_prologue.c +3 -0
- data/skeletons/CPU-OPENMP/common/prologue.c +0 -0
- data/skeletons/CPU-OPENMP/common/timer_1_start.c +12 -0
- data/skeletons/CPU-OPENMP/common/timer_1_stop.c +0 -0
- data/skeletons/CPU-OPENMP/common/timer_2_start.c +18 -0
- data/skeletons/CPU-OPENMP/common/timer_2_stop.c +8 -0
- data/skeletons/CPU-OPENMP/kernel/D-element-to-1-shared.host.c +27 -0
- data/skeletons/CPU-OPENMP/kernel/D-element-to-1-shared.kernel.c +46 -0
- data/skeletons/CPU-OPENMP/kernel/default.host.c +11 -0
- data/skeletons/CPU-OPENMP/kernel/default.kernel.c +18 -0
- data/skeletons/CPU-OPENMP/skeletons.txt +26 -0
- data/skeletons/GPU-CUDA/common/epilogue.c +0 -0
- data/skeletons/GPU-CUDA/common/globals.c +31 -0
- data/skeletons/GPU-CUDA/common/globals_kernel.c +4 -0
- data/skeletons/GPU-CUDA/common/header.c +0 -0
- data/skeletons/GPU-CUDA/common/mem_copy_D2H.c +3 -0
- data/skeletons/GPU-CUDA/common/mem_copy_H2D.c +3 -0
- data/skeletons/GPU-CUDA/common/mem_epilogue.c +3 -0
- data/skeletons/GPU-CUDA/common/mem_prologue.c +5 -0
- data/skeletons/GPU-CUDA/common/prologue.c +6 -0
- data/skeletons/GPU-CUDA/common/timer_1_start.c +6 -0
- data/skeletons/GPU-CUDA/common/timer_1_stop.c +10 -0
- data/skeletons/GPU-CUDA/common/timer_2_start.c +6 -0
- data/skeletons/GPU-CUDA/common/timer_2_stop.c +10 -0
- data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.kernel.cu +105 -0
- data/skeletons/GPU-CUDA/kernel/D-element-to-1-shared.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/D-element-to-1-shared.kernel.cu +119 -0
- data/skeletons/GPU-CUDA/kernel/D-element-to-N-shared.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/D-element-to-N-shared.kernel.cu +166 -0
- data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.kernel.cu +69 -0
- data/skeletons/GPU-CUDA/kernel/N-neighbourhood-N-to-N-element.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/N-neighbourhood-N-to-N-element.kernel.cu +42 -0
- data/skeletons/GPU-CUDA/kernel/default.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/default.kernel.cu +28 -0
- data/skeletons/GPU-CUDA/skeletons.txt +30 -0
- data/skeletons/GPU-OPENCL-AMD/common/epilogue.c +3 -0
- data/skeletons/GPU-OPENCL-AMD/common/globals.c +155 -0
- data/skeletons/GPU-OPENCL-AMD/common/globals_kernel.c +4 -0
- data/skeletons/GPU-OPENCL-AMD/common/header.c +0 -0
- data/skeletons/GPU-OPENCL-AMD/common/mem_copy_D2H.c +4 -0
- data/skeletons/GPU-OPENCL-AMD/common/mem_copy_H2D.c +4 -0
- data/skeletons/GPU-OPENCL-AMD/common/mem_epilogue.c +3 -0
- data/skeletons/GPU-OPENCL-AMD/common/mem_prologue.c +3 -0
- data/skeletons/GPU-OPENCL-AMD/common/prologue.c +24 -0
- data/skeletons/GPU-OPENCL-AMD/common/timer_1_start.c +5 -0
- data/skeletons/GPU-OPENCL-AMD/common/timer_1_stop.c +9 -0
- data/skeletons/GPU-OPENCL-AMD/common/timer_2_start.c +4 -0
- data/skeletons/GPU-OPENCL-AMD/common/timer_2_stop.c +11 -0
- data/skeletons/GPU-OPENCL-AMD/kernel/D-element-to-1-shared.host.c +67 -0
- data/skeletons/GPU-OPENCL-AMD/kernel/D-element-to-1-shared.kernel.cl +72 -0
- data/skeletons/GPU-OPENCL-AMD/kernel/default.host.c +14 -0
- data/skeletons/GPU-OPENCL-AMD/kernel/default.kernel.cl +13 -0
- data/skeletons/GPU-OPENCL-AMD/skeletons.txt +26 -0
- data/skeletons/verification/header.c +2 -0
- data/skeletons/verification/timer_start.c +4 -0
- data/skeletons/verification/timer_stop.c +6 -0
- data/skeletons/verification/verify_results.c +23 -0
- data/test/bones/test_algorithm.rb +40 -0
- data/test/bones/test_common.rb +54 -0
- data/test/bones/test_preprocessor.rb +46 -0
- data/test/bones/test_species.rb +21 -0
- data/test/bones/test_variable.rb +84 -0
- data/test/test_helper.rb +106 -0
- metadata +303 -0

data/skeletons/GPU-CUDA/common/timer_1_stop.c
@@ -0,0 +1,10 @@
+
+// End the timer for the measurement of the kernel and memory copy execution time
+cudaThreadSynchronize();
+cudaEvent_t bones_stop1;
+cudaEventCreate(&bones_stop1);
+cudaEventRecord(bones_stop1,0);
+cudaEventSynchronize(bones_stop1);
+float bones_timer1 = 0;
+cudaEventElapsedTime(&bones_timer1,bones_start1,bones_stop1);
+printf(">>>\t\t (<algorithm_basename>): Execution time [kernel+memcpy]: %.3lf ms \n", bones_timer1);

data/skeletons/GPU-CUDA/common/timer_2_stop.c
@@ -0,0 +1,10 @@
+
+// Stop the timer for the measurement of the kernel execution time
+cudaThreadSynchronize();
+cudaEvent_t bones_stop2;
+cudaEventCreate(&bones_stop2);
+cudaEventRecord(bones_stop2,0);
+cudaEventSynchronize(bones_stop2);
+float bones_timer2 = 0;
+cudaEventElapsedTime(&bones_timer2,bones_start2,bones_stop2);
+printf(">>>\t\t (<algorithm_basename>): Execution time [kernel       ]: %.3lf ms \n", bones_timer2);
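
Both timer fragments above follow the standard CUDA event pattern: synchronize, record a stop event, wait for it, and read the elapsed time against the matching start event from the corresponding timer_*_start.c skeleton. Placeholders in angle brackets, such as <algorithm_basename>, are substituted by the Bones generator when a skeleton is instantiated. Below is a minimal, self-contained sketch of the same pattern, assuming a hypothetical my_kernel; note that cudaThreadSynchronize() is deprecated in current CUDA in favour of cudaDeviceSynchronize(), which the sketch avoids needing by synchronizing on the stop event instead.

// Sketch only: times a hypothetical kernel with CUDA events,
// the same pattern as the timer_1/timer_2 skeletons above.
#include <cstdio>

__global__ void my_kernel() {}

int main() {
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start, 0);            // start marker on stream 0
  my_kernel<<<1, 32>>>();               // the work being measured
  cudaEventRecord(stop, 0);             // stop marker on stream 0
  cudaEventSynchronize(stop);           // wait until the stop event is reached
  float ms = 0.0f;
  cudaEventElapsedTime(&ms, start, stop);
  printf("Execution time: %.3f ms\n", ms);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  return 0;
}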

data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.kernel.cu
@@ -0,0 +1,105 @@
+/* STARTDEF
+void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>);
+ENDDEF */
+#define SHUFFLE_X 16
+#define SHUFFLE_Y 16
+
+// Start of the <algorithm_name> kernel
+__global__ void bones_kernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
+  const int bones_global_id = blockIdx.x*blockDim.x + threadIdx.x;
+  if (bones_global_id < (<parallelism>)) {
+
+    // Calculate the global ID(s) based on the thread id
+    <ids>
+
+    // Start the computation
+    <algorithm_code1>
+  }
+}
+
+// Start of the <algorithm_name> kernel (pre-kernel for shuffling) - for first input
+__global__ void bones_kernel_<algorithm_name>_1(<in0_type><in0_devicepointer> <in0_name>, <in0_type><in0_devicepointer> shuffled_<in0_name>, <argument_definition>) {
+  const int bones_global_id_0 = blockIdx.x*blockDim.x + threadIdx.x;
+  const int bones_global_id_1 = blockIdx.y*blockDim.y + threadIdx.y;
+
+  // Set-up the local memory for shuffling
+  __shared__ <in0_type> buffer[SHUFFLE_X][SHUFFLE_Y];
+
+  // Swap the x and y coordinates to perform the rotation (coalesced)
+  if (bones_global_id_0 < ((<in0_dimensions>)/(<in0_parameters>)) && bones_global_id_1 < (<in0_parameters>)) {
+    buffer[threadIdx.y][threadIdx.x] = <in0_name>[bones_global_id_0 + bones_global_id_1 * ((<in0_dimensions>)/(<in0_parameters>))];
+  }
+
+  // Synchronize all threads in the threadblock
+  __syncthreads();
+
+  // We don't have to swap the x and y thread indices here, because that's already done in the local memory
+  const int bones_global_id_0_new = blockIdx.y*blockDim.y + threadIdx.x;
+  const int bones_global_id_1_new = blockIdx.x*blockDim.x + threadIdx.y;
+
+  // Store the shuffled result (coalesced)
+  if (bones_global_id_0_new < ((<in0_dimensions>)/(<in0_parameters>)) && bones_global_id_1_new < (<in0_parameters>)) {
+    shuffled_<in0_name>[bones_global_id_0_new + bones_global_id_1_new * <in0_parameters>] = buffer[threadIdx.x][threadIdx.y];
+  }
+}
+
+// Start of the <algorithm_name> kernel (pre-kernel for shuffling) - for second input
+__global__ void bones_kernel_<algorithm_name>_2(<in1_type><in1_devicepointer> <in1_name>, <in1_type><in1_devicepointer> shuffled_<in1_name>, <argument_definition>) {
+  const int bones_global_id_0 = blockIdx.x*blockDim.x + threadIdx.x;
+  const int bones_global_id_1 = blockIdx.y*blockDim.y + threadIdx.y;
+
+  // Set-up the local memory for shuffling
+  __shared__ <in1_type> buffer[SHUFFLE_X][SHUFFLE_Y];
+
+  // Swap the x and y coordinates to perform the rotation (coalesced)
+  if (bones_global_id_0 < ((<in1_dimensions>)/(<in1_parameters>)) && bones_global_id_1 < (<in1_parameters>)) {
+    buffer[threadIdx.y][threadIdx.x] = <in1_name>[bones_global_id_0 + bones_global_id_1 * ((<in1_dimensions>)/(<in1_parameters>))];
+  }
+
+  // Synchronize all threads in the threadblock
+  __syncthreads();
+
+  // We don't have to swap the x and y thread indices here, because that's already done in the local memory
+  const int bones_global_id_0_new = blockIdx.y*blockDim.y + threadIdx.x;
+  const int bones_global_id_1_new = blockIdx.x*blockDim.x + threadIdx.y;
+
+  // Store the shuffled result (coalesced)
+  if (bones_global_id_0_new < ((<in1_dimensions>)/(<in1_parameters>)) && bones_global_id_1_new < (<in1_parameters>)) {
+    shuffled_<in1_name>[bones_global_id_0_new + bones_global_id_1_new * <in1_parameters>] = buffer[threadIdx.x][threadIdx.y];
+  }
+}
+
+// Function to start the kernel
+extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
+  int bones_block_size;
+  if      (<parallelism> >= 64*512) { bones_block_size = 512; }
+  else if (<parallelism> >= 64*256) { bones_block_size = 256; }
+  else if (<parallelism> >= 64*128) { bones_block_size = 128; }
+  else if (<parallelism> >= 64*64 ) { bones_block_size = 64;  }
+  else                              { bones_block_size = 32;  }
+
+  // First perform some pre-shuffling (for the first input)
+  <in0_type>* shuffled_<in0_name> = 0;
+  cudaMalloc((void**)&shuffled_<in0_name>, <in0_dimensions>*sizeof(<in0_type>));
+  dim3 bones_threads1(SHUFFLE_X,SHUFFLE_Y);
+  dim3 bones_grid1(DIV_CEIL(((<in0_dimensions>)/(<in0_parameters>)),SHUFFLE_X),DIV_CEIL(<in0_parameters>,SHUFFLE_Y));
+  bones_kernel_<algorithm_name>_1<<< bones_grid1, bones_threads1 >>>(<in0_name>, shuffled_<in0_name>, <argument_name>);
+  <in0_type>* temp_<in0_name> = <in0_name>;
+  <in0_name> = shuffled_<in0_name>;
+  cudaFree(temp_<in0_name>);
+
+  // Then perform some pre-shuffling (for the second input)
+  <in1_type>* shuffled_<in1_name> = 0;
+  cudaMalloc((void**)&shuffled_<in1_name>, <in1_dimensions>*sizeof(<in1_type>));
+  dim3 bones_threads2(SHUFFLE_X,SHUFFLE_Y);
+  dim3 bones_grid2(DIV_CEIL(((<in1_dimensions>)/(<in1_parameters>)),SHUFFLE_X),DIV_CEIL(<in1_parameters>,SHUFFLE_Y));
+  bones_kernel_<algorithm_name>_2<<< bones_grid2, bones_threads2 >>>(<in1_name>, shuffled_<in1_name>, <argument_name>);
+  <in1_type>* temp_<in1_name> = <in1_name>;
+  <in1_name> = shuffled_<in1_name>;
+  cudaFree(temp_<in1_name>);
+
+  // Then run the original kernel
+  dim3 bones_threads0(bones_block_size);
+  dim3 bones_grid0(DIV_CEIL(<parallelism>,bones_block_size));
+  bones_kernel_<algorithm_name>_0<<< bones_grid0, bones_threads0 >>>(<names>, <argument_name>);
+}
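
The two shuffling pre-kernels above are a tiled shared-memory transpose: each 16x16 block reads a tile from the input with coalesced accesses, swaps the x/y roles inside the shared buffer, and writes the tile back coalesced at its transposed position. A minimal sketch with the template placeholders resolved by hand for a rows-by-cols float matrix (the names transpose16, in, out, rows and cols are illustrative, not part of the skeleton):

#define SHUFFLE_X 16
#define SHUFFLE_Y 16

// Sketch only: transpose a rows x cols matrix into a cols x rows matrix.
// Launch with dim3(SHUFFLE_X,SHUFFLE_Y) threads per block.
__global__ void transpose16(const float* in, float* out, int rows, int cols) {
  __shared__ float buffer[SHUFFLE_X][SHUFFLE_Y];
  const int x = blockIdx.x*blockDim.x + threadIdx.x;  // column in the input
  const int y = blockIdx.y*blockDim.y + threadIdx.y;  // row in the input
  if (x < cols && y < rows) {
    buffer[threadIdx.y][threadIdx.x] = in[x + y*cols];  // coalesced read
  }
  __syncthreads();
  // Swap the block coordinates; the thread coordinates were already
  // swapped when writing into the shared buffer
  const int x_new = blockIdx.y*blockDim.y + threadIdx.x;  // column in the output
  const int y_new = blockIdx.x*blockDim.x + threadIdx.y;  // row in the output
  if (x_new < rows && y_new < cols) {
    out[x_new + y_new*rows] = buffer[threadIdx.x][threadIdx.y];  // coalesced write
  }
}

A matching launch would use a DIV_CEIL(cols,SHUFFLE_X) by DIV_CEIL(rows,SHUFFLE_Y) grid, as bones_grid1 does above with the dimensions expressed through the <in0_...> placeholders.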

data/skeletons/GPU-CUDA/kernel/D-element-to-1-shared.kernel.cu
@@ -0,0 +1,119 @@
+/* STARTDEF
+void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>);
+ENDDEF */
+// Start of the <algorithm_name> kernel (main, not unrolled kernel)
+__global__ void bones_kernel_<algorithm_name>_0(int bones_input_size, <in0_type><in0_devicepointer> <in0_name>, <out0_type><out0_devicepointer> <out0_name>, <argument_definition>) {
+  const int bones_threadblock_work = DIV_CEIL(bones_input_size,gridDim.x);
+  const int bones_parallel_work = BONES_MIN(blockDim.x,bones_threadblock_work);
+  const int bones_sequential_work = DIV_CEIL(bones_threadblock_work,bones_parallel_work);
+  const int bones_local_id = threadIdx.x;
+  const int bones_global_id = blockIdx.x*bones_parallel_work + threadIdx.x;
+  <ids>
+  int bones_iter_id = <in0_flatindex>;
+
+  // Load data into thread private memory and perform the first computation(s) sequentially
+  <in0_type> bones_temporary = <in0_name>[bones_iter_id];
+  <in0_type> bones_private_memory = <algorithm_code3>;
+  for(int c=1; c<bones_sequential_work; c++) {
+    bones_iter_id = bones_iter_id + bones_parallel_work*gridDim.x<factors>;
+    if (bones_iter_id <= <in0_to>) {
+      bones_temporary = <in0_name>[bones_iter_id];
+      bones_private_memory = <algorithm_code1>;
+    }
+  }
+
+  // Initialize the local memory
+  volatile __shared__ <in0_type> bones_local_memory[512];
+  bones_local_memory[bones_local_id] = bones_private_memory;
+  __syncthreads();
+
+  // Perform the remainder of the computations in parallel using a parallel reduction tree
+  int bones_offset_id;
+  for (int c=512; c>=2; c=c>>1) {
+    if ((2*bones_parallel_work > c) && (threadIdx.x < c/2)) {
+      bones_offset_id = threadIdx.x+c/2;
+      if (bones_offset_id < bones_parallel_work) {
+        __syncthreads();
+        bones_local_memory[bones_local_id] = <algorithm_code2>;
+      }
+    }
+    __syncthreads();
+  }
+
+  // Write the final result back to the global memory
+  if (threadIdx.x == 0) { <out0_name>[blockIdx.x] = bones_local_memory[0]; }
+}
+
+// Start of the <algorithm_name> kernel (secondary, not unrolled kernel)
+__global__ void bones_kernel_<algorithm_name>_1(<in0_type><in0_devicepointer> <in0_name>, <out0_type><out0_devicepointer> <out0_name>, <argument_definition>) {
+  const int bones_local_id = threadIdx.x;
+  const int bones_global_id = threadIdx.x;
+
+  // Initialize the local memory
+  volatile __shared__ <in0_type> bones_local_memory[512];
+  bones_local_memory[bones_local_id] = <in0_name>[bones_global_id];
+  __syncthreads();
+
+  // Perform reduction using a parallel reduction tree
+  int bones_offset_id;
+  for (int c=128; c>=2; c=c>>1) {
+    if (threadIdx.x < c/2) {
+      bones_offset_id = threadIdx.x+c/2;
+      bones_local_memory[bones_local_id] = <algorithm_code2>;
+      __syncthreads();
+    }
+  }
+
+  // Write the final result back to the global memory
+  if (threadIdx.x == 0) { <out0_name>[0] = bones_local_memory[0]; }
+}
+
+// Start of the <algorithm_name> kernel (final, initial value kernel)
+__global__ void bones_kernel_<algorithm_name>_2(<out0_type><out0_devicepointer> bones_initial_value, <out0_type><out0_devicepointer> <out0_name>, <argument_definition>) {
+  <out0_type> bones_private_memory = <out0_name>[0];
+  <out0_type> bones_temporary = bones_initial_value[0];
+  <out0_name>[0] = <algorithm_code4>;
+}
+
+// Function to start the kernel
+extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
+
+  // Store the initial value
+  <out0_type>* bones_initial_value = 0;
+  cudaMalloc(&bones_initial_value, sizeof(<out0_type>));
+  cudaMemcpy(bones_initial_value, <out0_name>, sizeof(<out0_type>), cudaMemcpyDeviceToDevice);
+
+  // Run either one kernel or multiple kernels
+  if (<in0_dimensions> <= 1024) {
+
+    // Start only one kernel
+    const int bones_num_threads = DIV_CEIL(<in0_dimensions>,2);
+    dim3 bones_threads(bones_num_threads);
+    dim3 bones_grid(1);
+    bones_kernel_<algorithm_name>_0<<< bones_grid, bones_threads >>>(<in0_dimensions>,<in0_name>,<out0_name>,<argument_name>);
+  }
+  else {
+
+    // Allocate space for an intermediate array
+    <out0_type>* bones_device_temp = 0;
+    cudaMalloc(&bones_device_temp, 128*sizeof(<out0_type>));
+
+    // Start the first kernel
+    dim3 bones_threads1(512);
+    dim3 bones_grid1(128);
+    bones_kernel_<algorithm_name>_0<<< bones_grid1, bones_threads1 >>>(<in0_dimensions>,<in0_name>,bones_device_temp,<argument_name>);
+
+    // Start the second kernel
+    dim3 bones_threads2(128);
+    dim3 bones_grid2(1);
+    bones_kernel_<algorithm_name>_1<<< bones_grid2, bones_threads2 >>>(bones_device_temp,<out0_name>,<argument_name>);
+
+    cudaFree(bones_device_temp);
+  }
+
+  // Perform the last computation (only needed if there is an initial value)
+  dim3 bones_threads3(1);
+  dim3 bones_grid3(1);
+  bones_kernel_<algorithm_name>_2<<< bones_grid3, bones_threads3 >>>(bones_initial_value,<out0_name>,<argument_name>);
+  cudaFree(bones_initial_value);
+}
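
The reduction skeleton above works in two stages: kernel _0 has each thread fold several inputs sequentially into private memory, then combines the per-thread values through a shared-memory reduction tree and emits one partial result per block; kernel _1 reduces the (at most 128) partials, and kernel _2 folds in the initial value of the output. A minimal sketch of the same two-stage sum, assuming a power-of-two block size; unlike the skeleton, which relies on a volatile shared buffer, the sketch keeps every __syncthreads() outside the divergent branch:

// Sketch only: each block writes one partial sum; a second launch with a
// single block reduces the partials, mirroring the host code above.
__global__ void reduce_sum(const float* in, float* out, int n) {
  __shared__ float buf[512];            // sized for the largest block used
  const int tid = threadIdx.x;
  // Sequential stage: each thread folds a strided slice into a private value
  float acc = 0.0f;
  for (int i = blockIdx.x*blockDim.x + tid; i < n; i += gridDim.x*blockDim.x) {
    acc += in[i];
  }
  buf[tid] = acc;
  __syncthreads();
  // Parallel stage: reduction tree (blockDim.x must be a power of two)
  for (int c = blockDim.x/2; c > 0; c >>= 1) {
    if (tid < c) { buf[tid] += buf[tid + c]; }
    __syncthreads();                    // all threads reach this barrier
  }
  if (tid == 0) { out[blockIdx.x] = buf[0]; }  // one partial per block
}

For a large input this would be launched as reduce_sum<<<128, 512>>>(in, partials, n) followed by reduce_sum<<<1, 128>>>(partials, out, 128), just as the skeleton's host code starts kernel _0 with a 128-block grid and then kernel _1 with a single 128-thread block.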

data/skeletons/GPU-CUDA/kernel/D-element-to-N-shared.kernel.cu
@@ -0,0 +1,166 @@
+/* STARTDEF
+void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>);
+ENDDEF */
+
+template<int SCALE>
+__global__ void bones_kernel_<algorithm_name>_0(int *<in0_name>_index, <in0_type> *<in0_name>_value, int *<out0_name>, int const votecount)
+{
+  int nbins = <out0_dimension0_sum>;
+  int nbins_part = ceilf((float)nbins / gridDim.y);
+  int part_offset = blockIdx.y * nbins_part;
+
+  // init temp. vote line in shared memory
+  extern __shared__ int votespace_line[];
+  for(int i=threadIdx.x; i<nbins_part*SCALE; i+=1024)
+    votespace_line[i] = 0;
+  __syncthreads();
+
+  // calculate start and stop index of input for sub-vote spaces
+  int start_index = blockIdx.z *votecount/gridDim.z + threadIdx.x;
+  int stop_index = min( (blockIdx.z+1)*votecount/gridDim.z , votecount);
+
+  for(int i=start_index; i<stop_index; i+=1024)
+  {
+    //int arr_val_index = <in0_name>_index[i];
+    <in0_type> arr_val_value = <in0_name>_value[i];
+    int vote_index = (int)((arr_val_value & 0x00FF) * (nbins / 256.0f));
+    vote_index = SCALE*vote_index + (threadIdx.x & (SCALE-1)) - part_offset;
+    int vote_value = 1; // Vote value
+    if(vote_index<(nbins_part*SCALE) && vote_index>=0)
+      atomicAdd(&votespace_line[vote_index], vote_value);
+  }
+  __syncthreads();
+
+  for(int i=threadIdx.x; i<nbins_part; i+=1024)
+  {
+    int value=0;
+    #pragma unroll
+    for(int j=0; j<SCALE; j++)
+      value += votespace_line[SCALE*i+j];
+
+    <out0_name>[blockIdx.z*nbins*gridDim.x +
+                blockIdx.x*nbins +
+                blockIdx.y*nbins_part + i] = value;
+  }
+}
+
+__global__ void bones_kernel_<algorithm_name>_1(int *in, int *out, int const num_subvotespaces, int const nbins)
+{
+  // Identify the thread
+  int p = blockIdx.x*blockDim.x + threadIdx.x;
+  if(p>=nbins)
+    return;
+
+  // Sum the sub-votespaces
+  int result = 0;
+  #pragma unroll
+  for (int i=0;i<num_subvotespaces;i++) {
+    result += in[blockIdx.y*num_subvotespaces*nbins + i*nbins + p];
+  }
+
+  // Write the results to off-chip memory
+  out[blockIdx.y*nbins + p] = result;
+}
+
+// Function to start the kernel
+extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
+  int * gpu_array_index = 0;
+  <in0_type> *gpu_array_value = <in0_name>;
+  int cpu_votecount = <in0_dimensions>;
+  int *gpu_votespace = (int*)<out0_name>;
+  int *gpu_temp = 0;
+
+  int nbins = <out0_dimension0_sum>;
+  int number_multiprocessors = 14;
+  int nbingroups = 1;
+
+  int scaling=8192/nbins;
+  int split_in_parts = 1;
+  int subvotespaces = 1;
+  int *gpu_out;
+
+  // calculate the scaling factor, and limit it to the values 1, 2, 4, 8, 16, 32, 64, 128 and 256
+  if(scaling < 1) {
+    // too many bins requested, no scaling but splitting
+    scaling = 1;
+    split_in_parts = ceil(nbins / 8192.0f);
+  }
+  else if (scaling > 256) {
+    scaling = 256;
+  }
+  else {
+    int mask = 8192;
+    while(0 == (mask & scaling))
+      mask >>= 1;
+    scaling = mask;
+  }
+
+  if( (nbingroups*split_in_parts) < number_multiprocessors) {
+    int const maxsub = ceil((float)(<in0_dimensions>) / (float)(32*250));
+    cudaMalloc((void**)&gpu_temp, maxsub*nbingroups*nbins*sizeof(int));
+    if (gpu_temp != NULL) {
+      subvotespaces = number_multiprocessors / (nbingroups*split_in_parts);
+      gpu_out = gpu_temp;
+    }
+    else {
+      gpu_out = gpu_votespace;
+    }
+  }
+  else
+  {
+    gpu_out = gpu_votespace;
+  }
+
+  //scaling = 256;
+  //printf("%d %d %d %d %d\n", nbins, scaling, nbingroups, split_in_parts, subvotespaces);
+
+  dim3 dimensionsBlock1(1024);
+  dim3 dimensionsGrid1(nbingroups, split_in_parts, subvotespaces);
+  int const nbins_part = ceilf((float)nbins / split_in_parts);
+
+  switch(scaling) {
+    case 256:
+      bones_kernel_<algorithm_name>_0<256><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
+        (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
+      break;
+    case 128:
+      bones_kernel_<algorithm_name>_0<128><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
+        (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
+      break;
+    case 64:
+      bones_kernel_<algorithm_name>_0< 64><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
+        (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
+      break;
+    case 32:
+      bones_kernel_<algorithm_name>_0< 32><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
+        (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
+      break;
+    case 16:
+      bones_kernel_<algorithm_name>_0< 16><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
+        (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
+      break;
+    case 8:
+      bones_kernel_<algorithm_name>_0<  8><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
+        (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
+      break;
+    case 4:
+      bones_kernel_<algorithm_name>_0<  4><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
+        (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
+      break;
+    case 2:
+      bones_kernel_<algorithm_name>_0<  2><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
+        (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
+      break;
+    default:
+      bones_kernel_<algorithm_name>_0<  1><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
+        (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
+      break;
+  }
+
+  if(subvotespaces > 1) {
+    dim3 dimensionsBlock2(min(nbins,1024));
+    dim3 dimensionsGrid2(ceil((float)nbins/(float)1024), nbingroups);
+    bones_kernel_<algorithm_name>_1<<<dimensionsGrid2, dimensionsBlock2>>>(gpu_out, gpu_votespace, subvotespaces, nbins);
+    cudaFree(gpu_temp);
+  }
+}
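
The voting skeleton above is a privatized histogram: every block accumulates a partial vote space in shared memory, each bin is replicated SCALE times so that neighbouring threads vote into different replicas and shared-memory atomic contention drops, the replicas are summed before being written out, and kernel _1 merges the per-block sub-vote-spaces. A minimal sketch of the replicated-bin idea for a fixed 256-bin byte histogram (histogram256, NBINS, SCALE and the other names are illustrative; bins must be zeroed by the caller):

#define NBINS 256
#define SCALE 4   // replicas per bin; must be a power of two

// Sketch only: one privatized sub-histogram per block, merged with atomics.
__global__ void histogram256(const unsigned char* data, int n, int* bins) {
  __shared__ int votes[NBINS*SCALE];
  for (int i = threadIdx.x; i < NBINS*SCALE; i += blockDim.x) {
    votes[i] = 0;                       // clear the replicated vote space
  }
  __syncthreads();
  for (int i = blockIdx.x*blockDim.x + threadIdx.x; i < n;
       i += gridDim.x*blockDim.x) {
    const int bin = data[i];            // bin index in [0, NBINS)
    // Neighbouring threads vote into different replicas of the same bin
    atomicAdd(&votes[SCALE*bin + (threadIdx.x & (SCALE-1))], 1);
  }
  __syncthreads();
  for (int i = threadIdx.x; i < NBINS; i += blockDim.x) {
    int value = 0;
    for (int j = 0; j < SCALE; j++) { value += votes[SCALE*i + j]; }
    atomicAdd(&bins[i], value);         // merge this block's sub-histogram
  }
}

The replica is picked with threadIdx.x & (SCALE-1), which only works when SCALE is a power of two; that is why the host code above rounds the computed scaling factor down to a power of two with the mask loop before entering the switch.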