bones-compiler 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (203)
  1. data/CHANGELOG +117 -0
  2. data/LICENSE +9 -0
  3. data/README.rdoc +126 -0
  4. data/Rakefile +107 -0
  5. data/VERSION +1 -0
  6. data/bin/bones +20 -0
  7. data/examples/applications/ffos.c +552 -0
  8. data/examples/benchmarks/2mm.c +70 -0
  9. data/examples/benchmarks/3mm.c +81 -0
  10. data/examples/benchmarks/adi.c +81 -0
  11. data/examples/benchmarks/atax.c +65 -0
  12. data/examples/benchmarks/bicg.c +67 -0
  13. data/examples/benchmarks/cholesky.c +64 -0
  14. data/examples/benchmarks/common.h +168 -0
  15. data/examples/benchmarks/correlation.c +97 -0
  16. data/examples/benchmarks/covariance.c +77 -0
  17. data/examples/benchmarks/doitgen.c +63 -0
  18. data/examples/benchmarks/durbin.c +76 -0
  19. data/examples/benchmarks/dynprog.c +67 -0
  20. data/examples/benchmarks/fdtd-2d-apml.c +114 -0
  21. data/examples/benchmarks/fdtd-2d.c +74 -0
  22. data/examples/benchmarks/floyd-warshall.c +50 -0
  23. data/examples/benchmarks/gemm.c +69 -0
  24. data/examples/benchmarks/gemver.c +89 -0
  25. data/examples/benchmarks/gesummv.c +64 -0
  26. data/examples/benchmarks/gramschmidt.c +84 -0
  27. data/examples/benchmarks/jacobi-1d-imper.c +55 -0
  28. data/examples/benchmarks/jacobi-2d-imper.c +61 -0
  29. data/examples/benchmarks/lu.c +57 -0
  30. data/examples/benchmarks/ludcmp.c +91 -0
  31. data/examples/benchmarks/mvt.c +65 -0
  32. data/examples/benchmarks/overview.txt +38 -0
  33. data/examples/benchmarks/reg_detect.c +82 -0
  34. data/examples/benchmarks/saxpy.c +45 -0
  35. data/examples/benchmarks/seidel-2d.c +51 -0
  36. data/examples/benchmarks/symm.c +74 -0
  37. data/examples/benchmarks/syr2k.c +65 -0
  38. data/examples/benchmarks/syrk.c +62 -0
  39. data/examples/benchmarks/trisolv.c +57 -0
  40. data/examples/benchmarks/trmm.c +57 -0
  41. data/examples/chunk/example1.c +54 -0
  42. data/examples/chunk/example2.c +44 -0
  43. data/examples/chunk/example3.c +59 -0
  44. data/examples/chunk/example4.c +55 -0
  45. data/examples/chunk/example5.c +52 -0
  46. data/examples/element/example1.c +46 -0
  47. data/examples/element/example10.c +50 -0
  48. data/examples/element/example11.c +47 -0
  49. data/examples/element/example12.c +56 -0
  50. data/examples/element/example2.c +46 -0
  51. data/examples/element/example3.c +58 -0
  52. data/examples/element/example4.c +49 -0
  53. data/examples/element/example5.c +56 -0
  54. data/examples/element/example6.c +46 -0
  55. data/examples/element/example7.c +54 -0
  56. data/examples/element/example8.c +45 -0
  57. data/examples/element/example9.c +48 -0
  58. data/examples/neighbourhood/example1.c +54 -0
  59. data/examples/neighbourhood/example2.c +55 -0
  60. data/examples/neighbourhood/example3.c +82 -0
  61. data/examples/neighbourhood/example4.c +52 -0
  62. data/examples/shared/example1.c +45 -0
  63. data/examples/shared/example2.c +51 -0
  64. data/examples/shared/example3.c +55 -0
  65. data/examples/shared/example4.c +52 -0
  66. data/examples/shared/example5.c +48 -0
  67. data/lib/bones.rb +266 -0
  68. data/lib/bones/algorithm.rb +541 -0
  69. data/lib/bones/engine.rb +386 -0
  70. data/lib/bones/preprocessor.rb +161 -0
  71. data/lib/bones/species.rb +196 -0
  72. data/lib/bones/structure.rb +94 -0
  73. data/lib/bones/variable.rb +169 -0
  74. data/lib/bones/variablelist.rb +72 -0
  75. data/lib/castaddon.rb +27 -0
  76. data/lib/castaddon/index.rb +40 -0
  77. data/lib/castaddon/node.rb +753 -0
  78. data/lib/castaddon/type.rb +37 -0
  79. data/skeletons/CPU-C/common/epilogue.c +0 -0
  80. data/skeletons/CPU-C/common/globals.c +17 -0
  81. data/skeletons/CPU-C/common/globals_kernel.c +1 -0
  82. data/skeletons/CPU-C/common/header.c +0 -0
  83. data/skeletons/CPU-C/common/mem_copy_D2H.c +0 -0
  84. data/skeletons/CPU-C/common/mem_copy_H2D.c +0 -0
  85. data/skeletons/CPU-C/common/mem_epilogue.c +0 -0
  86. data/skeletons/CPU-C/common/mem_prologue.c +3 -0
  87. data/skeletons/CPU-C/common/prologue.c +0 -0
  88. data/skeletons/CPU-C/common/timer_1_start.c +0 -0
  89. data/skeletons/CPU-C/common/timer_1_stop.c +0 -0
  90. data/skeletons/CPU-C/common/timer_2_start.c +20 -0
  91. data/skeletons/CPU-C/common/timer_2_stop.c +8 -0
  92. data/skeletons/CPU-C/kernel/default.host.c +3 -0
  93. data/skeletons/CPU-C/kernel/default.kernel.c +15 -0
  94. data/skeletons/CPU-C/skeletons.txt +24 -0
  95. data/skeletons/CPU-OPENCL-AMD/common/epilogue.c +6 -0
  96. data/skeletons/CPU-OPENCL-AMD/common/globals.c +155 -0
  97. data/skeletons/CPU-OPENCL-AMD/common/globals_kernel.c +4 -0
  98. data/skeletons/CPU-OPENCL-AMD/common/header.c +0 -0
  99. data/skeletons/CPU-OPENCL-AMD/common/mem_copy_D2H.c +8 -0
  100. data/skeletons/CPU-OPENCL-AMD/common/mem_copy_H2D.c +4 -0
  101. data/skeletons/CPU-OPENCL-AMD/common/mem_epilogue.c +3 -0
  102. data/skeletons/CPU-OPENCL-AMD/common/mem_prologue.c +6 -0
  103. data/skeletons/CPU-OPENCL-AMD/common/prologue.c +24 -0
  104. data/skeletons/CPU-OPENCL-AMD/common/timer_1_start.c +5 -0
  105. data/skeletons/CPU-OPENCL-AMD/common/timer_1_stop.c +9 -0
  106. data/skeletons/CPU-OPENCL-AMD/common/timer_2_start.c +16 -0
  107. data/skeletons/CPU-OPENCL-AMD/common/timer_2_stop.c +11 -0
  108. data/skeletons/CPU-OPENCL-AMD/kernel/D-element-to-1-shared.host.c +67 -0
  109. data/skeletons/CPU-OPENCL-AMD/kernel/D-element-to-1-shared.kernel.cl +72 -0
  110. data/skeletons/CPU-OPENCL-AMD/kernel/default.host.c +14 -0
  111. data/skeletons/CPU-OPENCL-AMD/kernel/default.kernel.cl +13 -0
  112. data/skeletons/CPU-OPENCL-AMD/skeletons.txt +26 -0
  113. data/skeletons/CPU-OPENCL-INTEL/common/epilogue.c +3 -0
  114. data/skeletons/CPU-OPENCL-INTEL/common/globals.c +154 -0
  115. data/skeletons/CPU-OPENCL-INTEL/common/globals_kernel.c +4 -0
  116. data/skeletons/CPU-OPENCL-INTEL/common/header.c +31 -0
  117. data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_D2H.c +5 -0
  118. data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_H2D.c +3 -0
  119. data/skeletons/CPU-OPENCL-INTEL/common/mem_epilogue.c +3 -0
  120. data/skeletons/CPU-OPENCL-INTEL/common/mem_prologue.c +4 -0
  121. data/skeletons/CPU-OPENCL-INTEL/common/prologue.c +24 -0
  122. data/skeletons/CPU-OPENCL-INTEL/common/timer_1_start.c +5 -0
  123. data/skeletons/CPU-OPENCL-INTEL/common/timer_1_stop.c +9 -0
  124. data/skeletons/CPU-OPENCL-INTEL/common/timer_2_start.c +16 -0
  125. data/skeletons/CPU-OPENCL-INTEL/common/timer_2_stop.c +11 -0
  126. data/skeletons/CPU-OPENCL-INTEL/kernel/D-element-to-1-shared.host.c +67 -0
  127. data/skeletons/CPU-OPENCL-INTEL/kernel/D-element-to-1-shared.kernel.cl +72 -0
  128. data/skeletons/CPU-OPENCL-INTEL/kernel/default.host.c +14 -0
  129. data/skeletons/CPU-OPENCL-INTEL/kernel/default.kernel.cl +13 -0
  130. data/skeletons/CPU-OPENCL-INTEL/skeletons.txt +26 -0
  131. data/skeletons/CPU-OPENMP/common/epilogue.c +0 -0
  132. data/skeletons/CPU-OPENMP/common/globals.c +37 -0
  133. data/skeletons/CPU-OPENMP/common/globals_kernel.c +6 -0
  134. data/skeletons/CPU-OPENMP/common/header.c +0 -0
  135. data/skeletons/CPU-OPENMP/common/mem_copy_D2H.c +0 -0
  136. data/skeletons/CPU-OPENMP/common/mem_copy_H2D.c +0 -0
  137. data/skeletons/CPU-OPENMP/common/mem_epilogue.c +0 -0
  138. data/skeletons/CPU-OPENMP/common/mem_prologue.c +3 -0
  139. data/skeletons/CPU-OPENMP/common/prologue.c +0 -0
  140. data/skeletons/CPU-OPENMP/common/timer_1_start.c +12 -0
  141. data/skeletons/CPU-OPENMP/common/timer_1_stop.c +0 -0
  142. data/skeletons/CPU-OPENMP/common/timer_2_start.c +18 -0
  143. data/skeletons/CPU-OPENMP/common/timer_2_stop.c +8 -0
  144. data/skeletons/CPU-OPENMP/kernel/D-element-to-1-shared.host.c +27 -0
  145. data/skeletons/CPU-OPENMP/kernel/D-element-to-1-shared.kernel.c +46 -0
  146. data/skeletons/CPU-OPENMP/kernel/default.host.c +11 -0
  147. data/skeletons/CPU-OPENMP/kernel/default.kernel.c +18 -0
  148. data/skeletons/CPU-OPENMP/skeletons.txt +26 -0
  149. data/skeletons/GPU-CUDA/common/epilogue.c +0 -0
  150. data/skeletons/GPU-CUDA/common/globals.c +31 -0
  151. data/skeletons/GPU-CUDA/common/globals_kernel.c +4 -0
  152. data/skeletons/GPU-CUDA/common/header.c +0 -0
  153. data/skeletons/GPU-CUDA/common/mem_copy_D2H.c +3 -0
  154. data/skeletons/GPU-CUDA/common/mem_copy_H2D.c +3 -0
  155. data/skeletons/GPU-CUDA/common/mem_epilogue.c +3 -0
  156. data/skeletons/GPU-CUDA/common/mem_prologue.c +5 -0
  157. data/skeletons/GPU-CUDA/common/prologue.c +6 -0
  158. data/skeletons/GPU-CUDA/common/timer_1_start.c +6 -0
  159. data/skeletons/GPU-CUDA/common/timer_1_stop.c +10 -0
  160. data/skeletons/GPU-CUDA/common/timer_2_start.c +6 -0
  161. data/skeletons/GPU-CUDA/common/timer_2_stop.c +10 -0
  162. data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.host.c +3 -0
  163. data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.kernel.cu +105 -0
  164. data/skeletons/GPU-CUDA/kernel/D-element-to-1-shared.host.c +3 -0
  165. data/skeletons/GPU-CUDA/kernel/D-element-to-1-shared.kernel.cu +119 -0
  166. data/skeletons/GPU-CUDA/kernel/D-element-to-N-shared.host.c +3 -0
  167. data/skeletons/GPU-CUDA/kernel/D-element-to-N-shared.kernel.cu +166 -0
  168. data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.host.c +3 -0
  169. data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.kernel.cu +69 -0
  170. data/skeletons/GPU-CUDA/kernel/N-neighbourhood-N-to-N-element.host.c +3 -0
  171. data/skeletons/GPU-CUDA/kernel/N-neighbourhood-N-to-N-element.kernel.cu +42 -0
  172. data/skeletons/GPU-CUDA/kernel/default.host.c +3 -0
  173. data/skeletons/GPU-CUDA/kernel/default.kernel.cu +28 -0
  174. data/skeletons/GPU-CUDA/skeletons.txt +30 -0
  175. data/skeletons/GPU-OPENCL-AMD/common/epilogue.c +3 -0
  176. data/skeletons/GPU-OPENCL-AMD/common/globals.c +155 -0
  177. data/skeletons/GPU-OPENCL-AMD/common/globals_kernel.c +4 -0
  178. data/skeletons/GPU-OPENCL-AMD/common/header.c +0 -0
  179. data/skeletons/GPU-OPENCL-AMD/common/mem_copy_D2H.c +4 -0
  180. data/skeletons/GPU-OPENCL-AMD/common/mem_copy_H2D.c +4 -0
  181. data/skeletons/GPU-OPENCL-AMD/common/mem_epilogue.c +3 -0
  182. data/skeletons/GPU-OPENCL-AMD/common/mem_prologue.c +3 -0
  183. data/skeletons/GPU-OPENCL-AMD/common/prologue.c +24 -0
  184. data/skeletons/GPU-OPENCL-AMD/common/timer_1_start.c +5 -0
  185. data/skeletons/GPU-OPENCL-AMD/common/timer_1_stop.c +9 -0
  186. data/skeletons/GPU-OPENCL-AMD/common/timer_2_start.c +4 -0
  187. data/skeletons/GPU-OPENCL-AMD/common/timer_2_stop.c +11 -0
  188. data/skeletons/GPU-OPENCL-AMD/kernel/D-element-to-1-shared.host.c +67 -0
  189. data/skeletons/GPU-OPENCL-AMD/kernel/D-element-to-1-shared.kernel.cl +72 -0
  190. data/skeletons/GPU-OPENCL-AMD/kernel/default.host.c +14 -0
  191. data/skeletons/GPU-OPENCL-AMD/kernel/default.kernel.cl +13 -0
  192. data/skeletons/GPU-OPENCL-AMD/skeletons.txt +26 -0
  193. data/skeletons/verification/header.c +2 -0
  194. data/skeletons/verification/timer_start.c +4 -0
  195. data/skeletons/verification/timer_stop.c +6 -0
  196. data/skeletons/verification/verify_results.c +23 -0
  197. data/test/bones/test_algorithm.rb +40 -0
  198. data/test/bones/test_common.rb +54 -0
  199. data/test/bones/test_preprocessor.rb +46 -0
  200. data/test/bones/test_species.rb +21 -0
  201. data/test/bones/test_variable.rb +84 -0
  202. data/test/test_helper.rb +106 -0
  203. metadata +303 -0
@@ -0,0 +1,6 @@
// Skeleton fragment: GPU-CUDA common prologue, pasted at the start of the
// generated host function. <parallelism> is substituted by the Bones compiler
// with the total amount of parallel work.

// Set the cache size to maximal
// (prefer a larger L1 cache over shared memory for the generated kernels)
cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);

// Stop execution directly if there is no work to do
if (<parallelism> <= 0) { return; }
@@ -0,0 +1,6 @@
// Skeleton fragment: start the timer for the measurement of the kernel and
// memory copy execution time. Creates the CUDA event 'bones_start1' which is
// consumed (and released) by the matching timer_1_stop fragment.

// Start the timer for the measurement of the kernel and memory copy execution time
// (cudaThreadSynchronize is deprecated; cudaDeviceSynchronize is the replacement)
cudaDeviceSynchronize();
cudaEvent_t bones_start1;
cudaEventCreate(&bones_start1);
cudaEventRecord(bones_start1,0);
@@ -0,0 +1,10 @@
// Skeleton fragment: stop the timer started by timer_1_start and print the
// elapsed time. Uses 'bones_start1' created by the matching start fragment.

// End the timer for the measurement of the kernel and memory copy execution time
// (cudaThreadSynchronize is deprecated; cudaDeviceSynchronize is the replacement)
cudaDeviceSynchronize();
cudaEvent_t bones_stop1;
cudaEventCreate(&bones_stop1);
cudaEventRecord(bones_stop1,0);
cudaEventSynchronize(bones_stop1);
float bones_timer1 = 0;
cudaEventElapsedTime(&bones_timer1,bones_start1,bones_stop1);
printf(">>>\t\t (<algorithm_basename>): Execution time [kernel+memcpy]: %.3lf ms \n", bones_timer1);

// Release both timing events (they were previously leaked)
cudaEventDestroy(bones_start1);
cudaEventDestroy(bones_stop1);
@@ -0,0 +1,6 @@
// Skeleton fragment: start the timer for the measurement of the kernel-only
// execution time. Creates the CUDA event 'bones_start2' which is consumed
// (and released) by the matching timer_2_stop fragment.

// Start the timer for the measurement of the kernel execution time
// (cudaThreadSynchronize is deprecated; cudaDeviceSynchronize is the replacement)
cudaDeviceSynchronize();
cudaEvent_t bones_start2;
cudaEventCreate(&bones_start2);
cudaEventRecord(bones_start2,0);
@@ -0,0 +1,10 @@
// Skeleton fragment: stop the timer started by timer_2_start and print the
// elapsed time. Uses 'bones_start2' created by the matching start fragment.

// Stop the timer for the measurement of the kernel execution time
// (cudaThreadSynchronize is deprecated; cudaDeviceSynchronize is the replacement)
cudaDeviceSynchronize();
cudaEvent_t bones_stop2;
cudaEventCreate(&bones_stop2);
cudaEventRecord(bones_stop2,0);
cudaEventSynchronize(bones_stop2);
float bones_timer2 = 0;
cudaEventElapsedTime(&bones_timer2,bones_start2,bones_stop2);
printf(">>>\t\t (<algorithm_basename>): Execution time [kernel ]: %.3lf ms \n", bones_timer2);

// Release both timing events (they were previously leaked)
cudaEventDestroy(bones_start2);
cudaEventDestroy(bones_stop2);
@@ -0,0 +1,3 @@
// Skeleton fragment: host-side entry point, pasted into the generated code.
// Calls the launcher emitted by the corresponding kernel skeleton;
// <devicenames> and <argument_name> are substituted by the Bones compiler.

// Start the CUDA function
bones_prekernel_<algorithm_name>_0(<devicenames>, <argument_name>);
@@ -0,0 +1,105 @@
/* STARTDEF
void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>);
ENDDEF */
// Tile dimensions for the pre-shuffle (transpose) kernels below: 16x16 threadblocks.
#define SHUFFLE_X 16
#define SHUFFLE_Y 16

// Start of the <algorithm_name> kernel
// Main computation kernel: one thread per unit of parallel work. The Bones
// compiler substitutes <ids> with the index computations and
// <algorithm_code1> with the user's loop body.
__global__ void bones_kernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
  // Flat 1D global thread identifier
  const int bones_global_id = blockIdx.x*blockDim.x + threadIdx.x;
  // Guard against the padding threads of the last threadblock
  if (bones_global_id < (<parallelism>)) {

    // Calculate the global ID(s) based on the thread id
    <ids>

    // Start the computation
    <algorithm_code1>
  }
}
19
+
// Start of the <algorithm_name> kernel (pre-kernel for shuffling) - for first input
// Transposes the first input array through a 16x16 shared-memory tile so that
// both the load and the store are coalesced. The array is treated as a 2D
// matrix of (<in0_dimensions>/<in0_parameters>) by (<in0_parameters>) elements.
__global__ void bones_kernel_<algorithm_name>_1(<in0_type><in0_devicepointer> <in0_name>, <in0_type><in0_devicepointer> shuffled_<in0_name>, <argument_definition>) {
  const int bones_global_id_0 = blockIdx.x*blockDim.x + threadIdx.x;
  const int bones_global_id_1 = blockIdx.y*blockDim.y + threadIdx.y;

  // Set-up the local memory for shuffling
  __shared__ <in0_type> buffer[SHUFFLE_X][SHUFFLE_Y];

  // Swap the x and y coordinates to perform the rotation (coalesced)
  if (bones_global_id_0 < ((<in0_dimensions>)/(<in0_parameters>)) && bones_global_id_1 < (<in0_parameters>)) {
    buffer[threadIdx.y][threadIdx.x] = <in0_name>[bones_global_id_0 + bones_global_id_1 * ((<in0_dimensions>)/(<in0_parameters>))];
  }

  // Synchronize all threads in the threadblock
  // (the tile must be completely filled before it is read back transposed)
  __syncthreads();

  // We don't have to swap the x and y thread indices here, because that's already done in the local memory
  const int bones_global_id_0_new = blockIdx.y*blockDim.y + threadIdx.x;
  const int bones_global_id_1_new = blockIdx.x*blockDim.x + threadIdx.y;

  // Store the shuffled result (coalesced)
  // NOTE(review): the guard compares _0_new against rows and _1_new against
  // columns exactly like the load guard — confirm this is the intended bound
  // pair for the transposed store (the tile itself is square, 16x16).
  if (bones_global_id_0_new < ((<in0_dimensions>)/(<in0_parameters>)) && bones_global_id_1_new < (<in0_parameters>)) {
    shuffled_<in0_name>[bones_global_id_0_new + bones_global_id_1_new * <in0_parameters>] = buffer[threadIdx.x][threadIdx.y];
  }
}
45
+
// Start of the <algorithm_name> kernel (pre-kernel for shuffling) - for second input
// Identical to the first pre-shuffle kernel, but instantiated for the second
// input array (<in1_name>). Transposes through a 16x16 shared-memory tile so
// that both the load and the store are coalesced.
__global__ void bones_kernel_<algorithm_name>_2(<in1_type><in1_devicepointer> <in1_name>, <in1_type><in1_devicepointer> shuffled_<in1_name>, <argument_definition>) {
  const int bones_global_id_0 = blockIdx.x*blockDim.x + threadIdx.x;
  const int bones_global_id_1 = blockIdx.y*blockDim.y + threadIdx.y;

  // Set-up the local memory for shuffling
  __shared__ <in1_type> buffer[SHUFFLE_X][SHUFFLE_Y];

  // Swap the x and y coordinates to perform the rotation (coalesced)
  if (bones_global_id_0 < ((<in1_dimensions>)/(<in1_parameters>)) && bones_global_id_1 < (<in1_parameters>)) {
    buffer[threadIdx.y][threadIdx.x] = <in1_name>[bones_global_id_0 + bones_global_id_1 * ((<in1_dimensions>)/(<in1_parameters>))];
  }

  // Synchronize all threads in the threadblock
  // (the tile must be completely filled before it is read back transposed)
  __syncthreads();

  // We don't have to swap the x and y thread indices here, because that's already done in the local memory
  const int bones_global_id_0_new = blockIdx.y*blockDim.y + threadIdx.x;
  const int bones_global_id_1_new = blockIdx.x*blockDim.x + threadIdx.y;

  // Store the shuffled result (coalesced)
  if (bones_global_id_0_new < ((<in1_dimensions>)/(<in1_parameters>)) && bones_global_id_1_new < (<in1_parameters>)) {
    shuffled_<in1_name>[bones_global_id_0_new + bones_global_id_1_new * <in1_parameters>] = buffer[threadIdx.x][threadIdx.y];
  }
}
71
+
// Function to start the kernel
// Host-side launcher: pre-shuffles (transposes) both input arrays for
// coalesced access, then launches the main computation kernel. The original
// device input pointers are freed and replaced by the shuffled copies; the
// shuffled buffers are intentionally left allocated because the (asynchronous)
// main kernel still reads them — ownership passes to the caller via <names>.
extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
  // Pick a threadblock size proportional to the amount of parallelism,
  // between 32 and 512 threads
  int bones_block_size;
  if (<parallelism> >= 64*512) { bones_block_size = 512;}
  else if (<parallelism> >= 64*256) { bones_block_size = 256;}
  else if (<parallelism> >= 64*128) { bones_block_size = 128;}
  else if (<parallelism> >= 64*64 ) { bones_block_size = 64; }
  else { bones_block_size = 32; }

  // First perform some pre-shuffling (for the first input)
  <in0_type>* shuffled_<in0_name> = 0;
  cudaMalloc((void**)&shuffled_<in0_name>, <in0_dimensions>*sizeof(<in0_type>));
  dim3 bones_threads1(SHUFFLE_X,SHUFFLE_Y);
  dim3 bones_grid1(DIV_CEIL(((<in0_dimensions>)/(<in0_parameters>)),SHUFFLE_X),DIV_CEIL(<in0_parameters>,SHUFFLE_Y));
  bones_kernel_<algorithm_name>_1<<< bones_grid1, bones_threads1 >>>(<in0_name>, shuffled_<in0_name>, <argument_name>);
  <in0_type>* temp_<in0_name> = <in0_name>;
  <in0_name> = shuffled_<in0_name>;
  cudaFree(temp_<in0_name>);

  // Then perform some pre-shuffling (for the second input)
  // (fixed: the shuffled buffer was declared with <in0_type> instead of
  //  <in1_type>, breaking the build whenever the two input types differ)
  <in1_type>* shuffled_<in1_name> = 0;
  cudaMalloc((void**)&shuffled_<in1_name>, <in1_dimensions>*sizeof(<in1_type>));
  dim3 bones_threads2(SHUFFLE_X,SHUFFLE_Y);
  dim3 bones_grid2(DIV_CEIL(((<in1_dimensions>)/(<in1_parameters>)),SHUFFLE_X),DIV_CEIL(<in1_parameters>,SHUFFLE_Y));
  bones_kernel_<algorithm_name>_2<<< bones_grid2, bones_threads2 >>>(<in1_name>, shuffled_<in1_name>, <argument_name>);
  <in1_type>* temp_<in1_name> = <in1_name>;
  <in1_name> = shuffled_<in1_name>;
  cudaFree(temp_<in1_name>);

  // Then run the original kernel
  dim3 bones_threads0(bones_block_size);
  dim3 bones_grid0(DIV_CEIL(<parallelism>,bones_block_size));
  bones_kernel_<algorithm_name>_0<<< bones_grid0, bones_threads0 >>>(<names>, <argument_name>);
}
@@ -0,0 +1,3 @@
// Skeleton fragment: host-side entry point, pasted into the generated code.
// Calls the launcher emitted by the corresponding kernel skeleton;
// <devicenames> and <argument_name> are substituted by the Bones compiler.

// Start the CUDA function
bones_prekernel_<algorithm_name>_0(<devicenames>, <argument_name>);
@@ -0,0 +1,119 @@
/* STARTDEF
void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>);
ENDDEF */
// Start of the <algorithm_name> kernel (main, not unrolled kernel)
// First reduction stage: each thread first reduces several input elements
// sequentially into private memory, then the threadblock reduces those
// partial results through shared memory with a tree, writing one value per
// threadblock to <out0_name>. <algorithm_code1>/<algorithm_code2>/
// <algorithm_code3> are the user's combine expressions, substituted by the
// Bones compiler (they reference bones_temporary, bones_private_memory,
// bones_local_memory, bones_local_id and bones_offset_id by name).
__global__ void bones_kernel_<algorithm_name>_0(int bones_input_size, <in0_type><in0_devicepointer> <in0_name>, <out0_type><out0_devicepointer> <out0_name>, <argument_definition>) {
  // Work distribution: elements per block, threads that carry work, and
  // sequential iterations per thread
  const int bones_threadblock_work = DIV_CEIL(bones_input_size,gridDim.x);
  const int bones_parallel_work = BONES_MIN(blockDim.x,bones_threadblock_work);
  const int bones_sequential_work = DIV_CEIL(bones_threadblock_work,bones_parallel_work);
  const int bones_local_id = threadIdx.x;
  const int bones_global_id = blockIdx.x*bones_parallel_work + threadIdx.x;
  <ids>
  int bones_iter_id = <in0_flatindex>;

  // Load data into thread private memory and perform the first computation(s) sequentially
  <in0_type> bones_temporary = <in0_name>[bones_iter_id];
  <in0_type> bones_private_memory = <algorithm_code3>;
  for(int c=1; c<bones_sequential_work; c++) {
    bones_iter_id = bones_iter_id + bones_parallel_work*gridDim.x<factors>;
    if (bones_iter_id <= <in0_to>) {
      bones_temporary = <in0_name>[bones_iter_id];
      bones_private_memory = <algorithm_code1>;
    }
  }

  // Initialize the local memory
  // ('volatile' prevents caching of shared-memory values in registers during
  //  the reduction tree; buffer is sized for the maximum of 512 threads)
  volatile __shared__ <in0_type> bones_local_memory[512];
  bones_local_memory[bones_local_id] = bones_private_memory;
  __syncthreads();

  // Perform the remainder of the computations in parallel using a parallel reduction tree
  // NOTE(review): the inner __syncthreads() below executes inside a divergent
  // branch (only threads with bones_offset_id < bones_parallel_work reach it),
  // which the CUDA programming model classifies as undefined behaviour —
  // confirm whether it can be removed or hoisted out of the conditional.
  int bones_offset_id;
  for (int c=512; c>=2; c=c>>1) {
    if ((2*bones_parallel_work > c) && (threadIdx.x < c/2)) {
      bones_offset_id = threadIdx.x+c/2;
      if (bones_offset_id < bones_parallel_work) {
        __syncthreads();
        bones_local_memory[bones_local_id] = <algorithm_code2>;
      }
    }
    __syncthreads();
  }

  // Write the final result back to the global memory
  if (threadIdx.x == 0) { <out0_name>[blockIdx.x] = bones_local_memory[0]; }
}
46
+
// Start of the <algorithm_name> kernel (secondary, not unrolled kernel)
// Second reduction stage: a single threadblock combines the per-block partial
// results of the first stage (128 values, see the launcher below) into one
// value at <out0_name>[0]. <algorithm_code2> is the user's combine expression.
__global__ void bones_kernel_<algorithm_name>_1(<in0_type><in0_devicepointer> <in0_name>, <out0_type><out0_devicepointer> <out0_name>, <argument_definition>) {
  const int bones_local_id = threadIdx.x;
  const int bones_global_id = threadIdx.x;

  // Initialize the local memory
  // ('volatile' prevents caching of shared-memory values in registers)
  volatile __shared__ <in0_type> bones_local_memory[512];
  bones_local_memory[bones_local_id] = <in0_name>[bones_global_id];
  __syncthreads();

  // Perform reduction using a parallel reduction tree
  // NOTE(review): __syncthreads() is called inside a divergent branch
  // (only threads with threadIdx.x < c/2 reach it), which the CUDA
  // programming model classifies as undefined behaviour — verify.
  int bones_offset_id;
  for (int c=128; c>=2; c=c>>1) {
    if (threadIdx.x < c/2) {
      bones_offset_id = threadIdx.x+c/2;
      bones_local_memory[bones_local_id] = <algorithm_code2>;
      __syncthreads();
    }
  }

  // Write the final result back to the global memory
  if (threadIdx.x == 0) { <out0_name>[0] = bones_local_memory[0]; }
}
70
+
// Start of the <algorithm_name> kernel (final, initial value kernel)
// Runs with a single thread: folds the saved initial value of the output into
// the reduced result using the user's combine expression <algorithm_code4>
// (which references bones_private_memory and bones_temporary by name).
__global__ void bones_kernel_<algorithm_name>_2(<out0_type><out0_devicepointer> bones_initial_value, <out0_type><out0_devicepointer> <out0_name>, <argument_definition>) {
  <out0_type> bones_private_memory = <out0_name>[0];
  <out0_type> bones_temporary = bones_initial_value[0];
  <out0_name>[0] = <algorithm_code4>;
}
77
+
// Function to start the kernel
// Host-side launcher for the reduction: saves the initial output value,
// reduces the input with one kernel (small inputs) or a two-kernel cascade
// (128 blocks of 512 threads, then one block of 128 threads), and finally
// folds the saved initial value back into the result.
extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {

  // Store the initial value
  // (copied device-to-device so the reduction can overwrite <out0_name>)
  <out0_type>* bones_initial_value = 0;
  cudaMalloc(&bones_initial_value, sizeof(<out0_type>));
  cudaMemcpy(bones_initial_value, <out0_name>, sizeof(<out0_type>), cudaMemcpyDeviceToDevice);

  // Run either one kernel or multiple kernels
  if (<in0_dimensions> <= 1024) {

    // Start only one kernel
    // (each thread handles two elements, so at most 512 threads are needed)
    const int bones_num_threads = DIV_CEIL(<in0_dimensions>,2);
    dim3 bones_threads(bones_num_threads);
    dim3 bones_grid(1);
    bones_kernel_<algorithm_name>_0<<< bones_grid, bones_threads >>>(<in0_dimensions>,<in0_name>,<out0_name>,<argument_name>);
  }
  else {

    // Allocate space for an intermediate array
    // (one partial result per threadblock of the first kernel)
    <out0_type>* bones_device_temp = 0;
    cudaMalloc(&bones_device_temp, 128*sizeof(<out0_type>));

    // Start the first kernel
    dim3 bones_threads1(512);
    dim3 bones_grid1(128);
    bones_kernel_<algorithm_name>_0<<< bones_grid1, bones_threads1 >>>(<in0_dimensions>,<in0_name>,bones_device_temp,<argument_name>);

    // Start the second kernel
    // (a single block combines the 128 partial results)
    dim3 bones_threads2(128);
    dim3 bones_grid2(1);
    bones_kernel_<algorithm_name>_1<<< bones_grid2, bones_threads2 >>>(bones_device_temp,<out0_name>,<argument_name>);

    cudaFree(bones_device_temp);
  }

  // Perform the last computation (only needed if there is an initial value)
  dim3 bones_threads3(1);
  dim3 bones_grid3(1);
  bones_kernel_<algorithm_name>_2<<< bones_grid3, bones_threads3 >>>(bones_initial_value,<out0_name>,<argument_name>);
  cudaFree(bones_initial_value);
}
@@ -0,0 +1,3 @@
// Skeleton fragment: host-side entry point, pasted into the generated code.
// Calls the launcher emitted by the corresponding kernel skeleton;
// <devicenames> and <argument_name> are substituted by the Bones compiler.

// Start the CUDA function
bones_prekernel_<algorithm_name>_0(<devicenames>, <argument_name>);
@@ -0,0 +1,166 @@
/* STARTDEF
void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>);
ENDDEF */

// Voting (histogram) kernel. Each threadblock builds a partial votespace in
// shared memory; to reduce atomicAdd contention every bin is replicated
// SCALE times (a compile-time template parameter) and the replicas are summed
// at the end. The votespace may additionally be split over blockIdx.y
// (bin ranges) and blockIdx.z (input ranges / sub-votespaces).
// The loops stride by 1024 because the launcher below always starts
// threadblocks of 1024 threads.
template<int SCALE>
__global__ void bones_kernel_<algorithm_name>_0(int *<in0_name>_index, <in0_type> *<in0_name>_value, int *<out0_name>, int const votecount)
{
  int nbins = <out0_dimension0_sum>;
  // Number of bins handled by this blockIdx.y partition, and its first bin
  int nbins_part = ceilf((float)nbins / gridDim.y);
  int part_offset = blockIdx.y * nbins_part;

  //init temp. vote line in shared memory
  extern __shared__ int votespace_line[];
  for(int i=threadIdx.x; i<nbins_part*SCALE; i+=1024)
    votespace_line[i] = 0;
  __syncthreads();

  // calculate start and stop index of input for sub-vote spaces
  int start_index = blockIdx.z *votecount/gridDim.z + threadIdx.x;
  int stop_index = min( (blockIdx.z+1)*votecount/gridDim.z , votecount);

  for(int i=start_index; i<stop_index; i+=1024)
  {
    //int arr_val_index = <in0_name>_index[i];
    <in0_type> arr_val_value = <in0_name>_value[i];
    // Map the low byte of the value onto a bin index
    // NOTE(review): assumes <in0_type> supports '& 0x00FF' (an integer type)
    // and that only the low 8 bits are significant — confirm.
    int vote_index = (int)((arr_val_value & 0x00FF) * (nbins / 256.0f));
    // Spread consecutive threads over the SCALE replicas of the bin
    vote_index = SCALE*vote_index + (threadIdx.x & (SCALE-1)) - part_offset;
    int vote_value = 1; // Vote value
    if(vote_index<(nbins_part*SCALE) && vote_index>=0)
      atomicAdd(&votespace_line[vote_index], vote_value);
  }
  __syncthreads();

  // Sum the SCALE replicas of each bin and write the partial votespace to
  // global memory (one votespace per (blockIdx.z, blockIdx.x) pair)
  for(int i=threadIdx.x; i<nbins_part; i+=1024)
  {
    int value=0;
    #pragma unroll
    for(int j=0; j<SCALE; j++)
      value += votespace_line[SCALE*i+j];

    <out0_name>[blockIdx.z*nbins*gridDim.x +
                blockIdx.x*nbins +
                blockIdx.y*nbins_part + i] = value;
  }
}
46
+
// Kernel that combines the per-block sub-votespaces into the final votespace:
// one thread per bin sums the same bin across 'num_subvotespaces' copies.
__global__ void bones_kernel_<algorithm_name>_1(int *in, int *out, int const num_subvotespaces, int const nbins)
{
  // Identify the thread (one thread per bin)
  int p = blockIdx.x*blockDim.x + threadIdx.x;
  // Fixed off-by-one: the old guard 'p > nbins' let thread p == nbins read
  // and write one element past the last bin of every votespace.
  if(p >= nbins)
    return;

  // Sum the sub-votespaces
  int result = 0;
  #pragma unroll
  for (int i=0;i<num_subvotespaces;i++) {
    result += in[blockIdx.y*num_subvotespaces*nbins + i*nbins + p];
  }

  // Write the results to off-chip memory
  out[blockIdx.y*nbins + p] = result;
}
64
+
// Function to start the kernel
// Host-side launcher for the voting kernel. Chooses a per-bin replication
// factor (SCALE) that fills the 8192-int shared-memory budget, optionally
// splits the votespace over several blocks (many bins) and/or the input over
// several sub-votespaces (few bins), then combines sub-votespaces if needed.
extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
  int * gpu_array_index = 0;
  <in0_type> *gpu_array_value = <in0_name>;
  int cpu_votecount = <in0_dimensions>;
  int *gpu_votespace = (int*)<out0_name>;
  int *gpu_temp = 0;

  int nbins = <out0_dimension0_sum>;
  // NOTE(review): hard-coded multiprocessor count — confirm it matches the
  // target GPU (a runtime query would be more portable).
  int number_multiprocessors = 14;
  int nbingroups = 1;

  int scaling=8192/nbins;
  int split_in_parts = 1;
  int subvotespaces = 1;
  int *gpu_out;

  // Calculate the scaling factor and round it down to a power of two,
  // clamped to the range [1,256]
  // (the old comment claimed an upper limit of 128, but the code and the
  //  template instantiations below use 256)
  if(scaling < 1) {
    // Too many bins requested: no scaling, but split the votespace instead
    scaling = 1;
    split_in_parts = ceil(nbins / 8192.0f);
  }
  else if (scaling > 256) {
    scaling = 256;
  }
  else {
    // Round down to the highest set bit (largest power of two <= scaling)
    int mask = 8192;
    while(0 == (mask & scaling))
      mask >>= 1;
    scaling = mask;
  }

  // If the grid would not fill the GPU, process the input in several
  // sub-votespaces that are combined afterwards. Fixed: the temporary buffer
  // is now only used when more than one sub-votespace is actually created —
  // previously, when subvotespaces computed to 1, the results were written to
  // the temporary buffer, never copied to the real votespace, and the buffer
  // leaked. The sub-votespace count is also clamped to the buffer capacity.
  if( (nbingroups*split_in_parts) < number_multiprocessors) {
    int const maxsub = ceil((float)(<in0_dimensions>) / (float)(32*250));
    subvotespaces = number_multiprocessors / (nbingroups*split_in_parts);
    if (subvotespaces > maxsub) { subvotespaces = maxsub; }  // buffer holds maxsub votespaces
    if (subvotespaces > 1) {
      cudaMalloc((void**)&gpu_temp, maxsub*nbingroups*nbins*sizeof(int));
    }
    if (gpu_temp != NULL) {
      gpu_out = gpu_temp;
    }
    else {
      // Allocation failed (or not needed): vote directly into the votespace
      subvotespaces = 1;
      gpu_out = gpu_votespace;
    }
  }
  else
  {
    gpu_out = gpu_votespace;
  }

  dim3 dimensionsBlock1(1024);
  dim3 dimensionsGrid1(nbingroups, split_in_parts, subvotespaces);
  int const nbins_part = ceilf((float)nbins / split_in_parts);

  // Launch the template instantiation matching the (compile-time) scaling
  // factor; shared memory holds SCALE replicas of this partition's bins
  switch(scaling) {
    case 256:
      bones_kernel_<algorithm_name>_0<256><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
        (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
      break;
    case 128:
      bones_kernel_<algorithm_name>_0<128><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
        (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
      break;
    case 64:
      bones_kernel_<algorithm_name>_0< 64><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
        (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
      break;
    case 32:
      bones_kernel_<algorithm_name>_0< 32><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
        (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
      break;
    case 16:
      bones_kernel_<algorithm_name>_0< 16><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
        (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
      break;
    case 8:
      bones_kernel_<algorithm_name>_0<  8><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
        (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
      break;
    case 4:
      bones_kernel_<algorithm_name>_0<  4><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
        (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
      break;
    case 2:
      bones_kernel_<algorithm_name>_0<  2><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
        (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
      break;
    default:
      bones_kernel_<algorithm_name>_0<  1><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
        (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
      break;
  }

  // Combine the sub-votespaces into the final votespace and release the
  // temporary buffer
  if(subvotespaces > 1) {
    dim3 dimensionsBlock2(min(nbins,1024));
    dim3 dimensionsGrid2(ceil((float)nbins/(float)1024), nbingroups);
    bones_kernel_<algorithm_name>_1<<<dimensionsGrid2, dimensionsBlock2>>>(gpu_out, gpu_votespace, subvotespaces, nbins);
    cudaFree(gpu_temp);
  }
}