bones-compiler 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (203)
  1. data/CHANGELOG +117 -0
  2. data/LICENSE +9 -0
  3. data/README.rdoc +126 -0
  4. data/Rakefile +107 -0
  5. data/VERSION +1 -0
  6. data/bin/bones +20 -0
  7. data/examples/applications/ffos.c +552 -0
  8. data/examples/benchmarks/2mm.c +70 -0
  9. data/examples/benchmarks/3mm.c +81 -0
  10. data/examples/benchmarks/adi.c +81 -0
  11. data/examples/benchmarks/atax.c +65 -0
  12. data/examples/benchmarks/bicg.c +67 -0
  13. data/examples/benchmarks/cholesky.c +64 -0
  14. data/examples/benchmarks/common.h +168 -0
  15. data/examples/benchmarks/correlation.c +97 -0
  16. data/examples/benchmarks/covariance.c +77 -0
  17. data/examples/benchmarks/doitgen.c +63 -0
  18. data/examples/benchmarks/durbin.c +76 -0
  19. data/examples/benchmarks/dynprog.c +67 -0
  20. data/examples/benchmarks/fdtd-2d-apml.c +114 -0
  21. data/examples/benchmarks/fdtd-2d.c +74 -0
  22. data/examples/benchmarks/floyd-warshall.c +50 -0
  23. data/examples/benchmarks/gemm.c +69 -0
  24. data/examples/benchmarks/gemver.c +89 -0
  25. data/examples/benchmarks/gesummv.c +64 -0
  26. data/examples/benchmarks/gramschmidt.c +84 -0
  27. data/examples/benchmarks/jacobi-1d-imper.c +55 -0
  28. data/examples/benchmarks/jacobi-2d-imper.c +61 -0
  29. data/examples/benchmarks/lu.c +57 -0
  30. data/examples/benchmarks/ludcmp.c +91 -0
  31. data/examples/benchmarks/mvt.c +65 -0
  32. data/examples/benchmarks/overview.txt +38 -0
  33. data/examples/benchmarks/reg_detect.c +82 -0
  34. data/examples/benchmarks/saxpy.c +45 -0
  35. data/examples/benchmarks/seidel-2d.c +51 -0
  36. data/examples/benchmarks/symm.c +74 -0
  37. data/examples/benchmarks/syr2k.c +65 -0
  38. data/examples/benchmarks/syrk.c +62 -0
  39. data/examples/benchmarks/trisolv.c +57 -0
  40. data/examples/benchmarks/trmm.c +57 -0
  41. data/examples/chunk/example1.c +54 -0
  42. data/examples/chunk/example2.c +44 -0
  43. data/examples/chunk/example3.c +59 -0
  44. data/examples/chunk/example4.c +55 -0
  45. data/examples/chunk/example5.c +52 -0
  46. data/examples/element/example1.c +46 -0
  47. data/examples/element/example10.c +50 -0
  48. data/examples/element/example11.c +47 -0
  49. data/examples/element/example12.c +56 -0
  50. data/examples/element/example2.c +46 -0
  51. data/examples/element/example3.c +58 -0
  52. data/examples/element/example4.c +49 -0
  53. data/examples/element/example5.c +56 -0
  54. data/examples/element/example6.c +46 -0
  55. data/examples/element/example7.c +54 -0
  56. data/examples/element/example8.c +45 -0
  57. data/examples/element/example9.c +48 -0
  58. data/examples/neighbourhood/example1.c +54 -0
  59. data/examples/neighbourhood/example2.c +55 -0
  60. data/examples/neighbourhood/example3.c +82 -0
  61. data/examples/neighbourhood/example4.c +52 -0
  62. data/examples/shared/example1.c +45 -0
  63. data/examples/shared/example2.c +51 -0
  64. data/examples/shared/example3.c +55 -0
  65. data/examples/shared/example4.c +52 -0
  66. data/examples/shared/example5.c +48 -0
  67. data/lib/bones.rb +266 -0
  68. data/lib/bones/algorithm.rb +541 -0
  69. data/lib/bones/engine.rb +386 -0
  70. data/lib/bones/preprocessor.rb +161 -0
  71. data/lib/bones/species.rb +196 -0
  72. data/lib/bones/structure.rb +94 -0
  73. data/lib/bones/variable.rb +169 -0
  74. data/lib/bones/variablelist.rb +72 -0
  75. data/lib/castaddon.rb +27 -0
  76. data/lib/castaddon/index.rb +40 -0
  77. data/lib/castaddon/node.rb +753 -0
  78. data/lib/castaddon/type.rb +37 -0
  79. data/skeletons/CPU-C/common/epilogue.c +0 -0
  80. data/skeletons/CPU-C/common/globals.c +17 -0
  81. data/skeletons/CPU-C/common/globals_kernel.c +1 -0
  82. data/skeletons/CPU-C/common/header.c +0 -0
  83. data/skeletons/CPU-C/common/mem_copy_D2H.c +0 -0
  84. data/skeletons/CPU-C/common/mem_copy_H2D.c +0 -0
  85. data/skeletons/CPU-C/common/mem_epilogue.c +0 -0
  86. data/skeletons/CPU-C/common/mem_prologue.c +3 -0
  87. data/skeletons/CPU-C/common/prologue.c +0 -0
  88. data/skeletons/CPU-C/common/timer_1_start.c +0 -0
  89. data/skeletons/CPU-C/common/timer_1_stop.c +0 -0
  90. data/skeletons/CPU-C/common/timer_2_start.c +20 -0
  91. data/skeletons/CPU-C/common/timer_2_stop.c +8 -0
  92. data/skeletons/CPU-C/kernel/default.host.c +3 -0
  93. data/skeletons/CPU-C/kernel/default.kernel.c +15 -0
  94. data/skeletons/CPU-C/skeletons.txt +24 -0
  95. data/skeletons/CPU-OPENCL-AMD/common/epilogue.c +6 -0
  96. data/skeletons/CPU-OPENCL-AMD/common/globals.c +155 -0
  97. data/skeletons/CPU-OPENCL-AMD/common/globals_kernel.c +4 -0
  98. data/skeletons/CPU-OPENCL-AMD/common/header.c +0 -0
  99. data/skeletons/CPU-OPENCL-AMD/common/mem_copy_D2H.c +8 -0
  100. data/skeletons/CPU-OPENCL-AMD/common/mem_copy_H2D.c +4 -0
  101. data/skeletons/CPU-OPENCL-AMD/common/mem_epilogue.c +3 -0
  102. data/skeletons/CPU-OPENCL-AMD/common/mem_prologue.c +6 -0
  103. data/skeletons/CPU-OPENCL-AMD/common/prologue.c +24 -0
  104. data/skeletons/CPU-OPENCL-AMD/common/timer_1_start.c +5 -0
  105. data/skeletons/CPU-OPENCL-AMD/common/timer_1_stop.c +9 -0
  106. data/skeletons/CPU-OPENCL-AMD/common/timer_2_start.c +16 -0
  107. data/skeletons/CPU-OPENCL-AMD/common/timer_2_stop.c +11 -0
  108. data/skeletons/CPU-OPENCL-AMD/kernel/D-element-to-1-shared.host.c +67 -0
  109. data/skeletons/CPU-OPENCL-AMD/kernel/D-element-to-1-shared.kernel.cl +72 -0
  110. data/skeletons/CPU-OPENCL-AMD/kernel/default.host.c +14 -0
  111. data/skeletons/CPU-OPENCL-AMD/kernel/default.kernel.cl +13 -0
  112. data/skeletons/CPU-OPENCL-AMD/skeletons.txt +26 -0
  113. data/skeletons/CPU-OPENCL-INTEL/common/epilogue.c +3 -0
  114. data/skeletons/CPU-OPENCL-INTEL/common/globals.c +154 -0
  115. data/skeletons/CPU-OPENCL-INTEL/common/globals_kernel.c +4 -0
  116. data/skeletons/CPU-OPENCL-INTEL/common/header.c +31 -0
  117. data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_D2H.c +5 -0
  118. data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_H2D.c +3 -0
  119. data/skeletons/CPU-OPENCL-INTEL/common/mem_epilogue.c +3 -0
  120. data/skeletons/CPU-OPENCL-INTEL/common/mem_prologue.c +4 -0
  121. data/skeletons/CPU-OPENCL-INTEL/common/prologue.c +24 -0
  122. data/skeletons/CPU-OPENCL-INTEL/common/timer_1_start.c +5 -0
  123. data/skeletons/CPU-OPENCL-INTEL/common/timer_1_stop.c +9 -0
  124. data/skeletons/CPU-OPENCL-INTEL/common/timer_2_start.c +16 -0
  125. data/skeletons/CPU-OPENCL-INTEL/common/timer_2_stop.c +11 -0
  126. data/skeletons/CPU-OPENCL-INTEL/kernel/D-element-to-1-shared.host.c +67 -0
  127. data/skeletons/CPU-OPENCL-INTEL/kernel/D-element-to-1-shared.kernel.cl +72 -0
  128. data/skeletons/CPU-OPENCL-INTEL/kernel/default.host.c +14 -0
  129. data/skeletons/CPU-OPENCL-INTEL/kernel/default.kernel.cl +13 -0
  130. data/skeletons/CPU-OPENCL-INTEL/skeletons.txt +26 -0
  131. data/skeletons/CPU-OPENMP/common/epilogue.c +0 -0
  132. data/skeletons/CPU-OPENMP/common/globals.c +37 -0
  133. data/skeletons/CPU-OPENMP/common/globals_kernel.c +6 -0
  134. data/skeletons/CPU-OPENMP/common/header.c +0 -0
  135. data/skeletons/CPU-OPENMP/common/mem_copy_D2H.c +0 -0
  136. data/skeletons/CPU-OPENMP/common/mem_copy_H2D.c +0 -0
  137. data/skeletons/CPU-OPENMP/common/mem_epilogue.c +0 -0
  138. data/skeletons/CPU-OPENMP/common/mem_prologue.c +3 -0
  139. data/skeletons/CPU-OPENMP/common/prologue.c +0 -0
  140. data/skeletons/CPU-OPENMP/common/timer_1_start.c +12 -0
  141. data/skeletons/CPU-OPENMP/common/timer_1_stop.c +0 -0
  142. data/skeletons/CPU-OPENMP/common/timer_2_start.c +18 -0
  143. data/skeletons/CPU-OPENMP/common/timer_2_stop.c +8 -0
  144. data/skeletons/CPU-OPENMP/kernel/D-element-to-1-shared.host.c +27 -0
  145. data/skeletons/CPU-OPENMP/kernel/D-element-to-1-shared.kernel.c +46 -0
  146. data/skeletons/CPU-OPENMP/kernel/default.host.c +11 -0
  147. data/skeletons/CPU-OPENMP/kernel/default.kernel.c +18 -0
  148. data/skeletons/CPU-OPENMP/skeletons.txt +26 -0
  149. data/skeletons/GPU-CUDA/common/epilogue.c +0 -0
  150. data/skeletons/GPU-CUDA/common/globals.c +31 -0
  151. data/skeletons/GPU-CUDA/common/globals_kernel.c +4 -0
  152. data/skeletons/GPU-CUDA/common/header.c +0 -0
  153. data/skeletons/GPU-CUDA/common/mem_copy_D2H.c +3 -0
  154. data/skeletons/GPU-CUDA/common/mem_copy_H2D.c +3 -0
  155. data/skeletons/GPU-CUDA/common/mem_epilogue.c +3 -0
  156. data/skeletons/GPU-CUDA/common/mem_prologue.c +5 -0
  157. data/skeletons/GPU-CUDA/common/prologue.c +6 -0
  158. data/skeletons/GPU-CUDA/common/timer_1_start.c +6 -0
  159. data/skeletons/GPU-CUDA/common/timer_1_stop.c +10 -0
  160. data/skeletons/GPU-CUDA/common/timer_2_start.c +6 -0
  161. data/skeletons/GPU-CUDA/common/timer_2_stop.c +10 -0
  162. data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.host.c +3 -0
  163. data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.kernel.cu +105 -0
  164. data/skeletons/GPU-CUDA/kernel/D-element-to-1-shared.host.c +3 -0
  165. data/skeletons/GPU-CUDA/kernel/D-element-to-1-shared.kernel.cu +119 -0
  166. data/skeletons/GPU-CUDA/kernel/D-element-to-N-shared.host.c +3 -0
  167. data/skeletons/GPU-CUDA/kernel/D-element-to-N-shared.kernel.cu +166 -0
  168. data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.host.c +3 -0
  169. data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.kernel.cu +69 -0
  170. data/skeletons/GPU-CUDA/kernel/N-neighbourhood-N-to-N-element.host.c +3 -0
  171. data/skeletons/GPU-CUDA/kernel/N-neighbourhood-N-to-N-element.kernel.cu +42 -0
  172. data/skeletons/GPU-CUDA/kernel/default.host.c +3 -0
  173. data/skeletons/GPU-CUDA/kernel/default.kernel.cu +28 -0
  174. data/skeletons/GPU-CUDA/skeletons.txt +30 -0
  175. data/skeletons/GPU-OPENCL-AMD/common/epilogue.c +3 -0
  176. data/skeletons/GPU-OPENCL-AMD/common/globals.c +155 -0
  177. data/skeletons/GPU-OPENCL-AMD/common/globals_kernel.c +4 -0
  178. data/skeletons/GPU-OPENCL-AMD/common/header.c +0 -0
  179. data/skeletons/GPU-OPENCL-AMD/common/mem_copy_D2H.c +4 -0
  180. data/skeletons/GPU-OPENCL-AMD/common/mem_copy_H2D.c +4 -0
  181. data/skeletons/GPU-OPENCL-AMD/common/mem_epilogue.c +3 -0
  182. data/skeletons/GPU-OPENCL-AMD/common/mem_prologue.c +3 -0
  183. data/skeletons/GPU-OPENCL-AMD/common/prologue.c +24 -0
  184. data/skeletons/GPU-OPENCL-AMD/common/timer_1_start.c +5 -0
  185. data/skeletons/GPU-OPENCL-AMD/common/timer_1_stop.c +9 -0
  186. data/skeletons/GPU-OPENCL-AMD/common/timer_2_start.c +4 -0
  187. data/skeletons/GPU-OPENCL-AMD/common/timer_2_stop.c +11 -0
  188. data/skeletons/GPU-OPENCL-AMD/kernel/D-element-to-1-shared.host.c +67 -0
  189. data/skeletons/GPU-OPENCL-AMD/kernel/D-element-to-1-shared.kernel.cl +72 -0
  190. data/skeletons/GPU-OPENCL-AMD/kernel/default.host.c +14 -0
  191. data/skeletons/GPU-OPENCL-AMD/kernel/default.kernel.cl +13 -0
  192. data/skeletons/GPU-OPENCL-AMD/skeletons.txt +26 -0
  193. data/skeletons/verification/header.c +2 -0
  194. data/skeletons/verification/timer_start.c +4 -0
  195. data/skeletons/verification/timer_stop.c +6 -0
  196. data/skeletons/verification/verify_results.c +23 -0
  197. data/test/bones/test_algorithm.rb +40 -0
  198. data/test/bones/test_common.rb +54 -0
  199. data/test/bones/test_preprocessor.rb +46 -0
  200. data/test/bones/test_species.rb +21 -0
  201. data/test/bones/test_variable.rb +84 -0
  202. data/test/test_helper.rb +106 -0
  203. metadata +303 -0
@@ -0,0 +1,6 @@
1
+
2
+ // Set the cache size to maximal
3
+ cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
4
+
5
+ // Stop execution directly if there is no work to do
6
+ if (<parallelism> <= 0) { return; }
@@ -0,0 +1,6 @@
1
+
2
+ // Start the timer for the measurement of the kernel and memory copy execution time
3
+ cudaThreadSynchronize();
4
+ cudaEvent_t bones_start1;
5
+ cudaEventCreate(&bones_start1);
6
+ cudaEventRecord(bones_start1,0);
@@ -0,0 +1,10 @@
1
+
2
+ // End the timer for the measurement of the kernel and memory copy execution time
3
+ cudaThreadSynchronize();
4
+ cudaEvent_t bones_stop1;
5
+ cudaEventCreate(&bones_stop1);
6
+ cudaEventRecord(bones_stop1,0);
7
+ cudaEventSynchronize(bones_stop1);
8
+ float bones_timer1 = 0;
9
+ cudaEventElapsedTime(&bones_timer1,bones_start1,bones_stop1);
10
+ printf(">>>\t\t (<algorithm_basename>): Execution time [kernel+memcpy]: %.3lf ms \n", bones_timer1);
@@ -0,0 +1,6 @@
1
+
2
+ // Start the timer for the measurement of the kernel execution time
3
+ cudaThreadSynchronize();
4
+ cudaEvent_t bones_start2;
5
+ cudaEventCreate(&bones_start2);
6
+ cudaEventRecord(bones_start2,0);
@@ -0,0 +1,10 @@
1
+
2
+ // Stop the timer for the measurement of the kernel execution time
3
+ cudaThreadSynchronize();
4
+ cudaEvent_t bones_stop2;
5
+ cudaEventCreate(&bones_stop2);
6
+ cudaEventRecord(bones_stop2,0);
7
+ cudaEventSynchronize(bones_stop2);
8
+ float bones_timer2 = 0;
9
+ cudaEventElapsedTime(&bones_timer2,bones_start2,bones_stop2);
10
+ printf(">>>\t\t (<algorithm_basename>): Execution time [kernel ]: %.3lf ms \n", bones_timer2);
@@ -0,0 +1,3 @@
1
+
2
+ // Start the CUDA function
3
+ bones_prekernel_<algorithm_name>_0(<devicenames>, <argument_name>);
@@ -0,0 +1,105 @@
1
+ /* STARTDEF
2
+ void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>);
3
+ ENDDEF */
4
+ #define SHUFFLE_X 16
5
+ #define SHUFFLE_Y 16
6
+
7
+ // Start of the <algorithm_name> kernel
8
+ __global__ void bones_kernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
9
+ const int bones_global_id = blockIdx.x*blockDim.x + threadIdx.x;
10
+ if (bones_global_id < (<parallelism>)) {
11
+
12
+ // Calculate the global ID(s) based on the thread id
13
+ <ids>
14
+
15
+ // Start the computation
16
+ <algorithm_code1>
17
+ }
18
+ }
19
+
20
+ // Start of the <algorithm_name> kernel (pre-kernel for shuffling) - for first input
21
+ __global__ void bones_kernel_<algorithm_name>_1(<in0_type><in0_devicepointer> <in0_name>, <in0_type><in0_devicepointer> shuffled_<in0_name>, <argument_definition>) {
22
+ const int bones_global_id_0 = blockIdx.x*blockDim.x + threadIdx.x;
23
+ const int bones_global_id_1 = blockIdx.y*blockDim.y + threadIdx.y;
24
+
25
+ // Set-up the local memory for shuffling
26
+ __shared__ <in0_type> buffer[SHUFFLE_X][SHUFFLE_Y];
27
+
28
+ // Swap the x and y coordinates to perform the rotation (coalesced)
29
+ if (bones_global_id_0 < ((<in0_dimensions>)/(<in0_parameters>)) && bones_global_id_1 < (<in0_parameters>)) {
30
+ buffer[threadIdx.y][threadIdx.x] = <in0_name>[bones_global_id_0 + bones_global_id_1 * ((<in0_dimensions>)/(<in0_parameters>))];
31
+ }
32
+
33
+ // Synchronize all threads in the threadblock
34
+ __syncthreads();
35
+
36
+ // We don't have to swap the x and y thread indices here, because that's already done in the local memory
37
+ const int bones_global_id_0_new = blockIdx.y*blockDim.y + threadIdx.x;
38
+ const int bones_global_id_1_new = blockIdx.x*blockDim.x + threadIdx.y;
39
+
40
+ // Store the shuffled result (coalesced)
41
+ if (bones_global_id_0_new < ((<in0_dimensions>)/(<in0_parameters>)) && bones_global_id_1_new < (<in0_parameters>)) {
42
+ shuffled_<in0_name>[bones_global_id_0_new + bones_global_id_1_new * <in0_parameters>] = buffer[threadIdx.x][threadIdx.y];
43
+ }
44
+ }
45
+
46
+ // Start of the <algorithm_name> kernel (pre-kernel for shuffling) - for second input
47
+ __global__ void bones_kernel_<algorithm_name>_2(<in1_type><in1_devicepointer> <in1_name>, <in1_type><in1_devicepointer> shuffled_<in1_name>, <argument_definition>) {
48
+ const int bones_global_id_0 = blockIdx.x*blockDim.x + threadIdx.x;
49
+ const int bones_global_id_1 = blockIdx.y*blockDim.y + threadIdx.y;
50
+
51
+ // Set-up the local memory for shuffling
52
+ __shared__ <in1_type> buffer[SHUFFLE_X][SHUFFLE_Y];
53
+
54
+ // Swap the x and y coordinates to perform the rotation (coalesced)
55
+ if (bones_global_id_0 < ((<in1_dimensions>)/(<in1_parameters>)) && bones_global_id_1 < (<in1_parameters>)) {
56
+ buffer[threadIdx.y][threadIdx.x] = <in1_name>[bones_global_id_0 + bones_global_id_1 * ((<in1_dimensions>)/(<in1_parameters>))];
57
+ }
58
+
59
+ // Synchronize all threads in the threadblock
60
+ __syncthreads();
61
+
62
+ // We don't have to swap the x and y thread indices here, because that's already done in the local memory
63
+ const int bones_global_id_0_new = blockIdx.y*blockDim.y + threadIdx.x;
64
+ const int bones_global_id_1_new = blockIdx.x*blockDim.x + threadIdx.y;
65
+
66
+ // Store the shuffled result (coalesced)
67
+ if (bones_global_id_0_new < ((<in1_dimensions>)/(<in1_parameters>)) && bones_global_id_1_new < (<in1_parameters>)) {
68
+ shuffled_<in1_name>[bones_global_id_0_new + bones_global_id_1_new * <in1_parameters>] = buffer[threadIdx.x][threadIdx.y];
69
+ }
70
+ }
71
+
72
+ // Function to start the kernel
73
+ extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
74
+ int bones_block_size;
75
+ if (<parallelism> >= 64*512) { bones_block_size = 512;}
76
+ else if (<parallelism> >= 64*256) { bones_block_size = 256;}
77
+ else if (<parallelism> >= 64*128) { bones_block_size = 128;}
78
+ else if (<parallelism> >= 64*64 ) { bones_block_size = 64; }
79
+ else { bones_block_size = 32; }
80
+
81
+ // First perform some pre-shuffling (for the first input)
82
+ <in0_type>* shuffled_<in0_name> = 0;
83
+ cudaMalloc((void**)&shuffled_<in0_name>, <in0_dimensions>*sizeof(<in0_type>));
84
+ dim3 bones_threads1(SHUFFLE_X,SHUFFLE_Y);
85
+ dim3 bones_grid1(DIV_CEIL(((<in0_dimensions>)/(<in0_parameters>)),SHUFFLE_X),DIV_CEIL(<in0_parameters>,SHUFFLE_Y));
86
+ bones_kernel_<algorithm_name>_1<<< bones_grid1, bones_threads1 >>>(<in0_name>, shuffled_<in0_name>, <argument_name>);
87
+ <in0_type>* temp_<in0_name> = <in0_name>;
88
+ <in0_name> = shuffled_<in0_name>;
89
+ cudaFree(temp_<in0_name>);
90
+
91
+ // First perform some pre-shuffling (for the second input)
92
+ <in0_type>* shuffled_<in1_name> = 0;
93
+ cudaMalloc((void**)&shuffled_<in1_name>, <in1_dimensions>*sizeof(<in1_type>));
94
+ dim3 bones_threads2(SHUFFLE_X,SHUFFLE_Y);
95
+ dim3 bones_grid2(DIV_CEIL(((<in1_dimensions>)/(<in1_parameters>)),SHUFFLE_X),DIV_CEIL(<in1_parameters>,SHUFFLE_Y));
96
+ bones_kernel_<algorithm_name>_2<<< bones_grid2, bones_threads2 >>>(<in1_name>, shuffled_<in1_name>, <argument_name>);
97
+ <in1_type>* temp_<in1_name> = <in1_name>;
98
+ <in1_name> = shuffled_<in1_name>;
99
+ cudaFree(temp_<in1_name>);
100
+
101
+ // Then run the original kernel
102
+ dim3 bones_threads0(bones_block_size);
103
+ dim3 bones_grid0(DIV_CEIL(<parallelism>,bones_block_size));
104
+ bones_kernel_<algorithm_name>_0<<< bones_grid0, bones_threads0 >>>(<names>, <argument_name>);
105
+ }
@@ -0,0 +1,3 @@
1
+
2
+ // Start the CUDA function
3
+ bones_prekernel_<algorithm_name>_0(<devicenames>, <argument_name>);
@@ -0,0 +1,119 @@
1
+ /* STARTDEF
2
+ void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>);
3
+ ENDDEF */
4
+ // Start of the <algorithm_name> kernel (main, not unrolled kernel)
5
+ __global__ void bones_kernel_<algorithm_name>_0(int bones_input_size, <in0_type><in0_devicepointer> <in0_name>, <out0_type><out0_devicepointer> <out0_name>, <argument_definition>) {
6
+ const int bones_threadblock_work = DIV_CEIL(bones_input_size,gridDim.x);
7
+ const int bones_parallel_work = BONES_MIN(blockDim.x,bones_threadblock_work);
8
+ const int bones_sequential_work = DIV_CEIL(bones_threadblock_work,bones_parallel_work);
9
+ const int bones_local_id = threadIdx.x;
10
+ const int bones_global_id = blockIdx.x*bones_parallel_work + threadIdx.x;
11
+ <ids>
12
+ int bones_iter_id = <in0_flatindex>;
13
+
14
+ // Load data into thread private memory and perform the first computation(s) sequentially
15
+ <in0_type> bones_temporary = <in0_name>[bones_iter_id];
16
+ <in0_type> bones_private_memory = <algorithm_code3>;
17
+ for(int c=1; c<bones_sequential_work; c++) {
18
+ bones_iter_id = bones_iter_id + bones_parallel_work*gridDim.x<factors>;
19
+ if (bones_iter_id <= <in0_to>) {
20
+ bones_temporary = <in0_name>[bones_iter_id];
21
+ bones_private_memory = <algorithm_code1>;
22
+ }
23
+ }
24
+
25
+ // Initialize the local memory
26
+ volatile __shared__ <in0_type> bones_local_memory[512];
27
+ bones_local_memory[bones_local_id] = bones_private_memory;
28
+ __syncthreads();
29
+
30
+ // Perform the remainder of the computations in parallel using a parallel reduction tree
31
+ int bones_offset_id;
32
+ for (int c=512; c>=2; c=c>>1) {
33
+ if ((2*bones_parallel_work > c) && (threadIdx.x < c/2)) {
34
+ bones_offset_id = threadIdx.x+c/2;
35
+ if (bones_offset_id < bones_parallel_work) {
36
+ __syncthreads();
37
+ bones_local_memory[bones_local_id] = <algorithm_code2>;
38
+ }
39
+ }
40
+ __syncthreads();
41
+ }
42
+
43
+ // Write the final result back to the global memory
44
+ if (threadIdx.x == 0) { <out0_name>[blockIdx.x] = bones_local_memory[0]; }
45
+ }
46
+
47
+ // Start of the <algorithm_name> kernel (secondary, not unrolled kernel)
48
+ __global__ void bones_kernel_<algorithm_name>_1(<in0_type><in0_devicepointer> <in0_name>, <out0_type><out0_devicepointer> <out0_name>, <argument_definition>) {
49
+ const int bones_local_id = threadIdx.x;
50
+ const int bones_global_id = threadIdx.x;
51
+
52
+ // Initialize the local memory
53
+ volatile __shared__ <in0_type> bones_local_memory[512];
54
+ bones_local_memory[bones_local_id] = <in0_name>[bones_global_id];
55
+ __syncthreads();
56
+
57
+ // Perform reduction using a parallel reduction tree
58
+ int bones_offset_id;
59
+ for (int c=128; c>=2; c=c>>1) {
60
+ if (threadIdx.x < c/2) {
61
+ bones_offset_id = threadIdx.x+c/2;
62
+ bones_local_memory[bones_local_id] = <algorithm_code2>;
63
+ __syncthreads();
64
+ }
65
+ }
66
+
67
+ // Write the final result back to the global memory
68
+ if (threadIdx.x == 0) { <out0_name>[0] = bones_local_memory[0]; }
69
+ }
70
+
71
+ // Start of the <algorithm_name> kernel (final, initial value kernel)
72
+ __global__ void bones_kernel_<algorithm_name>_2(<out0_type><out0_devicepointer> bones_initial_value, <out0_type><out0_devicepointer> <out0_name>, <argument_definition>) {
73
+ <out0_type> bones_private_memory = <out0_name>[0];
74
+ <out0_type> bones_temporary = bones_initial_value[0];
75
+ <out0_name>[0] = <algorithm_code4>;
76
+ }
77
+
78
+ // Function to start the kernel
79
+ extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
80
+
81
+ // Store the initial value
82
+ <out0_type>* bones_initial_value = 0;
83
+ cudaMalloc(&bones_initial_value, sizeof(<out0_type>));
84
+ cudaMemcpy(bones_initial_value, <out0_name>, sizeof(<out0_type>), cudaMemcpyDeviceToDevice);
85
+
86
+ // Run either one kernel or multiple kernels
87
+ if (<in0_dimensions> <= 1024) {
88
+
89
+ // Start only one kernel
90
+ const int bones_num_threads = DIV_CEIL(<in0_dimensions>,2);
91
+ dim3 bones_threads(bones_num_threads);
92
+ dim3 bones_grid(1);
93
+ bones_kernel_<algorithm_name>_0<<< bones_grid, bones_threads >>>(<in0_dimensions>,<in0_name>,<out0_name>,<argument_name>);
94
+ }
95
+ else {
96
+
97
+ // Allocate space for an intermediate array
98
+ <out0_type>* bones_device_temp = 0;
99
+ cudaMalloc(&bones_device_temp, 128*sizeof(<out0_type>));
100
+
101
+ // Start the first kernel
102
+ dim3 bones_threads1(512);
103
+ dim3 bones_grid1(128);
104
+ bones_kernel_<algorithm_name>_0<<< bones_grid1, bones_threads1 >>>(<in0_dimensions>,<in0_name>,bones_device_temp,<argument_name>);
105
+
106
+ // Start the second kernel
107
+ dim3 bones_threads2(128);
108
+ dim3 bones_grid2(1);
109
+ bones_kernel_<algorithm_name>_1<<< bones_grid2, bones_threads2 >>>(bones_device_temp,<out0_name>,<argument_name>);
110
+
111
+ cudaFree(bones_device_temp);
112
+ }
113
+
114
+ // Perform the last computation (only needed if there is an initial value)
115
+ dim3 bones_threads3(1);
116
+ dim3 bones_grid3(1);
117
+ bones_kernel_<algorithm_name>_2<<< bones_grid3, bones_threads3 >>>(bones_initial_value,<out0_name>,<argument_name>);
118
+ cudaFree(bones_initial_value);
119
+ }
@@ -0,0 +1,3 @@
1
+
2
+ // Start the CUDA function
3
+ bones_prekernel_<algorithm_name>_0(<devicenames>, <argument_name>);
@@ -0,0 +1,166 @@
1
+ /* STARTDEF
2
+ void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>);
3
+ ENDDEF */
4
+
5
+ template<int SCALE>
6
+ __global__ void bones_kernel_<algorithm_name>_0(int *<in0_name>_index, <in0_type> *<in0_name>_value, int *<out0_name>, int const votecount)
7
+ {
8
+ int nbins = <out0_dimension0_sum>;
9
+ int nbins_part = ceilf((float)nbins / gridDim.y);
10
+ int part_offset = blockIdx.y * nbins_part;
11
+
12
+ //init temp. vote line in shared memory
13
+ extern __shared__ int votespace_line[];
14
+ for(int i=threadIdx.x; i<nbins_part*SCALE; i+=1024)
15
+ votespace_line[i] = 0;
16
+ __syncthreads();
17
+
18
+ // calculate start and stop index of input for sub-vote spaces
19
+ int start_index = blockIdx.z *votecount/gridDim.z + threadIdx.x;
20
+ int stop_index = min( (blockIdx.z+1)*votecount/gridDim.z , votecount);
21
+
22
+ for(int i=start_index; i<stop_index; i+=1024)
23
+ {
24
+ //int arr_val_index = <in0_name>_index[i];
25
+ <in0_type> arr_val_value = <in0_name>_value[i];
26
+ int vote_index = (int)((arr_val_value & 0x00FF) * (nbins / 256.0f));
27
+ vote_index = SCALE*vote_index + (threadIdx.x & (SCALE-1)) - part_offset;
28
+ int vote_value = 1; // Vote value
29
+ if(vote_index<(nbins_part*SCALE) && vote_index>=0)
30
+ atomicAdd(&votespace_line[vote_index], vote_value);
31
+ }
32
+ __syncthreads();
33
+
34
+ for(int i=threadIdx.x; i<nbins_part; i+=1024)
35
+ {
36
+ int value=0;
37
+ #pragma unroll
38
+ for(int j=0; j<SCALE; j++)
39
+ value += votespace_line[SCALE*i+j];
40
+
41
+ <out0_name>[blockIdx.z*nbins*gridDim.x +
42
+ blockIdx.x*nbins +
43
+ blockIdx.y*nbins_part + i] = value;
44
+ }
45
+ }
46
+
47
+ __global__ void bones_kernel_<algorithm_name>_1(int *in, int *out, int const num_subvotespaces, int const nbins)
48
+ {
49
+ // Identify the thread
50
+ int p = blockIdx.x*blockDim.x + threadIdx.x;
51
+ if(p>nbins)
52
+ return;
53
+
54
+ // Sum the sub-votespaces
55
+ int result = 0;
56
+ #pragma unroll
57
+ for (int i=0;i<num_subvotespaces;i++) {
58
+ result += in[blockIdx.y*num_subvotespaces*nbins + i*nbins + p];
59
+ }
60
+
61
+ // Write the results to off-chip memory
62
+ out[blockIdx.y*nbins + p] = result;
63
+ }
64
+
65
+ // Function to start the kernel
66
+ extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
67
+ int * gpu_array_index = 0;
68
+ <in0_type> *gpu_array_value = <in0_name>;
69
+ int cpu_votecount = <in0_dimensions>;
70
+ int *gpu_votespace = (int*)<out0_name>;
71
+ int *gpu_temp = 0;
72
+
73
+ int nbins = <out0_dimension0_sum>;
74
+ int number_multiprocessors = 14;
75
+ int nbingroups = 1;
76
+
77
+ int scaling=8192/nbins;
78
+ int split_in_parts = 1;
79
+ int subvotespaces = 1;
80
+ int *gpu_out;
81
+
82
+ //calculate the scaling factor, and limit it to the values 1, 2, 4, 8, 16, 32, 64 and 128
83
+ if(scaling < 1) {
84
+ //too many bins requested, no scaling but splitting
85
+ scaling = 1;
86
+ split_in_parts = ceil(nbins / 8192.0f);
87
+ }
88
+ else if (scaling > 256) {
89
+ scaling = 256;
90
+ }
91
+ else {
92
+ int mask = 8192;
93
+ while(0 == (mask & scaling))
94
+ mask >>= 1;
95
+ scaling = mask;
96
+ }
97
+
98
+ if( (nbingroups*split_in_parts) < number_multiprocessors) {
99
+ int const maxsub = ceil((float)(<in0_dimensions>) / (float)(32*250));
100
+ cudaMalloc((void**)&gpu_temp, maxsub*nbingroups*nbins*sizeof(int));
101
+ if (gpu_temp != NULL) {
102
+ subvotespaces = number_multiprocessors / (nbingroups*split_in_parts);
103
+ gpu_out = gpu_temp;
104
+ }
105
+ else {
106
+ gpu_out = gpu_votespace;
107
+ }
108
+ }
109
+ else
110
+ {
111
+ gpu_out = gpu_votespace;
112
+ }
113
+
114
+ //scaling = 256;
115
+ //printf("%d %d %d %d %d\n", nbins, scaling, nbingroups, split_in_parts, subvotespaces);
116
+
117
+ dim3 dimensionsBlock1(1024);
118
+ dim3 dimensionsGrid1(nbingroups, split_in_parts, subvotespaces);
119
+ int const nbins_part = ceilf((float)nbins / split_in_parts);
120
+
121
+ switch(scaling) {
122
+ case 256:
123
+ bones_kernel_<algorithm_name>_0<256><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
124
+ (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
125
+ break;
126
+ case 128:
127
+ bones_kernel_<algorithm_name>_0<128><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
128
+ (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
129
+ break;
130
+ case 64:
131
+ bones_kernel_<algorithm_name>_0< 64><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
132
+ (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
133
+ break;
134
+ case 32:
135
+ bones_kernel_<algorithm_name>_0< 32><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
136
+ (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
137
+ break;
138
+ case 16:
139
+ bones_kernel_<algorithm_name>_0< 16><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
140
+ (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
141
+ break;
142
+ case 8:
143
+ bones_kernel_<algorithm_name>_0< 8><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
144
+ (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
145
+ break;
146
+ case 4:
147
+ bones_kernel_<algorithm_name>_0< 4><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
148
+ (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
149
+ break;
150
+ case 2:
151
+ bones_kernel_<algorithm_name>_0< 2><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
152
+ (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
153
+ break;
154
+ default:
155
+ bones_kernel_<algorithm_name>_0< 1><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
156
+ (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
157
+ break;
158
+ }
159
+
160
+ if(subvotespaces > 1) {
161
+ dim3 dimensionsBlock2(min(nbins,1024));
162
+ dim3 dimensionsGrid2(ceil((float)nbins/(float)1024), nbingroups);
163
+ bones_kernel_<algorithm_name>_1<<<dimensionsGrid2, dimensionsBlock2>>>(gpu_out, gpu_votespace, subvotespaces, nbins);
164
+ cudaFree(gpu_temp);
165
+ }
166
+ }