bones-compiler 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (203) hide show
  1. data/CHANGELOG +117 -0
  2. data/LICENSE +9 -0
  3. data/README.rdoc +126 -0
  4. data/Rakefile +107 -0
  5. data/VERSION +1 -0
  6. data/bin/bones +20 -0
  7. data/examples/applications/ffos.c +552 -0
  8. data/examples/benchmarks/2mm.c +70 -0
  9. data/examples/benchmarks/3mm.c +81 -0
  10. data/examples/benchmarks/adi.c +81 -0
  11. data/examples/benchmarks/atax.c +65 -0
  12. data/examples/benchmarks/bicg.c +67 -0
  13. data/examples/benchmarks/cholesky.c +64 -0
  14. data/examples/benchmarks/common.h +168 -0
  15. data/examples/benchmarks/correlation.c +97 -0
  16. data/examples/benchmarks/covariance.c +77 -0
  17. data/examples/benchmarks/doitgen.c +63 -0
  18. data/examples/benchmarks/durbin.c +76 -0
  19. data/examples/benchmarks/dynprog.c +67 -0
  20. data/examples/benchmarks/fdtd-2d-apml.c +114 -0
  21. data/examples/benchmarks/fdtd-2d.c +74 -0
  22. data/examples/benchmarks/floyd-warshall.c +50 -0
  23. data/examples/benchmarks/gemm.c +69 -0
  24. data/examples/benchmarks/gemver.c +89 -0
  25. data/examples/benchmarks/gesummv.c +64 -0
  26. data/examples/benchmarks/gramschmidt.c +84 -0
  27. data/examples/benchmarks/jacobi-1d-imper.c +55 -0
  28. data/examples/benchmarks/jacobi-2d-imper.c +61 -0
  29. data/examples/benchmarks/lu.c +57 -0
  30. data/examples/benchmarks/ludcmp.c +91 -0
  31. data/examples/benchmarks/mvt.c +65 -0
  32. data/examples/benchmarks/overview.txt +38 -0
  33. data/examples/benchmarks/reg_detect.c +82 -0
  34. data/examples/benchmarks/saxpy.c +45 -0
  35. data/examples/benchmarks/seidel-2d.c +51 -0
  36. data/examples/benchmarks/symm.c +74 -0
  37. data/examples/benchmarks/syr2k.c +65 -0
  38. data/examples/benchmarks/syrk.c +62 -0
  39. data/examples/benchmarks/trisolv.c +57 -0
  40. data/examples/benchmarks/trmm.c +57 -0
  41. data/examples/chunk/example1.c +54 -0
  42. data/examples/chunk/example2.c +44 -0
  43. data/examples/chunk/example3.c +59 -0
  44. data/examples/chunk/example4.c +55 -0
  45. data/examples/chunk/example5.c +52 -0
  46. data/examples/element/example1.c +46 -0
  47. data/examples/element/example10.c +50 -0
  48. data/examples/element/example11.c +47 -0
  49. data/examples/element/example12.c +56 -0
  50. data/examples/element/example2.c +46 -0
  51. data/examples/element/example3.c +58 -0
  52. data/examples/element/example4.c +49 -0
  53. data/examples/element/example5.c +56 -0
  54. data/examples/element/example6.c +46 -0
  55. data/examples/element/example7.c +54 -0
  56. data/examples/element/example8.c +45 -0
  57. data/examples/element/example9.c +48 -0
  58. data/examples/neighbourhood/example1.c +54 -0
  59. data/examples/neighbourhood/example2.c +55 -0
  60. data/examples/neighbourhood/example3.c +82 -0
  61. data/examples/neighbourhood/example4.c +52 -0
  62. data/examples/shared/example1.c +45 -0
  63. data/examples/shared/example2.c +51 -0
  64. data/examples/shared/example3.c +55 -0
  65. data/examples/shared/example4.c +52 -0
  66. data/examples/shared/example5.c +48 -0
  67. data/lib/bones.rb +266 -0
  68. data/lib/bones/algorithm.rb +541 -0
  69. data/lib/bones/engine.rb +386 -0
  70. data/lib/bones/preprocessor.rb +161 -0
  71. data/lib/bones/species.rb +196 -0
  72. data/lib/bones/structure.rb +94 -0
  73. data/lib/bones/variable.rb +169 -0
  74. data/lib/bones/variablelist.rb +72 -0
  75. data/lib/castaddon.rb +27 -0
  76. data/lib/castaddon/index.rb +40 -0
  77. data/lib/castaddon/node.rb +753 -0
  78. data/lib/castaddon/type.rb +37 -0
  79. data/skeletons/CPU-C/common/epilogue.c +0 -0
  80. data/skeletons/CPU-C/common/globals.c +17 -0
  81. data/skeletons/CPU-C/common/globals_kernel.c +1 -0
  82. data/skeletons/CPU-C/common/header.c +0 -0
  83. data/skeletons/CPU-C/common/mem_copy_D2H.c +0 -0
  84. data/skeletons/CPU-C/common/mem_copy_H2D.c +0 -0
  85. data/skeletons/CPU-C/common/mem_epilogue.c +0 -0
  86. data/skeletons/CPU-C/common/mem_prologue.c +3 -0
  87. data/skeletons/CPU-C/common/prologue.c +0 -0
  88. data/skeletons/CPU-C/common/timer_1_start.c +0 -0
  89. data/skeletons/CPU-C/common/timer_1_stop.c +0 -0
  90. data/skeletons/CPU-C/common/timer_2_start.c +20 -0
  91. data/skeletons/CPU-C/common/timer_2_stop.c +8 -0
  92. data/skeletons/CPU-C/kernel/default.host.c +3 -0
  93. data/skeletons/CPU-C/kernel/default.kernel.c +15 -0
  94. data/skeletons/CPU-C/skeletons.txt +24 -0
  95. data/skeletons/CPU-OPENCL-AMD/common/epilogue.c +6 -0
  96. data/skeletons/CPU-OPENCL-AMD/common/globals.c +155 -0
  97. data/skeletons/CPU-OPENCL-AMD/common/globals_kernel.c +4 -0
  98. data/skeletons/CPU-OPENCL-AMD/common/header.c +0 -0
  99. data/skeletons/CPU-OPENCL-AMD/common/mem_copy_D2H.c +8 -0
  100. data/skeletons/CPU-OPENCL-AMD/common/mem_copy_H2D.c +4 -0
  101. data/skeletons/CPU-OPENCL-AMD/common/mem_epilogue.c +3 -0
  102. data/skeletons/CPU-OPENCL-AMD/common/mem_prologue.c +6 -0
  103. data/skeletons/CPU-OPENCL-AMD/common/prologue.c +24 -0
  104. data/skeletons/CPU-OPENCL-AMD/common/timer_1_start.c +5 -0
  105. data/skeletons/CPU-OPENCL-AMD/common/timer_1_stop.c +9 -0
  106. data/skeletons/CPU-OPENCL-AMD/common/timer_2_start.c +16 -0
  107. data/skeletons/CPU-OPENCL-AMD/common/timer_2_stop.c +11 -0
  108. data/skeletons/CPU-OPENCL-AMD/kernel/D-element-to-1-shared.host.c +67 -0
  109. data/skeletons/CPU-OPENCL-AMD/kernel/D-element-to-1-shared.kernel.cl +72 -0
  110. data/skeletons/CPU-OPENCL-AMD/kernel/default.host.c +14 -0
  111. data/skeletons/CPU-OPENCL-AMD/kernel/default.kernel.cl +13 -0
  112. data/skeletons/CPU-OPENCL-AMD/skeletons.txt +26 -0
  113. data/skeletons/CPU-OPENCL-INTEL/common/epilogue.c +3 -0
  114. data/skeletons/CPU-OPENCL-INTEL/common/globals.c +154 -0
  115. data/skeletons/CPU-OPENCL-INTEL/common/globals_kernel.c +4 -0
  116. data/skeletons/CPU-OPENCL-INTEL/common/header.c +31 -0
  117. data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_D2H.c +5 -0
  118. data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_H2D.c +3 -0
  119. data/skeletons/CPU-OPENCL-INTEL/common/mem_epilogue.c +3 -0
  120. data/skeletons/CPU-OPENCL-INTEL/common/mem_prologue.c +4 -0
  121. data/skeletons/CPU-OPENCL-INTEL/common/prologue.c +24 -0
  122. data/skeletons/CPU-OPENCL-INTEL/common/timer_1_start.c +5 -0
  123. data/skeletons/CPU-OPENCL-INTEL/common/timer_1_stop.c +9 -0
  124. data/skeletons/CPU-OPENCL-INTEL/common/timer_2_start.c +16 -0
  125. data/skeletons/CPU-OPENCL-INTEL/common/timer_2_stop.c +11 -0
  126. data/skeletons/CPU-OPENCL-INTEL/kernel/D-element-to-1-shared.host.c +67 -0
  127. data/skeletons/CPU-OPENCL-INTEL/kernel/D-element-to-1-shared.kernel.cl +72 -0
  128. data/skeletons/CPU-OPENCL-INTEL/kernel/default.host.c +14 -0
  129. data/skeletons/CPU-OPENCL-INTEL/kernel/default.kernel.cl +13 -0
  130. data/skeletons/CPU-OPENCL-INTEL/skeletons.txt +26 -0
  131. data/skeletons/CPU-OPENMP/common/epilogue.c +0 -0
  132. data/skeletons/CPU-OPENMP/common/globals.c +37 -0
  133. data/skeletons/CPU-OPENMP/common/globals_kernel.c +6 -0
  134. data/skeletons/CPU-OPENMP/common/header.c +0 -0
  135. data/skeletons/CPU-OPENMP/common/mem_copy_D2H.c +0 -0
  136. data/skeletons/CPU-OPENMP/common/mem_copy_H2D.c +0 -0
  137. data/skeletons/CPU-OPENMP/common/mem_epilogue.c +0 -0
  138. data/skeletons/CPU-OPENMP/common/mem_prologue.c +3 -0
  139. data/skeletons/CPU-OPENMP/common/prologue.c +0 -0
  140. data/skeletons/CPU-OPENMP/common/timer_1_start.c +12 -0
  141. data/skeletons/CPU-OPENMP/common/timer_1_stop.c +0 -0
  142. data/skeletons/CPU-OPENMP/common/timer_2_start.c +18 -0
  143. data/skeletons/CPU-OPENMP/common/timer_2_stop.c +8 -0
  144. data/skeletons/CPU-OPENMP/kernel/D-element-to-1-shared.host.c +27 -0
  145. data/skeletons/CPU-OPENMP/kernel/D-element-to-1-shared.kernel.c +46 -0
  146. data/skeletons/CPU-OPENMP/kernel/default.host.c +11 -0
  147. data/skeletons/CPU-OPENMP/kernel/default.kernel.c +18 -0
  148. data/skeletons/CPU-OPENMP/skeletons.txt +26 -0
  149. data/skeletons/GPU-CUDA/common/epilogue.c +0 -0
  150. data/skeletons/GPU-CUDA/common/globals.c +31 -0
  151. data/skeletons/GPU-CUDA/common/globals_kernel.c +4 -0
  152. data/skeletons/GPU-CUDA/common/header.c +0 -0
  153. data/skeletons/GPU-CUDA/common/mem_copy_D2H.c +3 -0
  154. data/skeletons/GPU-CUDA/common/mem_copy_H2D.c +3 -0
  155. data/skeletons/GPU-CUDA/common/mem_epilogue.c +3 -0
  156. data/skeletons/GPU-CUDA/common/mem_prologue.c +5 -0
  157. data/skeletons/GPU-CUDA/common/prologue.c +6 -0
  158. data/skeletons/GPU-CUDA/common/timer_1_start.c +6 -0
  159. data/skeletons/GPU-CUDA/common/timer_1_stop.c +10 -0
  160. data/skeletons/GPU-CUDA/common/timer_2_start.c +6 -0
  161. data/skeletons/GPU-CUDA/common/timer_2_stop.c +10 -0
  162. data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.host.c +3 -0
  163. data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.kernel.cu +105 -0
  164. data/skeletons/GPU-CUDA/kernel/D-element-to-1-shared.host.c +3 -0
  165. data/skeletons/GPU-CUDA/kernel/D-element-to-1-shared.kernel.cu +119 -0
  166. data/skeletons/GPU-CUDA/kernel/D-element-to-N-shared.host.c +3 -0
  167. data/skeletons/GPU-CUDA/kernel/D-element-to-N-shared.kernel.cu +166 -0
  168. data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.host.c +3 -0
  169. data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.kernel.cu +69 -0
  170. data/skeletons/GPU-CUDA/kernel/N-neighbourhood-N-to-N-element.host.c +3 -0
  171. data/skeletons/GPU-CUDA/kernel/N-neighbourhood-N-to-N-element.kernel.cu +42 -0
  172. data/skeletons/GPU-CUDA/kernel/default.host.c +3 -0
  173. data/skeletons/GPU-CUDA/kernel/default.kernel.cu +28 -0
  174. data/skeletons/GPU-CUDA/skeletons.txt +30 -0
  175. data/skeletons/GPU-OPENCL-AMD/common/epilogue.c +3 -0
  176. data/skeletons/GPU-OPENCL-AMD/common/globals.c +155 -0
  177. data/skeletons/GPU-OPENCL-AMD/common/globals_kernel.c +4 -0
  178. data/skeletons/GPU-OPENCL-AMD/common/header.c +0 -0
  179. data/skeletons/GPU-OPENCL-AMD/common/mem_copy_D2H.c +4 -0
  180. data/skeletons/GPU-OPENCL-AMD/common/mem_copy_H2D.c +4 -0
  181. data/skeletons/GPU-OPENCL-AMD/common/mem_epilogue.c +3 -0
  182. data/skeletons/GPU-OPENCL-AMD/common/mem_prologue.c +3 -0
  183. data/skeletons/GPU-OPENCL-AMD/common/prologue.c +24 -0
  184. data/skeletons/GPU-OPENCL-AMD/common/timer_1_start.c +5 -0
  185. data/skeletons/GPU-OPENCL-AMD/common/timer_1_stop.c +9 -0
  186. data/skeletons/GPU-OPENCL-AMD/common/timer_2_start.c +4 -0
  187. data/skeletons/GPU-OPENCL-AMD/common/timer_2_stop.c +11 -0
  188. data/skeletons/GPU-OPENCL-AMD/kernel/D-element-to-1-shared.host.c +67 -0
  189. data/skeletons/GPU-OPENCL-AMD/kernel/D-element-to-1-shared.kernel.cl +72 -0
  190. data/skeletons/GPU-OPENCL-AMD/kernel/default.host.c +14 -0
  191. data/skeletons/GPU-OPENCL-AMD/kernel/default.kernel.cl +13 -0
  192. data/skeletons/GPU-OPENCL-AMD/skeletons.txt +26 -0
  193. data/skeletons/verification/header.c +2 -0
  194. data/skeletons/verification/timer_start.c +4 -0
  195. data/skeletons/verification/timer_stop.c +6 -0
  196. data/skeletons/verification/verify_results.c +23 -0
  197. data/test/bones/test_algorithm.rb +40 -0
  198. data/test/bones/test_common.rb +54 -0
  199. data/test/bones/test_preprocessor.rb +46 -0
  200. data/test/bones/test_species.rb +21 -0
  201. data/test/bones/test_variable.rb +84 -0
  202. data/test/test_helper.rb +106 -0
  203. metadata +303 -0
@@ -0,0 +1,3 @@
1
+
2
+ // Start the CUDA function
3
+ bones_prekernel_<algorithm_name>_0(<devicenames>, <argument_name>);
@@ -0,0 +1,69 @@
1
+ /* STARTDEF
2
+ void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>);
3
+ ENDDEF */
4
+ #define SHUFFLE_X 16
5
+ #define SHUFFLE_Y 16
6
+
7
+ // Start of the <algorithm_name> kernel
8
+ __global__ void bones_kernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
9
+ const int bones_global_id = blockIdx.x*blockDim.x + threadIdx.x;
10
+ if (bones_global_id < (<parallelism>)) {
11
+
12
+ // Calculate the global ID(s) based on the thread id
13
+ <ids>
14
+
15
+ // Start the computation
16
+ <algorithm_code1>
17
+ }
18
+ }
19
+
20
+ // Start of the <algorithm_name> kernel (pre-kernel for shuffling)
21
+ __global__ void bones_kernel_<algorithm_name>_1(<in0_type><in0_devicepointer> <in0_name>, <in0_type><in0_devicepointer> shuffled_<in0_name>, <argument_definition>) {
22
+ const int bones_global_id_0 = blockIdx.x*blockDim.x + threadIdx.x;
23
+ const int bones_global_id_1 = blockIdx.y*blockDim.y + threadIdx.y;
24
+
25
+ // Set up the local (CUDA shared) memory tile used for shuffling
26
+ __shared__ <in0_type> buffer[SHUFFLE_X][SHUFFLE_Y];
27
+
28
+ // Swap the x and y coordinates to perform the rotation (coalesced)
29
+ if (bones_global_id_0 < ((<in0_dimensions>)/(<in0_parameters>)) && bones_global_id_1 < (<in0_parameters>)) {
30
+ buffer[threadIdx.y][threadIdx.x] = <in0_name>[bones_global_id_0 + bones_global_id_1 * ((<in0_dimensions>)/(<in0_parameters>))];
31
+ }
32
+
33
+ // Synchronize all threads in the threadblock
34
+ __syncthreads();
35
+
36
+ // We don't have to swap the x and y thread indices here, because that's already done in the local memory
37
+ const int bones_global_id_0_new = blockIdx.y*blockDim.y + threadIdx.x;
38
+ const int bones_global_id_1_new = blockIdx.x*blockDim.x + threadIdx.y;
39
+
40
+ // Store the shuffled result (coalesced)
41
+ if (bones_global_id_0_new < ((<in0_dimensions>)/(<in0_parameters>)) && bones_global_id_1_new < (<in0_parameters>)) {
42
+ shuffled_<in0_name>[bones_global_id_0_new + bones_global_id_1_new * <in0_parameters>] = buffer[threadIdx.x][threadIdx.y];
43
+ }
44
+ }
45
+
46
+ // Function to start the kernel
47
+ extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
48
+ int bones_block_size;
49
+ if (<parallelism> >= 64*512) { bones_block_size = 512;}
50
+ else if (<parallelism> >= 64*256) { bones_block_size = 256;}
51
+ else if (<parallelism> >= 64*128) { bones_block_size = 128;}
52
+ else if (<parallelism> >= 64*64 ) { bones_block_size = 64; }
53
+ else { bones_block_size = 32; }
54
+
55
+ // First perform some pre-shuffling
56
+ <in0_type>* shuffled_<in0_name> = 0;
57
+ cudaMalloc((void**)&shuffled_<in0_name>, <in0_dimensions>*sizeof(<in0_type>));
58
+ dim3 bones_threads1(SHUFFLE_X,SHUFFLE_Y);
59
+ dim3 bones_grid1(DIV_CEIL(((<in0_dimensions>)/(<in0_parameters>)),SHUFFLE_X),DIV_CEIL(<in0_parameters>,SHUFFLE_Y));
60
+ bones_kernel_<algorithm_name>_1<<< bones_grid1, bones_threads1 >>>(<in0_name>, shuffled_<in0_name>, <argument_name>);
61
+ <in0_type>* temp_<in0_name> = <in0_name>;
62
+ <in0_name> = shuffled_<in0_name>;
63
+ cudaFree(temp_<in0_name>);
64
+
65
+ // Then run the original kernel
66
+ dim3 bones_threads0(bones_block_size);
67
+ dim3 bones_grid0(DIV_CEIL(<parallelism>,bones_block_size));
68
+ bones_kernel_<algorithm_name>_0<<< bones_grid0, bones_threads0 >>>(<names>, <argument_name>);
69
+ }
@@ -0,0 +1,3 @@
1
+
2
+ // Start the CUDA function
3
+ bones_prekernel_<algorithm_name>_0(<devicenames>, <argument_name>);
@@ -0,0 +1,42 @@
1
+ /* STARTDEF
2
+ void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>);
3
+ ENDDEF */
4
+ // Start of the <algorithm_name> kernel
5
+ __global__ void bones_kernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
6
+ const int bones_global_id = blockIdx.x*blockDim.x + threadIdx.x;
7
+ int bones_local_id = threadIdx.x;
8
+ if (bones_global_id < <in0_dimensions>) {
9
+
10
+ // Calculate the local and global ID(s) based on the thread id
11
+ int bones_local_id_0 = bones_local_id;
12
+ <out0_ids>
13
+
14
+ // Load the input data into local memory
15
+ __shared__ <in0_type> bones_local_memory_<in0_name>[512+<in0_parameter0_sum>];
16
+ bones_local_id_0 = bones_local_id_0-(<in0_parameter0_from>);
17
+ bones_local_memory_<in0_name>[bones_local_id_0] = <in0_name>[bones_global_id_0];
18
+
19
+ // Load the left border into local memory
20
+ if (threadIdx.x < -(<in0_parameter0_from>)) {
21
+ bones_local_memory_<in0_name>[bones_local_id_0+<in0_parameter0_from>] = <in0_name>[bones_global_id_0+<in0_parameter0_from>];
22
+ }
23
+
24
+ // Load the right border into local memory
25
+ if ((threadIdx.x >= 512-<in0_parameter0_to>) || (bones_global_id_0 >= <in0_dimensions>-<in0_parameter0_to>)) {
26
+ bones_local_memory_<in0_name>[bones_local_id_0+<in0_parameter0_to>] = <in0_name>[bones_global_id_0+<in0_parameter0_to>];
27
+ }
28
+
29
+ // Synchronize all the threads in a threadblock
30
+ __syncthreads();
31
+
32
+ // Perform the main computation
33
+ <algorithm_code1>
34
+ }
35
+ }
36
+
37
+ // Function to start the kernel
38
+ extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
39
+ dim3 bones_threads(512);
40
+ dim3 bones_grid(DIV_CEIL(<in0_dimensions>,512));
41
+ bones_kernel_<algorithm_name>_0<<< bones_grid, bones_threads >>>(<names>, <argument_name>);
42
+ }
@@ -0,0 +1,3 @@
1
+
2
+ // Start the CUDA function
3
+ bones_prekernel_<algorithm_name>_0(<devicenames>, <argument_name>);
@@ -0,0 +1,28 @@
1
+ /* STARTDEF
2
+ void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>);
3
+ ENDDEF */
4
+ // Start of the <algorithm_name> kernel
5
+ __global__ void bones_kernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
6
+ const int bones_global_id = blockIdx.x*blockDim.x + threadIdx.x;
7
+ if (bones_global_id < (<parallelism>)) {
8
+
9
+ // Calculate the global ID(s) based on the thread id
10
+ <ids>
11
+
12
+ // Start the computation
13
+ <algorithm_code1>
14
+ }
15
+ }
16
+
17
+ // Function to start the kernel
18
+ extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
19
+ int bones_block_size;
20
+ if (<parallelism> >= 64*512) { bones_block_size = 512;}
21
+ else if (<parallelism> >= 64*256) { bones_block_size = 256;}
22
+ else if (<parallelism> >= 64*128) { bones_block_size = 128;}
23
+ else if (<parallelism> >= 64*64 ) { bones_block_size = 64; }
24
+ else { bones_block_size = 32; }
25
+ dim3 bones_threads(bones_block_size);
26
+ dim3 bones_grid(DIV_CEIL(<parallelism>,bones_block_size));
27
+ bones_kernel_<algorithm_name>_0<<< bones_grid, bones_threads >>>(<names>, <argument_name>);
28
+ }
@@ -0,0 +1,30 @@
1
+ ###################################################################
2
+ # Each line holds one mapping from species to skeleton
3
+ # The ordering is always ['chunk','neighbourhood','element','shared','void']
4
+ # The pattern 'full' is omitted from matching (will thus always match)
5
+ # 'D' denotes any number of ranges (e.g. D|element can be of any dimension)
6
+ # 'N' denotes exactly one range (e.g. N,N|element must be 2D)
7
+ # '+' denotes one or more of these patterns
8
+ ###################################################################
9
+ D|chunk(D)+ -> D|chunk(D)+ :default :00
10
+ D|chunk(D)+ -> D|chunk(D)+ ^ D|element+ :default :00
11
+ D|chunk(D)+ ^ D|element+ -> D|chunk(D)+ :default :00
12
+ D|chunk(D)+ ^ D|element+ -> D|chunk(D)+ ^ D|element+ :default :00
13
+ N,N|chunk(1,N) ^ N,N|chunk(1,N)+ -> D|element+ :2xN-N-chunk-1-N-to-D-element :30
14
+ N,N|chunk(1,N) ^ N,N|chunk(1,N)+ ^ D|element+ -> D|element+ :2xN-N-chunk-1-N-to-D-element :30
15
+ N,N|chunk(1,N)+ -> D|element+ :N-N-chunk-1-N-to-D-element :20
16
+ N,N|chunk(1,N)+ ^ D|element+ -> D|element+ :N-N-chunk-1-N-to-D-element :20
17
+ N,N|chunk(D)+ -> N,N|element+ :default :40
18
+ N,N|chunk(D)+ ^ N,N|element+ -> N,N|element+ :default :40
19
+ D|chunk(D)+ -> D|element+ :default :00
20
+ D|chunk(D)+ ^ D|neighbourhood(D)+ ^ D|element+ -> D|element+ :default :00
21
+ D|chunk(D)+ ^ D|element+ -> D|element+ :default :00
22
+ N|neighbourhood(N)+ -> N|element+ :N-neighbourhood-N-to-N-element :10
23
+ D|neighbourhood(D)+ -> D|element+ :default :00
24
+ D|neighbourhood(D)+ ^ D|element+ -> D|element+ :default :00
25
+ D|element+ -> D|chunk(D)+ :default :00
26
+ D|element+ -> D|element+ :default :00
27
+ D|element -> 1|shared :D-element-to-1-shared :02 03 04 05
28
+ D|element+ -> D|shared+ :default :08
29
+ D|element+ -> D|element+ ^ D|shared+ :default :08
30
+ D|void -> D|element+ :default :00
@@ -0,0 +1,3 @@
1
+
2
+ // Flush any pending output
3
+ fflush(stdout);
@@ -0,0 +1,155 @@
1
+ #include <string.h>
2
+ #include <stdio.h>
3
+ #include <stdlib.h>
4
+ #include <math.h>
5
+ #include <sys/time.h>
6
+ #include <CL/cl.h>
7
+
8
+ #define BONES_MIN(a,b) ((a<b) ? a : b)
9
+ #define BONES_MAX(a,b) ((a>b) ? a : b)
10
+ #define DIV_CEIL(a,b) ((a+b-1)/b)
11
+ #define DIV_FLOOR(a,b) (a/b)
12
+
13
+ // Multiple iterations for kernel measurements
14
+ #define ITERS 1
15
+
16
+ // Load the OpenCL kernel from file
17
+ char * get_source(const char* bones_filename) {
18
+ FILE* bones_fp = fopen(bones_filename,"r");
19
+ fseek(bones_fp,0,SEEK_END);
20
+ long bones_size = ftell(bones_fp);
21
+ rewind(bones_fp);
22
+ char *bones_source = (char *)malloc(sizeof(char)*(bones_size+1));
23
+ int bones_temp = fread(bones_source,1,sizeof(char)*bones_size,bones_fp);
24
+ bones_source[bones_size] = '\0';
25
+ fclose(bones_fp);
26
+ return bones_source;
27
+ }
28
+
29
+ // Print an error if it occurs
30
+ void error_check(cl_int bones_errors) {
31
+ if(bones_errors != CL_SUCCESS) {
32
+ switch (bones_errors) {
33
+ case CL_DEVICE_NOT_FOUND: printf("--- Error: Device not found.\n"); break;
34
+ case CL_DEVICE_NOT_AVAILABLE: printf("--- Error: Device not available\n"); break;
35
+ case CL_COMPILER_NOT_AVAILABLE: printf("--- Error: Compiler not available\n"); break;
36
+ case CL_MEM_OBJECT_ALLOCATION_FAILURE: printf("--- Error: Memory object allocation failure\n"); break;
37
+ case CL_OUT_OF_RESOURCES: printf("--- Error: Out of resources\n"); break;
38
+ case CL_OUT_OF_HOST_MEMORY: printf("--- Error: Out of host memory\n"); break;
39
+ case CL_PROFILING_INFO_NOT_AVAILABLE: printf("--- Error: Profiling information not available\n"); break;
40
+ case CL_MEM_COPY_OVERLAP: printf("--- Error: Memory copy overlap\n"); break;
41
+ case CL_IMAGE_FORMAT_MISMATCH: printf("--- Error: Image format mismatch\n"); break;
42
+ case CL_IMAGE_FORMAT_NOT_SUPPORTED: printf("--- Error: Image format not supported\n"); break;
43
+ case CL_BUILD_PROGRAM_FAILURE: printf("--- Error: Program build failure\n"); break;
44
+ case CL_MAP_FAILURE: printf("--- Error: Map failure\n"); break;
45
+ case CL_INVALID_VALUE: printf("--- Error: Invalid value\n"); break;
46
+ case CL_INVALID_DEVICE_TYPE: printf("--- Error: Invalid device type\n"); break;
47
+ case CL_INVALID_PLATFORM: printf("--- Error: Invalid platform\n"); break;
48
+ case CL_INVALID_DEVICE: printf("--- Error: Invalid device\n"); break;
49
+ case CL_INVALID_CONTEXT: printf("--- Error: Invalid context\n"); break;
50
+ case CL_INVALID_QUEUE_PROPERTIES: printf("--- Error: Invalid queue properties\n"); break;
51
+ case CL_INVALID_COMMAND_QUEUE: printf("--- Error: Invalid command queue\n"); break;
52
+ case CL_INVALID_HOST_PTR: printf("--- Error: Invalid host pointer\n"); break;
53
+ case CL_INVALID_MEM_OBJECT: printf("--- Error: Invalid memory object\n"); break;
54
+ case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: printf("--- Error: Invalid image format descriptor\n"); break;
55
+ case CL_INVALID_IMAGE_SIZE: printf("--- Error: Invalid image size\n"); break;
56
+ case CL_INVALID_SAMPLER: printf("--- Error: Invalid sampler\n"); break;
57
+ case CL_INVALID_BINARY: printf("--- Error: Invalid binary\n"); break;
58
+ case CL_INVALID_BUILD_OPTIONS: printf("--- Error: Invalid build options\n"); break;
59
+ case CL_INVALID_PROGRAM: printf("--- Error: Invalid program\n"); break;
60
+ case CL_INVALID_PROGRAM_EXECUTABLE: printf("--- Error: Invalid program executable\n"); break;
61
+ case CL_INVALID_KERNEL_NAME: printf("--- Error: Invalid kernel name\n"); break;
62
+ case CL_INVALID_KERNEL_DEFINITION: printf("--- Error: Invalid kernel definition\n"); break;
63
+ case CL_INVALID_KERNEL: printf("--- Error: Invalid kernel\n"); break;
64
+ case CL_INVALID_ARG_INDEX: printf("--- Error: Invalid argument index\n"); break;
65
+ case CL_INVALID_ARG_VALUE: printf("--- Error: Invalid argument value\n"); break;
66
+ case CL_INVALID_ARG_SIZE: printf("--- Error: Invalid argument size\n"); break;
67
+ case CL_INVALID_KERNEL_ARGS: printf("--- Error: Invalid kernel arguments\n"); break;
68
+ case CL_INVALID_WORK_DIMENSION: printf("--- Error: Invalid work dimensionsension\n"); break;
69
+ case CL_INVALID_WORK_GROUP_SIZE: printf("--- Error: Invalid work group size\n"); break;
70
+ case CL_INVALID_WORK_ITEM_SIZE: printf("--- Error: Invalid work item size\n"); break;
71
+ case CL_INVALID_GLOBAL_OFFSET: printf("--- Error: Invalid global offset\n"); break;
72
+ case CL_INVALID_EVENT_WAIT_LIST: printf("--- Error: Invalid event wait list\n"); break;
73
+ case CL_INVALID_EVENT: printf("--- Error: Invalid event\n"); break;
74
+ case CL_INVALID_OPERATION: printf("--- Error: Invalid operation\n"); break;
75
+ case CL_INVALID_GL_OBJECT: printf("--- Error: Invalid OpenGL object\n"); break;
76
+ case CL_INVALID_BUFFER_SIZE: printf("--- Error: Invalid buffer size\n"); break;
77
+ case CL_INVALID_MIP_LEVEL: printf("--- Error: Invalid mip-map level\n"); break;
78
+ default: printf("--- Error: Unknown with code %d\n", bones_errors);
79
+ }
80
+ fflush(stdout); exit(0);
81
+ }
82
+ }
83
+
84
+ // Use a global variable for the device ID, context and command queue
85
+ cl_device_id bones_device;
86
+ cl_context bones_context;
87
+ cl_command_queue bones_queue;
88
+
89
+ // Use a global variable to store the name and the binary for the last program
90
+ char bones_last_program[1024];
91
+ cl_program bones_program;
92
+
93
+ // Function to initialize the OpenCL platform (done up front to ensure fair measurements afterwards)
94
+ void bones_initialize_target(void) {
95
+ cl_int bones_errors;
96
+
97
+ // Get OpenCL platform count
98
+ cl_uint bones_num_platforms;
99
+ bones_errors = clGetPlatformIDs(0,NULL,&bones_num_platforms); error_check(bones_errors);
100
+ if (bones_num_platforms == 0) { printf("Error: No OpenCL platforms found.\n"); exit(1); }
101
+
102
+ // Get all OpenCL platform IDs
103
+ cl_platform_id bones_platform_ids[10];
104
+ bones_errors = clGetPlatformIDs(bones_num_platforms,bones_platform_ids,NULL); error_check(bones_errors);
105
+
106
+ // Select the AMD APP platform
107
+ char bones_buffer[1024];
108
+ cl_uint bones_platform;
109
+ for(cl_uint bones_platform_id=0; bones_platform_id<bones_num_platforms; bones_platform_id++) {
110
+ clGetPlatformInfo(bones_platform_ids[bones_platform_id], CL_PLATFORM_NAME, 1024, bones_buffer, NULL);
111
+ if(strstr(bones_buffer,"AMD") != NULL) { bones_platform = bones_platform_id; break; }
112
+ }
113
+
114
+ // Get a GPU device on the platform
115
+ bones_errors = clGetDeviceIDs(bones_platform_ids[bones_platform], CL_DEVICE_TYPE_GPU, 1, &bones_device, NULL); error_check(bones_errors);
116
+ bones_errors = clGetDeviceInfo(bones_device, CL_DEVICE_NAME, sizeof(bones_buffer), bones_buffer, NULL); error_check(bones_errors);
117
+
118
+ // Create a context
119
+ bones_context = clCreateContext(0,1,&bones_device,NULL,NULL,&bones_errors); error_check(bones_errors);
120
+
121
+ // Create a command queue
122
+ bones_queue = clCreateCommandQueue(bones_context,bones_device,CL_QUEUE_PROFILING_ENABLE,&bones_errors); error_check(bones_errors);
123
+
124
+ // Create space on the device
125
+ cl_mem bones_device_data = clCreateBuffer(bones_context,CL_MEM_READ_WRITE,4,NULL,&bones_errors); error_check(bones_errors);
126
+
127
+ // Copy something to the device
128
+ bones_device_data = clCreateBuffer(bones_context,CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,4,bones_buffer,NULL);
129
+
130
+ // Clean-up the OpenCL context
131
+ strcpy(bones_last_program,"");
132
+ clReleaseMemObject(bones_device_data);
133
+ clReleaseContext(bones_context);
134
+ fflush(stdout);
135
+ }
136
+
137
+ // Declaration of the original function
138
+ int bones_main(void);
139
+
140
+ // New main function for initialisation and clean-up
141
+ int main(void) {
142
+
143
+ // Initialisation
144
+ bones_initialize_target();
145
+
146
+ // Original main function
147
+ int bones_return = bones_main();
148
+
149
+ // Clean-up
150
+ clReleaseCommandQueue(bones_queue);
151
+ clReleaseProgram(bones_program);
152
+ clReleaseContext(bones_context);
153
+ return bones_return;
154
+ }
155
+
@@ -0,0 +1,4 @@
1
+ #define BONES_MIN(a,b) ((a<b) ? a : b)
2
+ #define BONES_MAX(a,b) ((a>b) ? a : b)
3
+ #define DIV_CEIL(a,b) ((a+b-1)/b)
4
+ #define DIV_FLOOR(a,b) (a/b)
File without changes
@@ -0,0 +1,4 @@
1
+
2
+ // Perform a copy of <array> from device to host
3
+ clEnqueueReadBuffer(bones_queue,device_<array>,CL_TRUE,(<offset>)*sizeof(<type>),<variable_dimensions>*sizeof(<type>),<array><flatten>+<offset>,0,NULL,NULL);
4
+ clFinish(bones_queue);
@@ -0,0 +1,4 @@
1
+
2
+ // Copy <array> to the device
3
+ device_<array> = clCreateBuffer(bones_context,CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,<variable_dimensions>*sizeof(<type>),<array><flatten>,NULL);
4
+ clFinish(bones_queue);
@@ -0,0 +1,3 @@
1
+
2
+ // Clean up GPU arrays
3
+ clReleaseMemObject(device_<array>);
@@ -0,0 +1,3 @@
1
+
2
+ // Create a device pointer for <array>
3
+ cl_mem device_<array> = clCreateBuffer(bones_context,CL_MEM_READ_WRITE,<variable_dimensions>*sizeof(<type>),NULL,&bones_errors); error_check(bones_errors);
@@ -0,0 +1,24 @@
1
+ fflush(stdout);
2
+ cl_int bones_errors;
3
+ cl_event bones_event;
4
+
5
+ // Only compile if this program is different from the last one
6
+ if (strcmp(bones_last_program,"<algorithm_filename>") != 0) {
7
+ strcpy(bones_last_program,"<algorithm_filename>");
8
+
9
+ // Load and compile the kernel
10
+ char *bones_source = get_source("<algorithm_filename>_device.cl");
11
+ bones_program = clCreateProgramWithSource(bones_context,1,(const char **)&bones_source,NULL,&bones_errors); error_check(bones_errors);
12
+ bones_errors = clBuildProgram(bones_program,0,NULL,"-cl-single-precision-constant",NULL,NULL);
13
+
14
+ // Get and print the compiler log
15
+ char* bones_log;
16
+ size_t bones_log_size;
17
+ clGetProgramBuildInfo(bones_program,bones_device,CL_PROGRAM_BUILD_LOG,0,NULL,&bones_log_size);
18
+ bones_log = (char*)malloc((bones_log_size+1)*sizeof(char));
19
+ clGetProgramBuildInfo(bones_program,bones_device,CL_PROGRAM_BUILD_LOG,bones_log_size,bones_log, NULL);
20
+ bones_log[bones_log_size] = '\0';
21
+ //if (strcmp(bones_log,"\n") != 0 && strcmp(bones_log,"") != 0) { printf("--------- \n--- Compilation log:\n--------- \n%s\n",bones_log); }
22
+ free(bones_log);
23
+ error_check(bones_errors);
24
+ }
@@ -0,0 +1,5 @@
1
+
2
+ // Start the timer for the measurement of the kernel and memory copy execution time
3
+ struct timeval bones_start_time1;
4
+ clFinish(bones_queue);
5
+ gettimeofday(&bones_start_time1, NULL);
@@ -0,0 +1,9 @@
1
+
2
+ // End the timer for the measurement of the kernel and memory copy execution time
3
+ #if (ITERS == 1)
4
+ clFinish(bones_queue);
5
+ struct timeval bones_end_time1;
6
+ gettimeofday(&bones_end_time1, NULL);
7
+ float bones_timer1 = 0.001 * (1000000*(bones_end_time1.tv_sec-bones_start_time1.tv_sec)+bones_end_time1.tv_usec-bones_start_time1.tv_usec);
8
+ printf(">>>\t\t (<algorithm_basename>): Execution time [kernel+memcpy]: %.3lf ms \n", bones_timer1);
9
+ #endif
@@ -0,0 +1,4 @@
1
+
2
+ // Start the measurement of the kernel execution time: drain the queue, then open the ITERS loop
3
+ clFinish(bones_queue);
4
+ for (int bones_iter=0; bones_iter<ITERS; bones_iter++) {
@@ -0,0 +1,11 @@
1
+
2
+ }
3
+
4
+ // Stop the timer for the measurement of the kernel execution time
5
+ clFinish(bones_queue);
6
+ cl_ulong end2, start2;
7
+ bones_errors = clWaitForEvents(1, &bones_event); error_check(bones_errors);
8
+ bones_errors = clGetEventProfilingInfo(bones_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end2, 0); error_check(bones_errors);
9
+ bones_errors = clGetEventProfilingInfo(bones_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start2, 0); error_check(bones_errors);
10
+ float bones_timer2 = 0.000001 * (end2-start2);
11
+ printf(">>>\t\t (<algorithm_basename>): Execution time [kernel ]: %.3lf ms \n", bones_timer2);