bones-compiler 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (203) hide show
  1. data/CHANGELOG +117 -0
  2. data/LICENSE +9 -0
  3. data/README.rdoc +126 -0
  4. data/Rakefile +107 -0
  5. data/VERSION +1 -0
  6. data/bin/bones +20 -0
  7. data/examples/applications/ffos.c +552 -0
  8. data/examples/benchmarks/2mm.c +70 -0
  9. data/examples/benchmarks/3mm.c +81 -0
  10. data/examples/benchmarks/adi.c +81 -0
  11. data/examples/benchmarks/atax.c +65 -0
  12. data/examples/benchmarks/bicg.c +67 -0
  13. data/examples/benchmarks/cholesky.c +64 -0
  14. data/examples/benchmarks/common.h +168 -0
  15. data/examples/benchmarks/correlation.c +97 -0
  16. data/examples/benchmarks/covariance.c +77 -0
  17. data/examples/benchmarks/doitgen.c +63 -0
  18. data/examples/benchmarks/durbin.c +76 -0
  19. data/examples/benchmarks/dynprog.c +67 -0
  20. data/examples/benchmarks/fdtd-2d-apml.c +114 -0
  21. data/examples/benchmarks/fdtd-2d.c +74 -0
  22. data/examples/benchmarks/floyd-warshall.c +50 -0
  23. data/examples/benchmarks/gemm.c +69 -0
  24. data/examples/benchmarks/gemver.c +89 -0
  25. data/examples/benchmarks/gesummv.c +64 -0
  26. data/examples/benchmarks/gramschmidt.c +84 -0
  27. data/examples/benchmarks/jacobi-1d-imper.c +55 -0
  28. data/examples/benchmarks/jacobi-2d-imper.c +61 -0
  29. data/examples/benchmarks/lu.c +57 -0
  30. data/examples/benchmarks/ludcmp.c +91 -0
  31. data/examples/benchmarks/mvt.c +65 -0
  32. data/examples/benchmarks/overview.txt +38 -0
  33. data/examples/benchmarks/reg_detect.c +82 -0
  34. data/examples/benchmarks/saxpy.c +45 -0
  35. data/examples/benchmarks/seidel-2d.c +51 -0
  36. data/examples/benchmarks/symm.c +74 -0
  37. data/examples/benchmarks/syr2k.c +65 -0
  38. data/examples/benchmarks/syrk.c +62 -0
  39. data/examples/benchmarks/trisolv.c +57 -0
  40. data/examples/benchmarks/trmm.c +57 -0
  41. data/examples/chunk/example1.c +54 -0
  42. data/examples/chunk/example2.c +44 -0
  43. data/examples/chunk/example3.c +59 -0
  44. data/examples/chunk/example4.c +55 -0
  45. data/examples/chunk/example5.c +52 -0
  46. data/examples/element/example1.c +46 -0
  47. data/examples/element/example10.c +50 -0
  48. data/examples/element/example11.c +47 -0
  49. data/examples/element/example12.c +56 -0
  50. data/examples/element/example2.c +46 -0
  51. data/examples/element/example3.c +58 -0
  52. data/examples/element/example4.c +49 -0
  53. data/examples/element/example5.c +56 -0
  54. data/examples/element/example6.c +46 -0
  55. data/examples/element/example7.c +54 -0
  56. data/examples/element/example8.c +45 -0
  57. data/examples/element/example9.c +48 -0
  58. data/examples/neighbourhood/example1.c +54 -0
  59. data/examples/neighbourhood/example2.c +55 -0
  60. data/examples/neighbourhood/example3.c +82 -0
  61. data/examples/neighbourhood/example4.c +52 -0
  62. data/examples/shared/example1.c +45 -0
  63. data/examples/shared/example2.c +51 -0
  64. data/examples/shared/example3.c +55 -0
  65. data/examples/shared/example4.c +52 -0
  66. data/examples/shared/example5.c +48 -0
  67. data/lib/bones.rb +266 -0
  68. data/lib/bones/algorithm.rb +541 -0
  69. data/lib/bones/engine.rb +386 -0
  70. data/lib/bones/preprocessor.rb +161 -0
  71. data/lib/bones/species.rb +196 -0
  72. data/lib/bones/structure.rb +94 -0
  73. data/lib/bones/variable.rb +169 -0
  74. data/lib/bones/variablelist.rb +72 -0
  75. data/lib/castaddon.rb +27 -0
  76. data/lib/castaddon/index.rb +40 -0
  77. data/lib/castaddon/node.rb +753 -0
  78. data/lib/castaddon/type.rb +37 -0
  79. data/skeletons/CPU-C/common/epilogue.c +0 -0
  80. data/skeletons/CPU-C/common/globals.c +17 -0
  81. data/skeletons/CPU-C/common/globals_kernel.c +1 -0
  82. data/skeletons/CPU-C/common/header.c +0 -0
  83. data/skeletons/CPU-C/common/mem_copy_D2H.c +0 -0
  84. data/skeletons/CPU-C/common/mem_copy_H2D.c +0 -0
  85. data/skeletons/CPU-C/common/mem_epilogue.c +0 -0
  86. data/skeletons/CPU-C/common/mem_prologue.c +3 -0
  87. data/skeletons/CPU-C/common/prologue.c +0 -0
  88. data/skeletons/CPU-C/common/timer_1_start.c +0 -0
  89. data/skeletons/CPU-C/common/timer_1_stop.c +0 -0
  90. data/skeletons/CPU-C/common/timer_2_start.c +20 -0
  91. data/skeletons/CPU-C/common/timer_2_stop.c +8 -0
  92. data/skeletons/CPU-C/kernel/default.host.c +3 -0
  93. data/skeletons/CPU-C/kernel/default.kernel.c +15 -0
  94. data/skeletons/CPU-C/skeletons.txt +24 -0
  95. data/skeletons/CPU-OPENCL-AMD/common/epilogue.c +6 -0
  96. data/skeletons/CPU-OPENCL-AMD/common/globals.c +155 -0
  97. data/skeletons/CPU-OPENCL-AMD/common/globals_kernel.c +4 -0
  98. data/skeletons/CPU-OPENCL-AMD/common/header.c +0 -0
  99. data/skeletons/CPU-OPENCL-AMD/common/mem_copy_D2H.c +8 -0
  100. data/skeletons/CPU-OPENCL-AMD/common/mem_copy_H2D.c +4 -0
  101. data/skeletons/CPU-OPENCL-AMD/common/mem_epilogue.c +3 -0
  102. data/skeletons/CPU-OPENCL-AMD/common/mem_prologue.c +6 -0
  103. data/skeletons/CPU-OPENCL-AMD/common/prologue.c +24 -0
  104. data/skeletons/CPU-OPENCL-AMD/common/timer_1_start.c +5 -0
  105. data/skeletons/CPU-OPENCL-AMD/common/timer_1_stop.c +9 -0
  106. data/skeletons/CPU-OPENCL-AMD/common/timer_2_start.c +16 -0
  107. data/skeletons/CPU-OPENCL-AMD/common/timer_2_stop.c +11 -0
  108. data/skeletons/CPU-OPENCL-AMD/kernel/D-element-to-1-shared.host.c +67 -0
  109. data/skeletons/CPU-OPENCL-AMD/kernel/D-element-to-1-shared.kernel.cl +72 -0
  110. data/skeletons/CPU-OPENCL-AMD/kernel/default.host.c +14 -0
  111. data/skeletons/CPU-OPENCL-AMD/kernel/default.kernel.cl +13 -0
  112. data/skeletons/CPU-OPENCL-AMD/skeletons.txt +26 -0
  113. data/skeletons/CPU-OPENCL-INTEL/common/epilogue.c +3 -0
  114. data/skeletons/CPU-OPENCL-INTEL/common/globals.c +154 -0
  115. data/skeletons/CPU-OPENCL-INTEL/common/globals_kernel.c +4 -0
  116. data/skeletons/CPU-OPENCL-INTEL/common/header.c +31 -0
  117. data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_D2H.c +5 -0
  118. data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_H2D.c +3 -0
  119. data/skeletons/CPU-OPENCL-INTEL/common/mem_epilogue.c +3 -0
  120. data/skeletons/CPU-OPENCL-INTEL/common/mem_prologue.c +4 -0
  121. data/skeletons/CPU-OPENCL-INTEL/common/prologue.c +24 -0
  122. data/skeletons/CPU-OPENCL-INTEL/common/timer_1_start.c +5 -0
  123. data/skeletons/CPU-OPENCL-INTEL/common/timer_1_stop.c +9 -0
  124. data/skeletons/CPU-OPENCL-INTEL/common/timer_2_start.c +16 -0
  125. data/skeletons/CPU-OPENCL-INTEL/common/timer_2_stop.c +11 -0
  126. data/skeletons/CPU-OPENCL-INTEL/kernel/D-element-to-1-shared.host.c +67 -0
  127. data/skeletons/CPU-OPENCL-INTEL/kernel/D-element-to-1-shared.kernel.cl +72 -0
  128. data/skeletons/CPU-OPENCL-INTEL/kernel/default.host.c +14 -0
  129. data/skeletons/CPU-OPENCL-INTEL/kernel/default.kernel.cl +13 -0
  130. data/skeletons/CPU-OPENCL-INTEL/skeletons.txt +26 -0
  131. data/skeletons/CPU-OPENMP/common/epilogue.c +0 -0
  132. data/skeletons/CPU-OPENMP/common/globals.c +37 -0
  133. data/skeletons/CPU-OPENMP/common/globals_kernel.c +6 -0
  134. data/skeletons/CPU-OPENMP/common/header.c +0 -0
  135. data/skeletons/CPU-OPENMP/common/mem_copy_D2H.c +0 -0
  136. data/skeletons/CPU-OPENMP/common/mem_copy_H2D.c +0 -0
  137. data/skeletons/CPU-OPENMP/common/mem_epilogue.c +0 -0
  138. data/skeletons/CPU-OPENMP/common/mem_prologue.c +3 -0
  139. data/skeletons/CPU-OPENMP/common/prologue.c +0 -0
  140. data/skeletons/CPU-OPENMP/common/timer_1_start.c +12 -0
  141. data/skeletons/CPU-OPENMP/common/timer_1_stop.c +0 -0
  142. data/skeletons/CPU-OPENMP/common/timer_2_start.c +18 -0
  143. data/skeletons/CPU-OPENMP/common/timer_2_stop.c +8 -0
  144. data/skeletons/CPU-OPENMP/kernel/D-element-to-1-shared.host.c +27 -0
  145. data/skeletons/CPU-OPENMP/kernel/D-element-to-1-shared.kernel.c +46 -0
  146. data/skeletons/CPU-OPENMP/kernel/default.host.c +11 -0
  147. data/skeletons/CPU-OPENMP/kernel/default.kernel.c +18 -0
  148. data/skeletons/CPU-OPENMP/skeletons.txt +26 -0
  149. data/skeletons/GPU-CUDA/common/epilogue.c +0 -0
  150. data/skeletons/GPU-CUDA/common/globals.c +31 -0
  151. data/skeletons/GPU-CUDA/common/globals_kernel.c +4 -0
  152. data/skeletons/GPU-CUDA/common/header.c +0 -0
  153. data/skeletons/GPU-CUDA/common/mem_copy_D2H.c +3 -0
  154. data/skeletons/GPU-CUDA/common/mem_copy_H2D.c +3 -0
  155. data/skeletons/GPU-CUDA/common/mem_epilogue.c +3 -0
  156. data/skeletons/GPU-CUDA/common/mem_prologue.c +5 -0
  157. data/skeletons/GPU-CUDA/common/prologue.c +6 -0
  158. data/skeletons/GPU-CUDA/common/timer_1_start.c +6 -0
  159. data/skeletons/GPU-CUDA/common/timer_1_stop.c +10 -0
  160. data/skeletons/GPU-CUDA/common/timer_2_start.c +6 -0
  161. data/skeletons/GPU-CUDA/common/timer_2_stop.c +10 -0
  162. data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.host.c +3 -0
  163. data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.kernel.cu +105 -0
  164. data/skeletons/GPU-CUDA/kernel/D-element-to-1-shared.host.c +3 -0
  165. data/skeletons/GPU-CUDA/kernel/D-element-to-1-shared.kernel.cu +119 -0
  166. data/skeletons/GPU-CUDA/kernel/D-element-to-N-shared.host.c +3 -0
  167. data/skeletons/GPU-CUDA/kernel/D-element-to-N-shared.kernel.cu +166 -0
  168. data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.host.c +3 -0
  169. data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.kernel.cu +69 -0
  170. data/skeletons/GPU-CUDA/kernel/N-neighbourhood-N-to-N-element.host.c +3 -0
  171. data/skeletons/GPU-CUDA/kernel/N-neighbourhood-N-to-N-element.kernel.cu +42 -0
  172. data/skeletons/GPU-CUDA/kernel/default.host.c +3 -0
  173. data/skeletons/GPU-CUDA/kernel/default.kernel.cu +28 -0
  174. data/skeletons/GPU-CUDA/skeletons.txt +30 -0
  175. data/skeletons/GPU-OPENCL-AMD/common/epilogue.c +3 -0
  176. data/skeletons/GPU-OPENCL-AMD/common/globals.c +155 -0
  177. data/skeletons/GPU-OPENCL-AMD/common/globals_kernel.c +4 -0
  178. data/skeletons/GPU-OPENCL-AMD/common/header.c +0 -0
  179. data/skeletons/GPU-OPENCL-AMD/common/mem_copy_D2H.c +4 -0
  180. data/skeletons/GPU-OPENCL-AMD/common/mem_copy_H2D.c +4 -0
  181. data/skeletons/GPU-OPENCL-AMD/common/mem_epilogue.c +3 -0
  182. data/skeletons/GPU-OPENCL-AMD/common/mem_prologue.c +3 -0
  183. data/skeletons/GPU-OPENCL-AMD/common/prologue.c +24 -0
  184. data/skeletons/GPU-OPENCL-AMD/common/timer_1_start.c +5 -0
  185. data/skeletons/GPU-OPENCL-AMD/common/timer_1_stop.c +9 -0
  186. data/skeletons/GPU-OPENCL-AMD/common/timer_2_start.c +4 -0
  187. data/skeletons/GPU-OPENCL-AMD/common/timer_2_stop.c +11 -0
  188. data/skeletons/GPU-OPENCL-AMD/kernel/D-element-to-1-shared.host.c +67 -0
  189. data/skeletons/GPU-OPENCL-AMD/kernel/D-element-to-1-shared.kernel.cl +72 -0
  190. data/skeletons/GPU-OPENCL-AMD/kernel/default.host.c +14 -0
  191. data/skeletons/GPU-OPENCL-AMD/kernel/default.kernel.cl +13 -0
  192. data/skeletons/GPU-OPENCL-AMD/skeletons.txt +26 -0
  193. data/skeletons/verification/header.c +2 -0
  194. data/skeletons/verification/timer_start.c +4 -0
  195. data/skeletons/verification/timer_stop.c +6 -0
  196. data/skeletons/verification/verify_results.c +23 -0
  197. data/test/bones/test_algorithm.rb +40 -0
  198. data/test/bones/test_common.rb +54 -0
  199. data/test/bones/test_preprocessor.rb +46 -0
  200. data/test/bones/test_species.rb +21 -0
  201. data/test/bones/test_variable.rb +84 -0
  202. data/test/test_helper.rb +106 -0
  203. metadata +303 -0
@@ -0,0 +1,72 @@
1
+
2
+ // Start of the <algorithm_name> kernel (main, not unrolled kernel)
3
+ __kernel void bones_kernel_<algorithm_name>_0(int bones_input_size, __global <in0_type><in0_devicepointer> <in0_name>, __global <out0_type><out0_devicepointer> <out0_name>, <argument_definition>) {
4
+ const int bones_threadblock_work = DIV_CEIL(bones_input_size,get_num_groups(0));
5
+ const int bones_parallel_work = BONES_MIN(get_local_size(0),bones_threadblock_work);
6
+ const int bones_sequential_work = DIV_CEIL(bones_threadblock_work,bones_parallel_work);
7
+ const int bones_local_id = get_local_id(0);
8
+ const int bones_global_id = get_global_id(0);
9
+ <ids>
10
+ int bones_iter_id = <in0_flatindex>;
11
+
12
+ // Load data into thread private memory and perform the first computation(s) sequentially
13
+ <in0_type> bones_temporary = <in0_name>[bones_iter_id];
14
+ <in0_type> bones_private_memory = <algorithm_code3>;
15
+ for(int c=1; c<bones_sequential_work; c++) {
16
+ bones_iter_id = bones_iter_id + bones_parallel_work*get_num_groups(0)<factors>;
17
+ if (bones_iter_id <= <in0_to>) {
18
+ bones_temporary = <in0_name>[bones_iter_id];
19
+ bones_private_memory = <algorithm_code1>;
20
+ }
21
+ }
22
+ // Initialize the local memory
23
+ volatile __local <in0_type> bones_local_memory[256];
24
+ bones_local_memory[bones_local_id] = bones_private_memory;
25
+ barrier(CLK_LOCAL_MEM_FENCE);
26
+
27
+ // Perform the remainder of the computations in parallel using a parallel reduction tree
28
+ int bones_offset_id;
29
+ for (int c=256; c>=2; c=c>>1) {
30
+ if ((2*bones_parallel_work > c) && (get_local_id(0) < c/2)) {
31
+ bones_offset_id = get_local_id(0)+c/2;
32
+ if (bones_offset_id < bones_parallel_work) {
33
+ bones_local_memory[bones_local_id] = <algorithm_code2>;
34
+ }
35
+ }
36
+ barrier(CLK_LOCAL_MEM_FENCE);
37
+ }
38
+
39
+ // Write the final result back to the global memory
40
+ if (get_local_id(0) == 0) { <out0_name>[get_group_id(0)] = bones_local_memory[0]; }
41
+ }
42
+
43
+ // Start of the <algorithm_name> kernel (secondary, not unrolled kernel)
44
+ __kernel void bones_kernel_<algorithm_name>_1(__global <in0_type><in0_devicepointer> <in0_name>, __global <out0_type><out0_devicepointer> <out0_name>) {
45
+ const int bones_local_id = get_local_id(0);
46
+ const int bones_global_id = get_local_id(0); // NOTE(review): initialised from get_local_id(0), not get_global_id(0); the two only coincide if this kernel runs as a single work-group - confirm against the host launch code
47
+
48
+ // Initialize the local memory
49
+ volatile __local <in0_type> bones_local_memory[128];
50
+ bones_local_memory[bones_local_id] = <in0_name>[bones_global_id];
51
+ barrier(CLK_LOCAL_MEM_FENCE);
52
+
53
+ // Perform reduction using a parallel reduction tree
54
+ int bones_offset_id;
55
+ for (int c=128; c>=2; c=c>>1) {
56
+ if (get_local_id(0) < c/2) {
57
+ bones_offset_id = get_local_id(0)+c/2;
58
+ bones_local_memory[bones_local_id] = <algorithm_code2>;
59
+ }
60
+ barrier(CLK_LOCAL_MEM_FENCE);
61
+ }
62
+
63
+ // Write the final result back to the global memory
64
+ if (get_local_id(0) == 0) { <out0_name>[0] = bones_local_memory[0]; }
65
+ }
66
+
67
+ // Start of the <algorithm_name> kernel (final, initial value kernel)
68
+ __kernel void bones_kernel_<algorithm_name>_2(__global <out0_type><out0_devicepointer> bones_initial_value, __global <out0_type><out0_devicepointer> <out0_name>) {
69
+ <out0_type> bones_private_memory = <out0_name>[0];
70
+ <out0_type> bones_temporary = bones_initial_value[0];
71
+ <out0_name>[0] = <algorithm_code4>;
72
+ }
@@ -0,0 +1,14 @@
1
+
2
+ // Create the kernel
3
+ cl_kernel bones_kernel_<algorithm_name>_0 = clCreateKernel(bones_program, "bones_kernel_<algorithm_name>_0", &bones_errors); error_check(bones_errors);
4
+
5
+ // Set all the arguments to the kernel function
6
+ int bones_num_args = 0;
7
+ <kernel_argument_list>
8
+ // Start the kernel
9
+ size_t bones_global_worksize[] = {DIV_CEIL(<parallelism>,8)*8};
10
+ bones_errors = clEnqueueNDRangeKernel(bones_queue,bones_kernel_<algorithm_name>_0,1,NULL,bones_global_worksize,NULL,0,NULL,&bones_event); error_check(bones_errors);
11
+
12
+ // Synchronize and clean-up the kernel
13
+ clFinish(bones_queue);
14
+ clReleaseKernel(bones_kernel_<algorithm_name>_0);
@@ -0,0 +1,13 @@
1
+
2
+ // Start of the <algorithm_name> kernel
3
+ __kernel void bones_kernel_<algorithm_name>_0(<devicedefinitionsopencl>, <argument_definition>) {
4
+ const int bones_global_id = get_global_id(0);
5
+ if (bones_global_id < (<parallelism>)) {
6
+
7
+ // Calculate the global ID(s) based on the thread id
8
+ <ids>
9
+
10
+ // Start the computation
11
+ <algorithm_code1>
12
+ }
13
+ }
@@ -0,0 +1,26 @@
1
+ ###################################################################
2
+ # Each line holds one mapping from species to skeleton
3
+ # The ordering is always ['chunk','neighbourhood','element','shared','void']
4
+ # The pattern 'full' is omitted from matching (will thus always match)
5
+ # 'D' denotes any number of ranges (e.g. D|element can have any number of dimensions)
6
+ # 'N' denotes a single range (e.g. N,N|element must be 2D)
7
+ # '+' denotes one or more of these patterns
8
+ ###################################################################
9
+ D|chunk(D)+ -> D|chunk(D)+ :default :00
10
+ D|chunk(D)+ -> D|chunk(D)+ ^ D|element+ :default :00
11
+ D|chunk(D)+ ^ D|element+ -> D|chunk(D)+ :default :00
12
+ D|chunk(D)+ ^ D|element+ -> D|chunk(D)+ ^ D|element+ :default :00
13
+ D|chunk(D)+ -> D|element+ :default :00
14
+ D|chunk(D)+ ^ D|neighbourhood(D)+ ^ D|element+ -> D|element+ :default :00
15
+ D|chunk(D)+ ^ D|element+ -> D|element+ :default :00
16
+ N|neighbourhood(N)+ -> N|element+ :default :00
17
+ D|neighbourhood(D)+ -> D|element+ :default :00
18
+ D|neighbourhood(D)+ ^ D|element+ -> D|element+ :default :00
19
+ D|element+ -> D|chunk(D)+ :default :00
20
+ D|element+ -> D|element+ :default :00
21
+ D|element -> 1|shared :D-element-to-1-shared :02 03 04 05
22
+ D|void -> D|element+ :default :00
23
+
24
+ #D|element+ -> D|shared+ :default :09
25
+ #D|element+ -> D|element+ ^ D|shared+ :default :09
26
+
File without changes
@@ -0,0 +1,37 @@
1
+ #include <omp.h>
2
+ #include <stdlib.h>
3
+
4
+ #define BONES_MIN(a,b) (((a)<(b)) ? (a) : (b)) // smaller of a and b; arguments are parenthesised but are still evaluated more than once
5
+ #define BONES_MAX(a,b) (((a)>(b)) ? (a) : (b)) // larger of a and b; same multiple-evaluation caveat
6
+ #define DIV_CEIL(a,b) (((a)+(b)-1)/(b)) // division rounded up, for non-negative integer a and positive integer b
7
+ #define DIV_FLOOR(a,b) ((a)/(b)) // plain division (truncating for integer operands)
8
+
9
+ // Multiple iterations for kernel measurements
10
+ #define ITERS 1
11
+
12
+ // Function to initialize the CPU platform (for fair measurements)
13
+ void bones_initialize_target(void) {
14
+ int bones_thread_count = omp_get_num_procs();
15
+ omp_set_num_threads(bones_thread_count);
16
+ #pragma omp parallel
17
+ {
18
+ int bones_thread_id = omp_get_thread_num();
19
+ }
20
+ }
21
+
22
+ // Declaration of the original function
23
+ int bones_main(void);
24
+
25
+ // New main function for initialisation and clean-up
26
+ int main(void) {
27
+
28
+ // Initialisation
29
+ bones_initialize_target();
30
+
31
+ // Original main function
32
+ int bones_return = bones_main();
33
+
34
+ // Clean-up
35
+ return bones_return;
36
+ }
37
+
@@ -0,0 +1,6 @@
1
+ #define BONES_MIN(a,b) (((a)<(b)) ? (a) : (b)) // smaller of a and b; arguments are parenthesised but are still evaluated more than once
2
+ #define BONES_MAX(a,b) (((a)>(b)) ? (a) : (b)) // larger of a and b; same multiple-evaluation caveat
3
+ #define DIV_CEIL(a,b) (((a)+(b)-1)/(b)) // division rounded up, for non-negative integer a and positive integer b
4
+ #define DIV_FLOOR(a,b) ((a)/(b)) // plain division (truncating for integer operands)
5
+
6
+ #include <math.h>
File without changes
File without changes
File without changes
File without changes
@@ -0,0 +1,3 @@
1
+
2
+ // Create a pointer to <array> on the device
3
+ <type>* device_<array> = <array><flatten>;
File without changes
@@ -0,0 +1,12 @@
1
+
2
+ // Flush the CPU cache (for measurement purposes only)
3
+ const int bones_flush_size = 4*1024*1024; // 4 MiB: the flush buffer below is allocated as chars, one byte per element
4
+ int bones_flush_i;
5
+ int bones_flush_j;
6
+ char *bones_flush_c = (char *)malloc(bones_flush_size);
7
+ for (bones_flush_i=0; bones_flush_i<10; bones_flush_i++) {
8
+ for (bones_flush_j=0; bones_flush_j<bones_flush_size; bones_flush_j++) {
9
+ bones_flush_c[bones_flush_j] = bones_flush_i*bones_flush_j;
10
+ }
11
+ }
12
+ free(bones_flush_c);
File without changes
@@ -0,0 +1,18 @@
1
+ // Initialize the timer
2
+ float bones_timer2 = 0;
3
+ struct timeval bones_start_time2;
4
+ struct timeval bones_end_time2;
5
+ for (int bones_iter=0; bones_iter<ITERS; bones_iter++) {
6
+
7
+ // Flush the CPU cache (for measurement purposes only)
8
+ const int bones_flush_size = 4*1024*1024; // 4 MiB: the flush buffer below is allocated as chars, one byte per element
9
+ char *bones_flush_c = (char *)malloc(bones_flush_size);
10
+ for (int i=0; i<10; i++) {
11
+ for (int j=0; j<bones_flush_size; j++) {
12
+ bones_flush_c[j] = i*j;
13
+ }
14
+ }
15
+ free(bones_flush_c);
16
+
17
+ // Start the timer for the measurement of the kernel execution time
18
+ gettimeofday(&bones_start_time2, NULL);
@@ -0,0 +1,8 @@
1
+
2
+ // Stop the timer for the measurement of the kernel execution time
3
+ gettimeofday(&bones_end_time2, NULL);
4
+ bones_timer2 += 0.001 * (1000000*(bones_end_time2.tv_sec-bones_start_time2.tv_sec)+bones_end_time2.tv_usec-bones_start_time2.tv_usec);
5
+ }
6
+
7
+ // Print the measurement data
8
+ printf(">>>\t\t (<algorithm_basename>): Execution time [kernel ]: %.3lf ms \n", bones_timer2/((float)ITERS));
@@ -0,0 +1,27 @@
1
+
2
+ if (<in0_dimensions> > 0) {
3
+
4
+ // Store the initial value
5
+ <out0_type> bones_initial_value = <out0_name>[0];
6
+
7
+ // Create a temporary array to store intermediate data
8
+ int bones_thread_count = BONES_MIN(omp_get_num_procs(),<in0_dimensions>);
9
+ <out0_type>* bones_temporary = (<out0_type>*)malloc(bones_thread_count*sizeof(<out0_type>));
10
+
11
+ // Run multiple OpenMP threads
12
+ omp_set_num_threads(bones_thread_count);
13
+ #pragma omp parallel
14
+ {
15
+ int bones_thread_id = omp_get_thread_num();
16
+
17
+ // Perform the major part of the computation in parallel
18
+ bones_kernel_<algorithm_name>_0(bones_thread_id, bones_thread_count, <in0_dimensions>, <in_devicenames>, bones_temporary, <argument_name>);
19
+ }
20
+
21
+ // Compute the second part of the algorithm with only one thread
22
+ bones_kernel_<algorithm_name>_1(bones_thread_count, bones_temporary, <out_devicenames>, <argument_name>);
23
+ free(bones_temporary);
24
+
25
+ // Perform the last computation (only needed if there is an initial value)
26
+ bones_kernel_<algorithm_name>_2(bones_initial_value,<out0_name>,<argument_name>);
27
+ }
@@ -0,0 +1,46 @@
1
+ /* STARTDEF
2
+ void bones_kernel_<algorithm_name>_0(int bones_thread_id, int bones_thread_count, int bones_size, <devicedefinitions>, <argument_definition>);
3
+ void bones_kernel_<algorithm_name>_1(int bones_size, <devicedefinitions>, <argument_definition>);
4
+ void bones_kernel_<algorithm_name>_2(<out0_type> bones_initial_value, <out0_type><out0_devicepointer> <out0_name>, <argument_definition>);
5
+ ENDDEF */
6
+ // Start of the <algorithm_name> kernel (main part)
7
+ void bones_kernel_<algorithm_name>_0(int bones_thread_id, int bones_thread_count, int bones_size, <devicedefinitions>, <argument_definition>) {
8
+ const int bones_work = DIV_CEIL(bones_size,bones_thread_count);
9
+ const int bones_global_id = bones_thread_id;
10
+ <ids>
11
+ int bones_iter_id = <in0_flatindex>;
12
+
13
+ // Use a thread private memory to perform the per-thread computation(s)
14
+ <in0_type> bones_temporary = <in0_name>[bones_iter_id];
15
+ <in0_type> bones_private_memory = <algorithm_code2>;
16
+ for(int c=1; c<bones_work; c++) {
17
+ bones_iter_id = bones_iter_id + bones_thread_count<factors>;
18
+ if (bones_iter_id <= <in0_to>) {
19
+ bones_temporary = <in0_name>[bones_iter_id];
20
+ bones_private_memory = <algorithm_code1>;
21
+ }
22
+ }
23
+
24
+ // Store the result
25
+ <out0_name>[bones_thread_id] = bones_private_memory;
26
+ }
27
+
28
+ // Start of the <algorithm_name> kernel (secondary part)
29
+ void bones_kernel_<algorithm_name>_1(int bones_size, <devicedefinitions>, <argument_definition>) {
30
+
31
+ // Use a private memory to perform the sequential computation(s)
32
+ <in0_type> bones_private_memory = <in0_name>[0];
33
+ for(int bones_iter_id=1; bones_iter_id<bones_size; bones_iter_id++) {
34
+ bones_private_memory = bones_private_memory + <in0_name>[bones_iter_id]; // NOTE(review): the per-thread partial results are combined with a hard-coded '+', whereas the other kernels in this skeleton use <algorithm_codeN>; confirm that only sum reductions are mapped here
35
+ }
36
+
37
+ // Store the result
38
+ <out0_name>[0] = bones_private_memory;
39
+ }
40
+
41
+ // Start of the <algorithm_name> kernel (final, initial value kernel)
42
+ void bones_kernel_<algorithm_name>_2(<out0_type> bones_initial_value, <out0_type><out0_devicepointer> <out0_name>, <argument_definition>) {
43
+ <out0_type> bones_private_memory = <out0_name>[0];
44
+ <out0_type> bones_temporary = bones_initial_value;
45
+ <out0_name>[0] = <algorithm_code3>;
46
+ }
@@ -0,0 +1,11 @@
1
+
2
+ // Run multiple OpenMP threads
3
+ int bones_thread_count = omp_get_num_procs();
4
+ omp_set_num_threads(bones_thread_count);
5
+ #pragma omp parallel
6
+ {
7
+ int bones_thread_id = omp_get_thread_num();
8
+
9
+ // Start the kernel
10
+ bones_kernel_<algorithm_name>_0(bones_thread_id, bones_thread_count, <devicenames>, <argument_name>);
11
+ }
@@ -0,0 +1,18 @@
1
+ /* STARTDEF
2
+ void bones_kernel_<algorithm_name>_0(int bones_thread_id, int bones_thread_count, <devicedefinitions>, <argument_definition>);
3
+ ENDDEF */
4
+ // Start of the <algorithm_name> kernel
5
+ void bones_kernel_<algorithm_name>_0(int bones_thread_id, int bones_thread_count, <devicedefinitions>, <argument_definition>) {
6
+ int bones_workload = DIV_CEIL(<parallelism>,bones_thread_count);
7
+ int bones_start = bones_thread_id*bones_workload;
8
+ int bones_end = BONES_MIN((bones_thread_id+1)*bones_workload,<parallelism>);
9
+ for(int bones_global_id=bones_start; bones_global_id<bones_end; bones_global_id++) {
10
+
11
+ // Calculate the global ID(s) based on the thread id
12
+ <ids>
13
+
14
+ // Perform the main computation
15
+ <algorithm_code1>
16
+ }
17
+ }
18
+
@@ -0,0 +1,26 @@
1
+ ###################################################################
2
+ # Each line holds one mapping from species to skeleton
3
+ # The ordering is always ['chunk','neighbourhood','element','shared','void']
4
+ # The pattern 'full' is omitted from matching (will thus always match)
5
+ # 'D' denotes any number of ranges (e.g. D|element can have any number of dimensions)
6
+ # 'N' denotes a single range (e.g. N,N|element must be 2D)
7
+ # '+' denotes one or more of these patterns
8
+ ###################################################################
9
+ D|chunk(D)+ -> D|chunk(D)+ :default :00
10
+ D|chunk(D)+ -> D|chunk(D)+ ^ D|element+ :default :00
11
+ D|chunk(D)+ ^ D|element+ -> D|chunk(D)+ :default :00
12
+ D|chunk(D)+ ^ D|element+ -> D|chunk(D)+ ^ D|element+ :default :00
13
+ D|chunk(D)+ -> D|element+ :default :00
14
+ D|chunk(D)+ ^ D|neighbourhood(D)+ ^ D|element+ -> D|element+ :default :00
15
+ D|chunk(D)+ ^ D|element+ -> D|element+ :default :00
16
+ N|neighbourhood(N)+ -> N|element+ :default :00
17
+ D|neighbourhood(D)+ -> D|element+ :default :00
18
+ D|neighbourhood(D)+ ^ D|element+ -> D|element+ :default :00
19
+ D|element+ -> D|chunk(D)+ :default :00
20
+ D|element+ -> D|element+ :default :00
21
+ D|element -> 1|shared :D-element-to-1-shared :02 04 05
22
+ D|void -> D|element+ :default :00
23
+
24
+ #D|element+ -> D|shared+ :default :00
25
+ #D|element+ -> D|element+ ^ D|shared+ :default :00
26
+
File without changes
@@ -0,0 +1,31 @@
1
+ #include <stdio.h>
2
+ #include <cuda_runtime.h>
3
+
4
+ #define BONES_MIN(a,b) (((a)<(b)) ? (a) : (b)) // smaller of a and b; arguments are parenthesised but are still evaluated more than once
5
+ #define BONES_MAX(a,b) (((a)>(b)) ? (a) : (b)) // larger of a and b; same multiple-evaluation caveat
6
+ #define DIV_CEIL(a,b) (((a)+(b)-1)/(b)) // division rounded up, for non-negative integer a and positive integer b
7
+ #define DIV_FLOOR(a,b) ((a)/(b)) // plain division (truncating for integer operands)
8
+
9
+ // Function to initialize the GPU (for fair measurements)
10
+ void bones_initialize_target(void) {
11
+ int* bones_temporary = 0;
12
+ cudaMalloc((void**)&bones_temporary, sizeof(int));
13
+ cudaFree(bones_temporary);
14
+ }
15
+
16
+ // Declaration of the original function
17
+ int bones_main(void);
18
+
19
+ // New main function for initialisation and clean-up
20
+ int main(void) {
21
+
22
+ // Initialisation of the target
23
+ bones_initialize_target();
24
+
25
+ // Original main function
26
+ int bones_return = bones_main();
27
+
28
+ // Clean-up
29
+ return bones_return;
30
+ }
31
+
@@ -0,0 +1,4 @@
1
+ #define BONES_MIN(a,b) (((a)<(b)) ? (a) : (b)) // smaller of a and b; arguments are parenthesised but are still evaluated more than once
2
+ #define BONES_MAX(a,b) (((a)>(b)) ? (a) : (b)) // larger of a and b; same multiple-evaluation caveat
3
+ #define DIV_CEIL(a,b) (((a)+(b)-1)/(b)) // division rounded up, for non-negative integer a and positive integer b
4
+ #define DIV_FLOOR(a,b) ((a)/(b)) // plain division (truncating for integer operands)
File without changes
@@ -0,0 +1,3 @@
1
+
2
+ // Copy <array> from device to host
3
+ cudaMemcpy(<array><flatten>+<offset>, device_<array>+<offset>, <variable_dimensions>*sizeof(<type>), cudaMemcpyDeviceToHost);
@@ -0,0 +1,3 @@
1
+
2
+ // Copy <array> to the device
3
+ cudaMemcpy(device_<array>, <array><flatten>, <variable_dimensions>*sizeof(<type>), cudaMemcpyHostToDevice);
@@ -0,0 +1,3 @@
1
+
2
+ // Clean up GPU arrays
3
+ cudaFree(device_<array>);
@@ -0,0 +1,5 @@
1
+
2
+ // Create space for <array> on the device
3
+ <type>* device_<array> = 0;
4
+ cudaMalloc((void**)&device_<array>, <variable_dimensions>*sizeof(<type>));
5
+ cudaMemset((void*)device_<array>, 0, <variable_dimensions>*sizeof(<type>));