bones-compiler 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (203) hide show
  1. data/CHANGELOG +117 -0
  2. data/LICENSE +9 -0
  3. data/README.rdoc +126 -0
  4. data/Rakefile +107 -0
  5. data/VERSION +1 -0
  6. data/bin/bones +20 -0
  7. data/examples/applications/ffos.c +552 -0
  8. data/examples/benchmarks/2mm.c +70 -0
  9. data/examples/benchmarks/3mm.c +81 -0
  10. data/examples/benchmarks/adi.c +81 -0
  11. data/examples/benchmarks/atax.c +65 -0
  12. data/examples/benchmarks/bicg.c +67 -0
  13. data/examples/benchmarks/cholesky.c +64 -0
  14. data/examples/benchmarks/common.h +168 -0
  15. data/examples/benchmarks/correlation.c +97 -0
  16. data/examples/benchmarks/covariance.c +77 -0
  17. data/examples/benchmarks/doitgen.c +63 -0
  18. data/examples/benchmarks/durbin.c +76 -0
  19. data/examples/benchmarks/dynprog.c +67 -0
  20. data/examples/benchmarks/fdtd-2d-apml.c +114 -0
  21. data/examples/benchmarks/fdtd-2d.c +74 -0
  22. data/examples/benchmarks/floyd-warshall.c +50 -0
  23. data/examples/benchmarks/gemm.c +69 -0
  24. data/examples/benchmarks/gemver.c +89 -0
  25. data/examples/benchmarks/gesummv.c +64 -0
  26. data/examples/benchmarks/gramschmidt.c +84 -0
  27. data/examples/benchmarks/jacobi-1d-imper.c +55 -0
  28. data/examples/benchmarks/jacobi-2d-imper.c +61 -0
  29. data/examples/benchmarks/lu.c +57 -0
  30. data/examples/benchmarks/ludcmp.c +91 -0
  31. data/examples/benchmarks/mvt.c +65 -0
  32. data/examples/benchmarks/overview.txt +38 -0
  33. data/examples/benchmarks/reg_detect.c +82 -0
  34. data/examples/benchmarks/saxpy.c +45 -0
  35. data/examples/benchmarks/seidel-2d.c +51 -0
  36. data/examples/benchmarks/symm.c +74 -0
  37. data/examples/benchmarks/syr2k.c +65 -0
  38. data/examples/benchmarks/syrk.c +62 -0
  39. data/examples/benchmarks/trisolv.c +57 -0
  40. data/examples/benchmarks/trmm.c +57 -0
  41. data/examples/chunk/example1.c +54 -0
  42. data/examples/chunk/example2.c +44 -0
  43. data/examples/chunk/example3.c +59 -0
  44. data/examples/chunk/example4.c +55 -0
  45. data/examples/chunk/example5.c +52 -0
  46. data/examples/element/example1.c +46 -0
  47. data/examples/element/example10.c +50 -0
  48. data/examples/element/example11.c +47 -0
  49. data/examples/element/example12.c +56 -0
  50. data/examples/element/example2.c +46 -0
  51. data/examples/element/example3.c +58 -0
  52. data/examples/element/example4.c +49 -0
  53. data/examples/element/example5.c +56 -0
  54. data/examples/element/example6.c +46 -0
  55. data/examples/element/example7.c +54 -0
  56. data/examples/element/example8.c +45 -0
  57. data/examples/element/example9.c +48 -0
  58. data/examples/neighbourhood/example1.c +54 -0
  59. data/examples/neighbourhood/example2.c +55 -0
  60. data/examples/neighbourhood/example3.c +82 -0
  61. data/examples/neighbourhood/example4.c +52 -0
  62. data/examples/shared/example1.c +45 -0
  63. data/examples/shared/example2.c +51 -0
  64. data/examples/shared/example3.c +55 -0
  65. data/examples/shared/example4.c +52 -0
  66. data/examples/shared/example5.c +48 -0
  67. data/lib/bones.rb +266 -0
  68. data/lib/bones/algorithm.rb +541 -0
  69. data/lib/bones/engine.rb +386 -0
  70. data/lib/bones/preprocessor.rb +161 -0
  71. data/lib/bones/species.rb +196 -0
  72. data/lib/bones/structure.rb +94 -0
  73. data/lib/bones/variable.rb +169 -0
  74. data/lib/bones/variablelist.rb +72 -0
  75. data/lib/castaddon.rb +27 -0
  76. data/lib/castaddon/index.rb +40 -0
  77. data/lib/castaddon/node.rb +753 -0
  78. data/lib/castaddon/type.rb +37 -0
  79. data/skeletons/CPU-C/common/epilogue.c +0 -0
  80. data/skeletons/CPU-C/common/globals.c +17 -0
  81. data/skeletons/CPU-C/common/globals_kernel.c +1 -0
  82. data/skeletons/CPU-C/common/header.c +0 -0
  83. data/skeletons/CPU-C/common/mem_copy_D2H.c +0 -0
  84. data/skeletons/CPU-C/common/mem_copy_H2D.c +0 -0
  85. data/skeletons/CPU-C/common/mem_epilogue.c +0 -0
  86. data/skeletons/CPU-C/common/mem_prologue.c +3 -0
  87. data/skeletons/CPU-C/common/prologue.c +0 -0
  88. data/skeletons/CPU-C/common/timer_1_start.c +0 -0
  89. data/skeletons/CPU-C/common/timer_1_stop.c +0 -0
  90. data/skeletons/CPU-C/common/timer_2_start.c +20 -0
  91. data/skeletons/CPU-C/common/timer_2_stop.c +8 -0
  92. data/skeletons/CPU-C/kernel/default.host.c +3 -0
  93. data/skeletons/CPU-C/kernel/default.kernel.c +15 -0
  94. data/skeletons/CPU-C/skeletons.txt +24 -0
  95. data/skeletons/CPU-OPENCL-AMD/common/epilogue.c +6 -0
  96. data/skeletons/CPU-OPENCL-AMD/common/globals.c +155 -0
  97. data/skeletons/CPU-OPENCL-AMD/common/globals_kernel.c +4 -0
  98. data/skeletons/CPU-OPENCL-AMD/common/header.c +0 -0
  99. data/skeletons/CPU-OPENCL-AMD/common/mem_copy_D2H.c +8 -0
  100. data/skeletons/CPU-OPENCL-AMD/common/mem_copy_H2D.c +4 -0
  101. data/skeletons/CPU-OPENCL-AMD/common/mem_epilogue.c +3 -0
  102. data/skeletons/CPU-OPENCL-AMD/common/mem_prologue.c +6 -0
  103. data/skeletons/CPU-OPENCL-AMD/common/prologue.c +24 -0
  104. data/skeletons/CPU-OPENCL-AMD/common/timer_1_start.c +5 -0
  105. data/skeletons/CPU-OPENCL-AMD/common/timer_1_stop.c +9 -0
  106. data/skeletons/CPU-OPENCL-AMD/common/timer_2_start.c +16 -0
  107. data/skeletons/CPU-OPENCL-AMD/common/timer_2_stop.c +11 -0
  108. data/skeletons/CPU-OPENCL-AMD/kernel/D-element-to-1-shared.host.c +67 -0
  109. data/skeletons/CPU-OPENCL-AMD/kernel/D-element-to-1-shared.kernel.cl +72 -0
  110. data/skeletons/CPU-OPENCL-AMD/kernel/default.host.c +14 -0
  111. data/skeletons/CPU-OPENCL-AMD/kernel/default.kernel.cl +13 -0
  112. data/skeletons/CPU-OPENCL-AMD/skeletons.txt +26 -0
  113. data/skeletons/CPU-OPENCL-INTEL/common/epilogue.c +3 -0
  114. data/skeletons/CPU-OPENCL-INTEL/common/globals.c +154 -0
  115. data/skeletons/CPU-OPENCL-INTEL/common/globals_kernel.c +4 -0
  116. data/skeletons/CPU-OPENCL-INTEL/common/header.c +31 -0
  117. data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_D2H.c +5 -0
  118. data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_H2D.c +3 -0
  119. data/skeletons/CPU-OPENCL-INTEL/common/mem_epilogue.c +3 -0
  120. data/skeletons/CPU-OPENCL-INTEL/common/mem_prologue.c +4 -0
  121. data/skeletons/CPU-OPENCL-INTEL/common/prologue.c +24 -0
  122. data/skeletons/CPU-OPENCL-INTEL/common/timer_1_start.c +5 -0
  123. data/skeletons/CPU-OPENCL-INTEL/common/timer_1_stop.c +9 -0
  124. data/skeletons/CPU-OPENCL-INTEL/common/timer_2_start.c +16 -0
  125. data/skeletons/CPU-OPENCL-INTEL/common/timer_2_stop.c +11 -0
  126. data/skeletons/CPU-OPENCL-INTEL/kernel/D-element-to-1-shared.host.c +67 -0
  127. data/skeletons/CPU-OPENCL-INTEL/kernel/D-element-to-1-shared.kernel.cl +72 -0
  128. data/skeletons/CPU-OPENCL-INTEL/kernel/default.host.c +14 -0
  129. data/skeletons/CPU-OPENCL-INTEL/kernel/default.kernel.cl +13 -0
  130. data/skeletons/CPU-OPENCL-INTEL/skeletons.txt +26 -0
  131. data/skeletons/CPU-OPENMP/common/epilogue.c +0 -0
  132. data/skeletons/CPU-OPENMP/common/globals.c +37 -0
  133. data/skeletons/CPU-OPENMP/common/globals_kernel.c +6 -0
  134. data/skeletons/CPU-OPENMP/common/header.c +0 -0
  135. data/skeletons/CPU-OPENMP/common/mem_copy_D2H.c +0 -0
  136. data/skeletons/CPU-OPENMP/common/mem_copy_H2D.c +0 -0
  137. data/skeletons/CPU-OPENMP/common/mem_epilogue.c +0 -0
  138. data/skeletons/CPU-OPENMP/common/mem_prologue.c +3 -0
  139. data/skeletons/CPU-OPENMP/common/prologue.c +0 -0
  140. data/skeletons/CPU-OPENMP/common/timer_1_start.c +12 -0
  141. data/skeletons/CPU-OPENMP/common/timer_1_stop.c +0 -0
  142. data/skeletons/CPU-OPENMP/common/timer_2_start.c +18 -0
  143. data/skeletons/CPU-OPENMP/common/timer_2_stop.c +8 -0
  144. data/skeletons/CPU-OPENMP/kernel/D-element-to-1-shared.host.c +27 -0
  145. data/skeletons/CPU-OPENMP/kernel/D-element-to-1-shared.kernel.c +46 -0
  146. data/skeletons/CPU-OPENMP/kernel/default.host.c +11 -0
  147. data/skeletons/CPU-OPENMP/kernel/default.kernel.c +18 -0
  148. data/skeletons/CPU-OPENMP/skeletons.txt +26 -0
  149. data/skeletons/GPU-CUDA/common/epilogue.c +0 -0
  150. data/skeletons/GPU-CUDA/common/globals.c +31 -0
  151. data/skeletons/GPU-CUDA/common/globals_kernel.c +4 -0
  152. data/skeletons/GPU-CUDA/common/header.c +0 -0
  153. data/skeletons/GPU-CUDA/common/mem_copy_D2H.c +3 -0
  154. data/skeletons/GPU-CUDA/common/mem_copy_H2D.c +3 -0
  155. data/skeletons/GPU-CUDA/common/mem_epilogue.c +3 -0
  156. data/skeletons/GPU-CUDA/common/mem_prologue.c +5 -0
  157. data/skeletons/GPU-CUDA/common/prologue.c +6 -0
  158. data/skeletons/GPU-CUDA/common/timer_1_start.c +6 -0
  159. data/skeletons/GPU-CUDA/common/timer_1_stop.c +10 -0
  160. data/skeletons/GPU-CUDA/common/timer_2_start.c +6 -0
  161. data/skeletons/GPU-CUDA/common/timer_2_stop.c +10 -0
  162. data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.host.c +3 -0
  163. data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.kernel.cu +105 -0
  164. data/skeletons/GPU-CUDA/kernel/D-element-to-1-shared.host.c +3 -0
  165. data/skeletons/GPU-CUDA/kernel/D-element-to-1-shared.kernel.cu +119 -0
  166. data/skeletons/GPU-CUDA/kernel/D-element-to-N-shared.host.c +3 -0
  167. data/skeletons/GPU-CUDA/kernel/D-element-to-N-shared.kernel.cu +166 -0
  168. data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.host.c +3 -0
  169. data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.kernel.cu +69 -0
  170. data/skeletons/GPU-CUDA/kernel/N-neighbourhood-N-to-N-element.host.c +3 -0
  171. data/skeletons/GPU-CUDA/kernel/N-neighbourhood-N-to-N-element.kernel.cu +42 -0
  172. data/skeletons/GPU-CUDA/kernel/default.host.c +3 -0
  173. data/skeletons/GPU-CUDA/kernel/default.kernel.cu +28 -0
  174. data/skeletons/GPU-CUDA/skeletons.txt +30 -0
  175. data/skeletons/GPU-OPENCL-AMD/common/epilogue.c +3 -0
  176. data/skeletons/GPU-OPENCL-AMD/common/globals.c +155 -0
  177. data/skeletons/GPU-OPENCL-AMD/common/globals_kernel.c +4 -0
  178. data/skeletons/GPU-OPENCL-AMD/common/header.c +0 -0
  179. data/skeletons/GPU-OPENCL-AMD/common/mem_copy_D2H.c +4 -0
  180. data/skeletons/GPU-OPENCL-AMD/common/mem_copy_H2D.c +4 -0
  181. data/skeletons/GPU-OPENCL-AMD/common/mem_epilogue.c +3 -0
  182. data/skeletons/GPU-OPENCL-AMD/common/mem_prologue.c +3 -0
  183. data/skeletons/GPU-OPENCL-AMD/common/prologue.c +24 -0
  184. data/skeletons/GPU-OPENCL-AMD/common/timer_1_start.c +5 -0
  185. data/skeletons/GPU-OPENCL-AMD/common/timer_1_stop.c +9 -0
  186. data/skeletons/GPU-OPENCL-AMD/common/timer_2_start.c +4 -0
  187. data/skeletons/GPU-OPENCL-AMD/common/timer_2_stop.c +11 -0
  188. data/skeletons/GPU-OPENCL-AMD/kernel/D-element-to-1-shared.host.c +67 -0
  189. data/skeletons/GPU-OPENCL-AMD/kernel/D-element-to-1-shared.kernel.cl +72 -0
  190. data/skeletons/GPU-OPENCL-AMD/kernel/default.host.c +14 -0
  191. data/skeletons/GPU-OPENCL-AMD/kernel/default.kernel.cl +13 -0
  192. data/skeletons/GPU-OPENCL-AMD/skeletons.txt +26 -0
  193. data/skeletons/verification/header.c +2 -0
  194. data/skeletons/verification/timer_start.c +4 -0
  195. data/skeletons/verification/timer_stop.c +6 -0
  196. data/skeletons/verification/verify_results.c +23 -0
  197. data/test/bones/test_algorithm.rb +40 -0
  198. data/test/bones/test_common.rb +54 -0
  199. data/test/bones/test_preprocessor.rb +46 -0
  200. data/test/bones/test_species.rb +21 -0
  201. data/test/bones/test_variable.rb +84 -0
  202. data/test/test_helper.rb +106 -0
  203. metadata +303 -0
@@ -0,0 +1,72 @@
1
+
2
+ // Start of the <algorithm_name> kernel (main, not unrolled kernel)
3
+ __kernel void bones_kernel_<algorithm_name>_0(int bones_input_size, __global <in0_type><in0_devicepointer> <in0_name>, __global <out0_type><out0_devicepointer> <out0_name>, <argument_definition>) {
4
+ const int bones_threadblock_work = DIV_CEIL(bones_input_size,get_num_groups(0));
5
+ const int bones_parallel_work = BONES_MIN(get_local_size(0),bones_threadblock_work);
6
+ const int bones_sequential_work = DIV_CEIL(bones_threadblock_work,bones_parallel_work);
7
+ const int bones_local_id = get_local_id(0);
8
+ const int bones_global_id = get_global_id(0);
9
+ <ids>
10
+ int bones_iter_id = <in0_flatindex>;
11
+
12
+ // Load data into thread private memory and perform the first computation(s) sequentially
13
+ <in0_type> bones_temporary = <in0_name>[bones_iter_id];
14
+ <in0_type> bones_private_memory = <algorithm_code3>;
15
+ for(int c=1; c<bones_sequential_work; c++) {
16
+ bones_iter_id = bones_iter_id + bones_parallel_work*get_num_groups(0)<factors>;
17
+ if (bones_iter_id <= <in0_to>) {
18
+ bones_temporary = <in0_name>[bones_iter_id];
19
+ bones_private_memory = <algorithm_code1>;
20
+ }
21
+ }
22
+ // Initialize the local memory
23
+ volatile __local <in0_type> bones_local_memory[256];
24
+ bones_local_memory[bones_local_id] = bones_private_memory;
25
+ barrier(CLK_LOCAL_MEM_FENCE);
26
+
27
+ // Perform the remainder of the computations in parallel using a parallel reduction tree
28
+ int bones_offset_id;
29
+ for (int c=256; c>=2; c=c>>1) {
30
+ if ((2*bones_parallel_work > c) && (get_local_id(0) < c/2)) {
31
+ bones_offset_id = get_local_id(0)+c/2;
32
+ if (bones_offset_id < bones_parallel_work) {
33
+ bones_local_memory[bones_local_id] = <algorithm_code2>;
34
+ }
35
+ }
36
+ barrier(CLK_LOCAL_MEM_FENCE);
37
+ }
38
+
39
+ // Write the final result back to the global memory
40
+ if (get_local_id(0) == 0) { <out0_name>[get_group_id(0)] = bones_local_memory[0]; }
41
+ }
42
+
43
+ // Start of the <algorithm_name> kernel (secondary, not unrolled kernel)
44
+ __kernel void bones_kernel_<algorithm_name>_1(__global <in0_type><in0_devicepointer> <in0_name>, __global <out0_type><out0_devicepointer> <out0_name>) {
45
+ const int bones_local_id = get_local_id(0);
46
+ const int bones_global_id = get_local_id(0);
47
+
48
+ // Initialize the local memory
49
+ volatile __local <in0_type> bones_local_memory[128];
50
+ bones_local_memory[bones_local_id] = <in0_name>[bones_global_id];
51
+ barrier(CLK_LOCAL_MEM_FENCE);
52
+
53
+ // Perform reduction using a parallel reduction tree
54
+ int bones_offset_id;
55
+ for (int c=128; c>=2; c=c>>1) {
56
+ if (get_local_id(0) < c/2) {
57
+ bones_offset_id = get_local_id(0)+c/2;
58
+ bones_local_memory[bones_local_id] = <algorithm_code2>;
59
+ }
60
+ barrier(CLK_LOCAL_MEM_FENCE);
61
+ }
62
+
63
+ // Write the final result back to the global memory
64
+ if (get_local_id(0) == 0) { <out0_name>[0] = bones_local_memory[0]; }
65
+ }
66
+
67
+ // Start of the <algorithm_name> kernel (final, initial value kernel)
68
+ __kernel void bones_kernel_<algorithm_name>_2(__global <out0_type><out0_devicepointer> bones_initial_value, __global <out0_type><out0_devicepointer> <out0_name>) {
69
+ <out0_type> bones_private_memory = <out0_name>[0];
70
+ <out0_type> bones_temporary = bones_initial_value[0];
71
+ <out0_name>[0] = <algorithm_code4>;
72
+ }
@@ -0,0 +1,14 @@
1
+
2
+ // Create the kernel
3
+ cl_kernel bones_kernel_<algorithm_name>_0 = clCreateKernel(bones_program, "bones_kernel_<algorithm_name>_0", &bones_errors); error_check(bones_errors);
4
+
5
+ // Set all the arguments to the kernel function
6
+ int bones_num_args = 0;
7
+ <kernel_argument_list>
8
+ // Start the kernel
9
+ size_t bones_global_worksize[] = {DIV_CEIL(<parallelism>,8)*8};
10
+ bones_errors = clEnqueueNDRangeKernel(bones_queue,bones_kernel_<algorithm_name>_0,1,NULL,bones_global_worksize,NULL,0,NULL,&bones_event); error_check(bones_errors);
11
+
12
+ // Synchronize and clean-up the kernel
13
+ clFinish(bones_queue);
14
+ clReleaseKernel(bones_kernel_<algorithm_name>_0);
@@ -0,0 +1,13 @@
1
+
2
+ // Start of the <algorithm_name> kernel
3
+ __kernel void bones_kernel_<algorithm_name>_0(<devicedefinitionsopencl>, <argument_definition>) {
4
+ const int bones_global_id = get_global_id(0);
5
+ if (bones_global_id < (<parallelism>)) {
6
+
7
+ // Calculate the global ID(s) based on the thread id
8
+ <ids>
9
+
10
+ // Start the computation
11
+ <algorithm_code1>
12
+ }
13
+ }
@@ -0,0 +1,26 @@
1
+ ###################################################################
2
+ # Each line holds one mapping from species to skeleton
3
+ # The ordering is always ['chunk','neighbourhood','element','shared','void']
4
+ # The pattern 'full' is omitted from matching (will thus always match)
5
+ # 'D' denotes any ranges (e.g. D|element can be any dimension)
6
+ # 'N' denotes any range (e.g. N,N|element must be 2D)
7
+ # '+' denotes one or more of these patterns
8
+ ###################################################################
9
+ D|chunk(D)+ -> D|chunk(D)+ :default :00
10
+ D|chunk(D)+ -> D|chunk(D)+ ^ D|element+ :default :00
11
+ D|chunk(D)+ ^ D|element+ -> D|chunk(D)+ :default :00
12
+ D|chunk(D)+ ^ D|element+ -> D|chunk(D)+ ^ D|element+ :default :00
13
+ D|chunk(D)+ -> D|element+ :default :00
14
+ D|chunk(D)+ ^ D|neighbourhood(D)+ ^ D|element+ -> D|element+ :default :00
15
+ D|chunk(D)+ ^ D|element+ -> D|element+ :default :00
16
+ N|neighbourhood(N)+ -> N|element+ :default :00
17
+ D|neighbourhood(D)+ -> D|element+ :default :00
18
+ D|neighbourhood(D)+ ^ D|element+ -> D|element+ :default :00
19
+ D|element+ -> D|chunk(D)+ :default :00
20
+ D|element+ -> D|element+ :default :00
21
+ D|element -> 1|shared :D-element-to-1-shared :02 03 04 05
22
+ D|void -> D|element+ :default :00
23
+
24
+ #D|element+ -> D|shared+ :default :09
25
+ #D|element+ -> D|element+ ^ D|shared+ :default :09
26
+
@@ -0,0 +1,3 @@
1
+
2
+ // Flush any buffered output
3
+ fflush(stdout);
@@ -0,0 +1,154 @@
1
+ #include <string.h>
2
+ #include <stdio.h>
3
+ #include <stdlib.h>
4
+ #include <math.h>
5
+ #include <sys/time.h>
6
+ #include <CL/cl.h>
7
+
8
// Helper macros: minimum, maximum, and integer division rounded up / down
// (DIV_CEIL rounds up for a non-negative 'a' and a positive 'b').
// Every argument is parenthesised so that expressions such as 'n+1' expand
// correctly. Arguments may be evaluated more than once, so do not pass
// expressions with side effects.
#define BONES_MIN(a,b) (((a)<(b)) ? (a) : (b))
#define BONES_MAX(a,b) (((a)>(b)) ? (a) : (b))
#define DIV_CEIL(a,b) (((a)+(b)-1)/(b))
#define DIV_FLOOR(a,b) ((a)/(b))
12
+
13
+ // Multiple iterations for kernel measurements
14
+ #define ITERS 1
15
+
16
// Load the OpenCL kernel from file.
//
// Reads the complete contents of 'bones_filename' into a newly allocated,
// NUL-terminated buffer. The caller owns the returned buffer and releases it
// with free(). Returns NULL (after printing an error message) when the file
// cannot be opened, its size cannot be determined, or no memory is available.
char * get_source(const char* bones_filename) {
  FILE* bones_fp = fopen(bones_filename,"r");
  if (bones_fp == NULL) {
    printf("--- Error: Cannot open kernel file '%s'\n", bones_filename);
    return NULL;
  }

  // Determine the size of the file by seeking to its end
  long bones_size = -1;
  if (fseek(bones_fp,0,SEEK_END) == 0) { bones_size = ftell(bones_fp); }
  if (bones_size < 0) {
    printf("--- Error: Cannot determine the size of kernel file '%s'\n", bones_filename);
    fclose(bones_fp);
    return NULL;
  }
  rewind(bones_fp);

  // Allocate room for the contents plus the terminating NUL character
  char *bones_source = (char *)malloc(sizeof(char)*((size_t)bones_size+1));
  if (bones_source == NULL) {
    printf("--- Error: Out of host memory while loading '%s'\n", bones_filename);
    fclose(bones_fp);
    return NULL;
  }

  // Terminate the string after the bytes that were actually read: fread() can
  // return fewer bytes than the file size (e.g. after newline translation)
  size_t bones_read = fread(bones_source,1,sizeof(char)*(size_t)bones_size,bones_fp);
  bones_source[bones_read] = '\0';
  fclose(bones_fp);
  return bones_source;
}
28
+
29
+ // Print an error if it occurs
30
+ void error_check(cl_int bones_errors) {
31
+ if(bones_errors != CL_SUCCESS) {
32
+ switch (bones_errors) {
33
+ case CL_DEVICE_NOT_FOUND: printf("--- Error: Device not found.\n"); break;
34
+ case CL_DEVICE_NOT_AVAILABLE: printf("--- Error: Device not available\n"); break;
35
+ case CL_COMPILER_NOT_AVAILABLE: printf("--- Error: Compiler not available\n"); break;
36
+ case CL_MEM_OBJECT_ALLOCATION_FAILURE: printf("--- Error: Memory object allocation failure\n"); break;
37
+ case CL_OUT_OF_RESOURCES: printf("--- Error: Out of resources\n"); break;
38
+ case CL_OUT_OF_HOST_MEMORY: printf("--- Error: Out of host memory\n"); break;
39
+ case CL_PROFILING_INFO_NOT_AVAILABLE: printf("--- Error: Profiling information not available\n"); break;
40
+ case CL_MEM_COPY_OVERLAP: printf("--- Error: Memory copy overlap\n"); break;
41
+ case CL_IMAGE_FORMAT_MISMATCH: printf("--- Error: Image format mismatch\n"); break;
42
+ case CL_IMAGE_FORMAT_NOT_SUPPORTED: printf("--- Error: Image format not supported\n"); break;
43
+ case CL_BUILD_PROGRAM_FAILURE: printf("--- Error: Program build failure\n"); break;
44
+ case CL_MAP_FAILURE: printf("--- Error: Map failure\n"); break;
45
+ case CL_INVALID_VALUE: printf("--- Error: Invalid value\n"); break;
46
+ case CL_INVALID_DEVICE_TYPE: printf("--- Error: Invalid device type\n"); break;
47
+ case CL_INVALID_PLATFORM: printf("--- Error: Invalid platform\n"); break;
48
+ case CL_INVALID_DEVICE: printf("--- Error: Invalid device\n"); break;
49
+ case CL_INVALID_CONTEXT: printf("--- Error: Invalid context\n"); break;
50
+ case CL_INVALID_QUEUE_PROPERTIES: printf("--- Error: Invalid queue properties\n"); break;
51
+ case CL_INVALID_COMMAND_QUEUE: printf("--- Error: Invalid command queue\n"); break;
52
+ case CL_INVALID_HOST_PTR: printf("--- Error: Invalid host pointer\n"); break;
53
+ case CL_INVALID_MEM_OBJECT: printf("--- Error: Invalid memory object\n"); break;
54
+ case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: printf("--- Error: Invalid image format descriptor\n"); break;
55
+ case CL_INVALID_IMAGE_SIZE: printf("--- Error: Invalid image size\n"); break;
56
+ case CL_INVALID_SAMPLER: printf("--- Error: Invalid sampler\n"); break;
57
+ case CL_INVALID_BINARY: printf("--- Error: Invalid binary\n"); break;
58
+ case CL_INVALID_BUILD_OPTIONS: printf("--- Error: Invalid build options\n"); break;
59
+ case CL_INVALID_PROGRAM: printf("--- Error: Invalid program\n"); break;
60
+ case CL_INVALID_PROGRAM_EXECUTABLE: printf("--- Error: Invalid program executable\n"); break;
61
+ case CL_INVALID_KERNEL_NAME: printf("--- Error: Invalid kernel name\n"); break;
62
+ case CL_INVALID_KERNEL_DEFINITION: printf("--- Error: Invalid kernel definition\n"); break;
63
+ case CL_INVALID_KERNEL: printf("--- Error: Invalid kernel\n"); break;
64
+ case CL_INVALID_ARG_INDEX: printf("--- Error: Invalid argument index\n"); break;
65
+ case CL_INVALID_ARG_VALUE: printf("--- Error: Invalid argument value\n"); break;
66
+ case CL_INVALID_ARG_SIZE: printf("--- Error: Invalid argument size\n"); break;
67
+ case CL_INVALID_KERNEL_ARGS: printf("--- Error: Invalid kernel arguments\n"); break;
68
+ case CL_INVALID_WORK_DIMENSION: printf("--- Error: Invalid work dimensionsension\n"); break;
69
+ case CL_INVALID_WORK_GROUP_SIZE: printf("--- Error: Invalid work group size\n"); break;
70
+ case CL_INVALID_WORK_ITEM_SIZE: printf("--- Error: Invalid work item size\n"); break;
71
+ case CL_INVALID_GLOBAL_OFFSET: printf("--- Error: Invalid global offset\n"); break;
72
+ case CL_INVALID_EVENT_WAIT_LIST: printf("--- Error: Invalid event wait list\n"); break;
73
+ case CL_INVALID_EVENT: printf("--- Error: Invalid event\n"); break;
74
+ case CL_INVALID_OPERATION: printf("--- Error: Invalid operation\n"); break;
75
+ case CL_INVALID_GL_OBJECT: printf("--- Error: Invalid OpenGL object\n"); break;
76
+ case CL_INVALID_BUFFER_SIZE: printf("--- Error: Invalid buffer size\n"); break;
77
+ case CL_INVALID_MIP_LEVEL: printf("--- Error: Invalid mip-map level\n"); break;
78
+ default: printf("--- Error: Unknown with code %d\n", bones_errors);
79
+ }
80
+ fflush(stdout); exit(0);
81
+ }
82
+ }
83
+
84
+ // Use a global variable for the device ID, context and command queue
85
+ cl_device_id bones_device;
86
+ cl_context bones_context;
87
+ cl_command_queue bones_queue;
88
+
89
+ // Use a global variable to store the name and the binary for the last program
90
+ char bones_last_program[1024];
91
+ cl_program bones_program;
92
+
93
+ // Function to initialize the OpenCL platform (create to ensure fair measurements afterwards)
94
+ void bones_initialize_target(void) {
95
+ cl_int bones_errors;
96
+
97
+ // Get OpenCL platform count
98
+ cl_uint bones_num_platforms;
99
+ bones_errors = clGetPlatformIDs(0,NULL,&bones_num_platforms); error_check(bones_errors);
100
+ if (bones_num_platforms == 0) { printf("Error: No OpenCL platforms found.\n"); exit(1); }
101
+
102
+ // Get all OpenCL platform IDs
103
+ cl_platform_id bones_platform_ids[10];
104
+ bones_errors = clGetPlatformIDs(bones_num_platforms,bones_platform_ids,NULL); error_check(bones_errors);
105
+
106
+ // Select the AMD APP platform
107
+ char bones_buffer[1024];
108
+ cl_uint bones_platform;
109
+ for(cl_uint bones_platform_id=0; bones_platform_id<bones_num_platforms; bones_platform_id++) {
110
+ clGetPlatformInfo(bones_platform_ids[bones_platform_id], CL_PLATFORM_NAME, 1024, bones_buffer, NULL);
111
+ if(strstr(bones_buffer,"Intel") != NULL) { bones_platform = bones_platform_id; break; }
112
+ }
113
+
114
+ // Get a CPU device on the platform
115
+ bones_errors = clGetDeviceIDs(bones_platform_ids[bones_platform], CL_DEVICE_TYPE_CPU, 1, &bones_device, NULL); error_check(bones_errors);
116
+ bones_errors = clGetDeviceInfo(bones_device, CL_DEVICE_NAME, sizeof(bones_buffer), bones_buffer, NULL); error_check(bones_errors);
117
+
118
+ // Create a context
119
+ bones_context = clCreateContext(NULL,1,&bones_device,NULL,NULL,&bones_errors); error_check(bones_errors);
120
+
121
+ // Create a command queue
122
+ bones_queue = clCreateCommandQueue(bones_context,bones_device,CL_QUEUE_PROFILING_ENABLE,&bones_errors); error_check(bones_errors);
123
+
124
+ // Create space on the device
125
+ cl_mem bones_device_data = clCreateBuffer(bones_context,CL_MEM_READ_WRITE,4,NULL,&bones_errors); error_check(bones_errors);
126
+
127
+ // Copy something to the device
128
+ bones_device_data = clCreateBuffer(bones_context,CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,4,bones_buffer,NULL);
129
+
130
+ // Clean-up the OpenCL context
131
+ strcpy(bones_last_program,"");
132
+ clReleaseMemObject(bones_device_data);
133
+ fflush(stdout);
134
+ }
135
+
136
+ // Declaration of the original function
137
+ int bones_main(void);
138
+
139
+ // New main function for initialisation and clean-up
140
+ int main(void) {
141
+
142
+ // Initialisation
143
+ bones_initialize_target();
144
+
145
+ // Original main function
146
+ int bones_return = bones_main();
147
+
148
+ // Clean-up
149
+ clReleaseCommandQueue(bones_queue);
150
+ clReleaseProgram(bones_program);
151
+ clReleaseContext(bones_context);
152
+ return bones_return;
153
+ }
154
+
@@ -0,0 +1,4 @@
1
// Helper macros: minimum, maximum, and integer division rounded up / down
// (DIV_CEIL rounds up for a non-negative 'a' and a positive 'b').
// Every argument is parenthesised so that expressions such as 'n+1' expand
// correctly. Arguments may be evaluated more than once, so do not pass
// expressions with side effects.
#define BONES_MIN(a,b) (((a)<(b)) ? (a) : (b))
#define BONES_MAX(a,b) (((a)>(b)) ? (a) : (b))
#define DIV_CEIL(a,b) (((a)+(b)-1)/(b))
#define DIV_FLOOR(a,b) ((a)/(b))
@@ -0,0 +1,31 @@
1
+ #include <stdlib.h>
2
+
3
// Allocate a 128-byte aligned pointer.
//
// Over-allocates with malloc(), moves forward to the next 128-byte boundary
// and stores the distance back to the malloc() pointer in the int directly in
// front of the returned address; bones_free_128() uses that value to release
// the memory. Returns NULL when the request is too large to represent or when
// malloc() fails.
void *bones_malloc_128(size_t bones_size) {
  char *bones_pointer;
  char *bones_pointer2;
  char *bones_aligned_pointer;

  // Extra room: the worst-case alignment shift plus the stored padding size
  const size_t bones_extra = 128 + sizeof(int);

  // Reject sizes for which the total would wrap around to a small allocation
  if (bones_size > (size_t)-1 - bones_extra) { return(NULL); }

  // Allocate the memory plus a little bit extra
  bones_pointer = (char *)malloc(bones_size + bones_extra);
  if(bones_pointer==NULL) { return(NULL); }

  // Create the aligned pointer (always leaves at least sizeof(int) bytes in front of it)
  bones_pointer2 = bones_pointer + sizeof(int);
  bones_aligned_pointer = bones_pointer2 + (128 - ((size_t)bones_pointer2 & 127));

  // Set the padding size
  bones_pointer2 = bones_aligned_pointer - sizeof(int);
  *((int *)bones_pointer2) = (int)(bones_aligned_pointer - bones_pointer);

  // Return the 128-byte aligned pointer
  return (bones_aligned_pointer);
}
24
+
25
// Free the 128-byte aligned pointer.
//
// 'bones_pointer' must have been returned by bones_malloc_128(): the int
// stored directly in front of it holds the distance back to the address that
// malloc() returned. A NULL pointer is ignored, just like free(NULL).
void bones_free_128(void *bones_pointer) {
  if (bones_pointer == NULL) { return; }

  // Read the padding size and step back to the start of the malloc() block
  const int *bones_padding = (const int *)bones_pointer - 1;
  free((char *)bones_pointer - *bones_padding);
}
31
+
@@ -0,0 +1,5 @@
1
+
2
+ // Perform a zero-copy of <array> from device to host
3
+ void* bones_pointer_to_<array> = clEnqueueMapBuffer(bones_queue,device_<array>,CL_TRUE,CL_MAP_READ,<offset>,<variable_dimensions>*sizeof(<type>),0,NULL,NULL,&bones_errors); error_check(bones_errors);
4
+ clEnqueueUnmapMemObject(bones_queue,device_<array>,bones_pointer_to_<array>,0,NULL,NULL);
5
+ clFinish(bones_queue);
@@ -0,0 +1,3 @@
1
+
2
+ //bones_errors = clEnqueueWriteBuffer(bones_queue, device_<array>, CL_TRUE, 0, <variable_dimensions>*sizeof(<type>), <array><flatten>, 0, NULL, NULL); error_check(bones_errors);
3
+ //clFinish(bones_queue);
@@ -0,0 +1,3 @@
1
+
2
+ // Release the device buffer of <array>
3
+ clReleaseMemObject(device_<array>);
@@ -0,0 +1,4 @@
1
+
2
+ // Create a device pointer for <array> (zero-copy)
3
+ cl_mem device_<array> = clCreateBuffer(bones_context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, <variable_dimensions>*sizeof(<type>), <array><flatten>, &bones_errors); error_check(bones_errors);
4
+ //cl_mem device_<array> = clCreateBuffer(bones_context, CL_MEM_READ_WRITE, <variable_dimensions>*sizeof(<type>), NULL, &bones_errors); error_check(bones_errors);
@@ -0,0 +1,24 @@
1
+ fflush(stdout);
2
+ cl_int bones_errors;
3
+ cl_event bones_event;
4
+
5
+ // Only compile if this program is different from the last one
6
+ if (strcmp(bones_last_program,"<algorithm_filename>") != 0) {
7
+ strcpy(bones_last_program,"<algorithm_filename>");
8
+
9
+ // Load and compile the kernel
10
+ char *bones_source = get_source("<algorithm_filename>_device.cl");
11
+ bones_program = clCreateProgramWithSource(bones_context,1,(const char **)&bones_source,NULL,&bones_errors); error_check(bones_errors);
12
+ bones_errors = clBuildProgram(bones_program,0,NULL,"",NULL,NULL);
13
+
14
+ // Get and print the compiler log
15
+ char* bones_log;
16
+ size_t bones_log_size;
17
+ clGetProgramBuildInfo(bones_program,bones_device,CL_PROGRAM_BUILD_LOG,0,NULL,&bones_log_size);
18
+ bones_log = (char*)malloc((bones_log_size+1)*sizeof(char));
19
+ clGetProgramBuildInfo(bones_program,bones_device,CL_PROGRAM_BUILD_LOG,bones_log_size,bones_log, NULL);
20
+ bones_log[bones_log_size] = '\0';
21
+ if (strcmp(bones_log,"\n") != 0 && strcmp(bones_log,"") != 0) { printf("--------- \n--- Compilation log:\n--------- \n%s\n",bones_log); }
22
+ free(bones_log);
23
+ error_check(bones_errors);
24
+ }
@@ -0,0 +1,5 @@
1
+
2
+ // Start the timer for the measurement of the kernel and memory copy execution time
3
+ struct timeval bones_start_time1;
4
+ clFinish(bones_queue);
5
+ gettimeofday(&bones_start_time1, NULL);
@@ -0,0 +1,9 @@
1
+
2
+ // End the timer for the measurement of the kernel and memory copy execution time
3
+ #if (ITERS == 1)
4
+ clFinish(bones_queue);
5
+ struct timeval bones_end_time1;
6
+ gettimeofday(&bones_end_time1, NULL);
7
+ float bones_timer1 = 0.001 * (1000000*(bones_end_time1.tv_sec-bones_start_time1.tv_sec)+bones_end_time1.tv_usec-bones_start_time1.tv_usec);
8
+ printf(">>>\t\t (<algorithm_basename>): Execution time [kernel+memcpy]: %.3lf ms \n", bones_timer1);
9
+ #endif
@@ -0,0 +1,16 @@
1
+
2
+ // Start the timer for the measurement of the kernel execution time
3
+ clFinish(bones_queue);
4
+ for (int bones_iter=0; bones_iter<ITERS; bones_iter++) {
5
+
6
+ // Flush the CPU cache (for measurement purposes only)
7
+ const int bones_flush_size = 4*1024*1024; // (4MB)
8
+ int bones_flush_i;
9
+ int bones_flush_j;
10
+ char *bones_flush_c = (char *)malloc(bones_flush_size);
11
+ for (bones_flush_i=0; bones_flush_i<10; bones_flush_i++) {
12
+ for (bones_flush_j=0; bones_flush_j<bones_flush_size; bones_flush_j++) {
13
+ bones_flush_c[bones_flush_j] = bones_flush_i*bones_flush_j;
14
+ }
15
+ }
16
+ free(bones_flush_c);
@@ -0,0 +1,11 @@
1
+
2
+ }
3
+
4
+ // Stop the timer for the measurement of the kernel execution time
5
+ clFinish(bones_queue);
6
+ cl_ulong end2, start2;
7
+ bones_errors = clWaitForEvents(1, &bones_event); error_check(bones_errors);
8
+ bones_errors = clGetEventProfilingInfo(bones_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end2, 0); error_check(bones_errors);
9
+ bones_errors = clGetEventProfilingInfo(bones_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start2, 0); error_check(bones_errors);
10
+ float bones_timer2 = 0.000001 * (end2-start2);
11
+ printf(">>>\t\t (<algorithm_basename>): Execution time [kernel ]: %.3lf ms \n", bones_timer2);
@@ -0,0 +1,67 @@
1
+
2
+ // Store the initial value
3
+ cl_mem bones_initial_value = clCreateBuffer(bones_context,CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,sizeof(<out0_type>),<out0_name>,&bones_errors); error_check(bones_errors);
4
+
5
+ // Create the kernels
6
+ cl_kernel bones_kernel_<algorithm_name>_0 = clCreateKernel(bones_program, "bones_kernel_<algorithm_name>_0", &bones_errors); error_check(bones_errors);
7
+ cl_kernel bones_kernel_<algorithm_name>_1 = clCreateKernel(bones_program, "bones_kernel_<algorithm_name>_1", &bones_errors); error_check(bones_errors);
8
+ cl_kernel bones_kernel_<algorithm_name>_2 = clCreateKernel(bones_program, "bones_kernel_<algorithm_name>_2", &bones_errors); error_check(bones_errors);
9
+
10
+ // Run either one kernel or multiple kernels
11
+ if (<in0_dimensions> <= 512) {
12
+
13
+ // Set all the arguments to the kernel function
14
+ int bones_num_args = 3;
15
+ int bones_dimensions = <in0_dimensions>;
16
+ clSetKernelArg(bones_kernel_<algorithm_name>_0,0,sizeof(bones_dimensions),(void*)&bones_dimensions);
17
+ clSetKernelArg(bones_kernel_<algorithm_name>_0,1,sizeof(<in0_devicename>),(void*)&<in0_devicename>);
18
+ clSetKernelArg(bones_kernel_<algorithm_name>_0,2,sizeof(<out0_devicename>),(void*)&<out0_devicename>);
19
+ <kernel_argument_list_constants>
20
+ // Start only one kernel
21
+ const int bones_num_threads = DIV_CEIL(<in0_dimensions>,2);
22
+ size_t bones_local_worksize1[] = {bones_num_threads};
23
+ size_t bones_global_worksize1[] = {bones_num_threads};
24
+ bones_errors = clEnqueueNDRangeKernel(bones_queue,bones_kernel_<algorithm_name>_0,1,NULL,bones_global_worksize1,bones_local_worksize1,0,NULL,&bones_event); error_check(bones_errors);
25
+
26
+ }
27
+ else {
28
+
29
+ // Allocate space for an intermediate array
30
+ cl_mem bones_device_temp = clCreateBuffer(bones_context,CL_MEM_READ_WRITE,128*sizeof(<out0_type>),NULL,&bones_errors); error_check(bones_errors);
31
+
32
+ // Set all the arguments to the kernel function
33
+ int bones_num_args = 3;
34
+ int bones_dimensions = <in0_dimensions>;
35
+ clSetKernelArg(bones_kernel_<algorithm_name>_0,0,sizeof(bones_dimensions),(void*)&bones_dimensions);
36
+ clSetKernelArg(bones_kernel_<algorithm_name>_0,1,sizeof(<in0_devicename>),(void*)&<in0_devicename>);
37
+ clSetKernelArg(bones_kernel_<algorithm_name>_0,2,sizeof(bones_device_temp),(void*)&bones_device_temp);
38
+ <kernel_argument_list_constants>
39
+ // Start the first kernel
40
+ size_t bones_local_worksize1[] = {256};
41
+ size_t bones_global_worksize1[] = {256*128};
42
+ bones_errors = clEnqueueNDRangeKernel(bones_queue,bones_kernel_<algorithm_name>_0,1,NULL,bones_global_worksize1,bones_local_worksize1,0,NULL,&bones_event); error_check(bones_errors);
43
+
44
+ // Set all the arguments to the kernel function
45
+ clSetKernelArg(bones_kernel_<algorithm_name>_1,0,sizeof(bones_device_temp),(void*)&bones_device_temp);
46
+ clSetKernelArg(bones_kernel_<algorithm_name>_1,1,sizeof(<out0_devicename>),(void*)&<out0_devicename>);
47
+ // Start the second kernel
48
+ size_t bones_local_worksize2[] = {128};
49
+ size_t bones_global_worksize2[] = {128};
50
+ bones_errors = clEnqueueNDRangeKernel(bones_queue,bones_kernel_<algorithm_name>_1,1,NULL,bones_global_worksize2,bones_local_worksize2,0,NULL,&bones_event); error_check(bones_errors);
51
+ clReleaseMemObject(bones_device_temp);
52
+ }
53
+
54
+ // Set all the arguments to the kernel function
55
+ clSetKernelArg(bones_kernel_<algorithm_name>_2,0,sizeof(bones_initial_value),(void*)&bones_initial_value);
56
+ clSetKernelArg(bones_kernel_<algorithm_name>_2,1,sizeof(<out0_devicename>),(void*)&<out0_devicename>);
57
+ // Perform the last computation (only needed if there is an initial value)
58
+ size_t bones_local_worksize3[] = {1};
59
+ size_t bones_global_worksize3[] = {1};
60
+ bones_errors = clEnqueueNDRangeKernel(bones_queue,bones_kernel_<algorithm_name>_2,1,NULL,bones_global_worksize3,bones_local_worksize3,0,NULL,&bones_event); error_check(bones_errors);
61
+ clReleaseMemObject(bones_initial_value);
62
+
63
+ // Synchronize and clean-up the kernels
64
+ clFinish(bones_queue);
65
+ clReleaseKernel(bones_kernel_<algorithm_name>_0);
66
+ clReleaseKernel(bones_kernel_<algorithm_name>_1);
67
+ clReleaseKernel(bones_kernel_<algorithm_name>_2);