bones-compiler 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (203) hide show
  1. data/CHANGELOG +117 -0
  2. data/LICENSE +9 -0
  3. data/README.rdoc +126 -0
  4. data/Rakefile +107 -0
  5. data/VERSION +1 -0
  6. data/bin/bones +20 -0
  7. data/examples/applications/ffos.c +552 -0
  8. data/examples/benchmarks/2mm.c +70 -0
  9. data/examples/benchmarks/3mm.c +81 -0
  10. data/examples/benchmarks/adi.c +81 -0
  11. data/examples/benchmarks/atax.c +65 -0
  12. data/examples/benchmarks/bicg.c +67 -0
  13. data/examples/benchmarks/cholesky.c +64 -0
  14. data/examples/benchmarks/common.h +168 -0
  15. data/examples/benchmarks/correlation.c +97 -0
  16. data/examples/benchmarks/covariance.c +77 -0
  17. data/examples/benchmarks/doitgen.c +63 -0
  18. data/examples/benchmarks/durbin.c +76 -0
  19. data/examples/benchmarks/dynprog.c +67 -0
  20. data/examples/benchmarks/fdtd-2d-apml.c +114 -0
  21. data/examples/benchmarks/fdtd-2d.c +74 -0
  22. data/examples/benchmarks/floyd-warshall.c +50 -0
  23. data/examples/benchmarks/gemm.c +69 -0
  24. data/examples/benchmarks/gemver.c +89 -0
  25. data/examples/benchmarks/gesummv.c +64 -0
  26. data/examples/benchmarks/gramschmidt.c +84 -0
  27. data/examples/benchmarks/jacobi-1d-imper.c +55 -0
  28. data/examples/benchmarks/jacobi-2d-imper.c +61 -0
  29. data/examples/benchmarks/lu.c +57 -0
  30. data/examples/benchmarks/ludcmp.c +91 -0
  31. data/examples/benchmarks/mvt.c +65 -0
  32. data/examples/benchmarks/overview.txt +38 -0
  33. data/examples/benchmarks/reg_detect.c +82 -0
  34. data/examples/benchmarks/saxpy.c +45 -0
  35. data/examples/benchmarks/seidel-2d.c +51 -0
  36. data/examples/benchmarks/symm.c +74 -0
  37. data/examples/benchmarks/syr2k.c +65 -0
  38. data/examples/benchmarks/syrk.c +62 -0
  39. data/examples/benchmarks/trisolv.c +57 -0
  40. data/examples/benchmarks/trmm.c +57 -0
  41. data/examples/chunk/example1.c +54 -0
  42. data/examples/chunk/example2.c +44 -0
  43. data/examples/chunk/example3.c +59 -0
  44. data/examples/chunk/example4.c +55 -0
  45. data/examples/chunk/example5.c +52 -0
  46. data/examples/element/example1.c +46 -0
  47. data/examples/element/example10.c +50 -0
  48. data/examples/element/example11.c +47 -0
  49. data/examples/element/example12.c +56 -0
  50. data/examples/element/example2.c +46 -0
  51. data/examples/element/example3.c +58 -0
  52. data/examples/element/example4.c +49 -0
  53. data/examples/element/example5.c +56 -0
  54. data/examples/element/example6.c +46 -0
  55. data/examples/element/example7.c +54 -0
  56. data/examples/element/example8.c +45 -0
  57. data/examples/element/example9.c +48 -0
  58. data/examples/neighbourhood/example1.c +54 -0
  59. data/examples/neighbourhood/example2.c +55 -0
  60. data/examples/neighbourhood/example3.c +82 -0
  61. data/examples/neighbourhood/example4.c +52 -0
  62. data/examples/shared/example1.c +45 -0
  63. data/examples/shared/example2.c +51 -0
  64. data/examples/shared/example3.c +55 -0
  65. data/examples/shared/example4.c +52 -0
  66. data/examples/shared/example5.c +48 -0
  67. data/lib/bones.rb +266 -0
  68. data/lib/bones/algorithm.rb +541 -0
  69. data/lib/bones/engine.rb +386 -0
  70. data/lib/bones/preprocessor.rb +161 -0
  71. data/lib/bones/species.rb +196 -0
  72. data/lib/bones/structure.rb +94 -0
  73. data/lib/bones/variable.rb +169 -0
  74. data/lib/bones/variablelist.rb +72 -0
  75. data/lib/castaddon.rb +27 -0
  76. data/lib/castaddon/index.rb +40 -0
  77. data/lib/castaddon/node.rb +753 -0
  78. data/lib/castaddon/type.rb +37 -0
  79. data/skeletons/CPU-C/common/epilogue.c +0 -0
  80. data/skeletons/CPU-C/common/globals.c +17 -0
  81. data/skeletons/CPU-C/common/globals_kernel.c +1 -0
  82. data/skeletons/CPU-C/common/header.c +0 -0
  83. data/skeletons/CPU-C/common/mem_copy_D2H.c +0 -0
  84. data/skeletons/CPU-C/common/mem_copy_H2D.c +0 -0
  85. data/skeletons/CPU-C/common/mem_epilogue.c +0 -0
  86. data/skeletons/CPU-C/common/mem_prologue.c +3 -0
  87. data/skeletons/CPU-C/common/prologue.c +0 -0
  88. data/skeletons/CPU-C/common/timer_1_start.c +0 -0
  89. data/skeletons/CPU-C/common/timer_1_stop.c +0 -0
  90. data/skeletons/CPU-C/common/timer_2_start.c +20 -0
  91. data/skeletons/CPU-C/common/timer_2_stop.c +8 -0
  92. data/skeletons/CPU-C/kernel/default.host.c +3 -0
  93. data/skeletons/CPU-C/kernel/default.kernel.c +15 -0
  94. data/skeletons/CPU-C/skeletons.txt +24 -0
  95. data/skeletons/CPU-OPENCL-AMD/common/epilogue.c +6 -0
  96. data/skeletons/CPU-OPENCL-AMD/common/globals.c +155 -0
  97. data/skeletons/CPU-OPENCL-AMD/common/globals_kernel.c +4 -0
  98. data/skeletons/CPU-OPENCL-AMD/common/header.c +0 -0
  99. data/skeletons/CPU-OPENCL-AMD/common/mem_copy_D2H.c +8 -0
  100. data/skeletons/CPU-OPENCL-AMD/common/mem_copy_H2D.c +4 -0
  101. data/skeletons/CPU-OPENCL-AMD/common/mem_epilogue.c +3 -0
  102. data/skeletons/CPU-OPENCL-AMD/common/mem_prologue.c +6 -0
  103. data/skeletons/CPU-OPENCL-AMD/common/prologue.c +24 -0
  104. data/skeletons/CPU-OPENCL-AMD/common/timer_1_start.c +5 -0
  105. data/skeletons/CPU-OPENCL-AMD/common/timer_1_stop.c +9 -0
  106. data/skeletons/CPU-OPENCL-AMD/common/timer_2_start.c +16 -0
  107. data/skeletons/CPU-OPENCL-AMD/common/timer_2_stop.c +11 -0
  108. data/skeletons/CPU-OPENCL-AMD/kernel/D-element-to-1-shared.host.c +67 -0
  109. data/skeletons/CPU-OPENCL-AMD/kernel/D-element-to-1-shared.kernel.cl +72 -0
  110. data/skeletons/CPU-OPENCL-AMD/kernel/default.host.c +14 -0
  111. data/skeletons/CPU-OPENCL-AMD/kernel/default.kernel.cl +13 -0
  112. data/skeletons/CPU-OPENCL-AMD/skeletons.txt +26 -0
  113. data/skeletons/CPU-OPENCL-INTEL/common/epilogue.c +3 -0
  114. data/skeletons/CPU-OPENCL-INTEL/common/globals.c +154 -0
  115. data/skeletons/CPU-OPENCL-INTEL/common/globals_kernel.c +4 -0
  116. data/skeletons/CPU-OPENCL-INTEL/common/header.c +31 -0
  117. data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_D2H.c +5 -0
  118. data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_H2D.c +3 -0
  119. data/skeletons/CPU-OPENCL-INTEL/common/mem_epilogue.c +3 -0
  120. data/skeletons/CPU-OPENCL-INTEL/common/mem_prologue.c +4 -0
  121. data/skeletons/CPU-OPENCL-INTEL/common/prologue.c +24 -0
  122. data/skeletons/CPU-OPENCL-INTEL/common/timer_1_start.c +5 -0
  123. data/skeletons/CPU-OPENCL-INTEL/common/timer_1_stop.c +9 -0
  124. data/skeletons/CPU-OPENCL-INTEL/common/timer_2_start.c +16 -0
  125. data/skeletons/CPU-OPENCL-INTEL/common/timer_2_stop.c +11 -0
  126. data/skeletons/CPU-OPENCL-INTEL/kernel/D-element-to-1-shared.host.c +67 -0
  127. data/skeletons/CPU-OPENCL-INTEL/kernel/D-element-to-1-shared.kernel.cl +72 -0
  128. data/skeletons/CPU-OPENCL-INTEL/kernel/default.host.c +14 -0
  129. data/skeletons/CPU-OPENCL-INTEL/kernel/default.kernel.cl +13 -0
  130. data/skeletons/CPU-OPENCL-INTEL/skeletons.txt +26 -0
  131. data/skeletons/CPU-OPENMP/common/epilogue.c +0 -0
  132. data/skeletons/CPU-OPENMP/common/globals.c +37 -0
  133. data/skeletons/CPU-OPENMP/common/globals_kernel.c +6 -0
  134. data/skeletons/CPU-OPENMP/common/header.c +0 -0
  135. data/skeletons/CPU-OPENMP/common/mem_copy_D2H.c +0 -0
  136. data/skeletons/CPU-OPENMP/common/mem_copy_H2D.c +0 -0
  137. data/skeletons/CPU-OPENMP/common/mem_epilogue.c +0 -0
  138. data/skeletons/CPU-OPENMP/common/mem_prologue.c +3 -0
  139. data/skeletons/CPU-OPENMP/common/prologue.c +0 -0
  140. data/skeletons/CPU-OPENMP/common/timer_1_start.c +12 -0
  141. data/skeletons/CPU-OPENMP/common/timer_1_stop.c +0 -0
  142. data/skeletons/CPU-OPENMP/common/timer_2_start.c +18 -0
  143. data/skeletons/CPU-OPENMP/common/timer_2_stop.c +8 -0
  144. data/skeletons/CPU-OPENMP/kernel/D-element-to-1-shared.host.c +27 -0
  145. data/skeletons/CPU-OPENMP/kernel/D-element-to-1-shared.kernel.c +46 -0
  146. data/skeletons/CPU-OPENMP/kernel/default.host.c +11 -0
  147. data/skeletons/CPU-OPENMP/kernel/default.kernel.c +18 -0
  148. data/skeletons/CPU-OPENMP/skeletons.txt +26 -0
  149. data/skeletons/GPU-CUDA/common/epilogue.c +0 -0
  150. data/skeletons/GPU-CUDA/common/globals.c +31 -0
  151. data/skeletons/GPU-CUDA/common/globals_kernel.c +4 -0
  152. data/skeletons/GPU-CUDA/common/header.c +0 -0
  153. data/skeletons/GPU-CUDA/common/mem_copy_D2H.c +3 -0
  154. data/skeletons/GPU-CUDA/common/mem_copy_H2D.c +3 -0
  155. data/skeletons/GPU-CUDA/common/mem_epilogue.c +3 -0
  156. data/skeletons/GPU-CUDA/common/mem_prologue.c +5 -0
  157. data/skeletons/GPU-CUDA/common/prologue.c +6 -0
  158. data/skeletons/GPU-CUDA/common/timer_1_start.c +6 -0
  159. data/skeletons/GPU-CUDA/common/timer_1_stop.c +10 -0
  160. data/skeletons/GPU-CUDA/common/timer_2_start.c +6 -0
  161. data/skeletons/GPU-CUDA/common/timer_2_stop.c +10 -0
  162. data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.host.c +3 -0
  163. data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.kernel.cu +105 -0
  164. data/skeletons/GPU-CUDA/kernel/D-element-to-1-shared.host.c +3 -0
  165. data/skeletons/GPU-CUDA/kernel/D-element-to-1-shared.kernel.cu +119 -0
  166. data/skeletons/GPU-CUDA/kernel/D-element-to-N-shared.host.c +3 -0
  167. data/skeletons/GPU-CUDA/kernel/D-element-to-N-shared.kernel.cu +166 -0
  168. data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.host.c +3 -0
  169. data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.kernel.cu +69 -0
  170. data/skeletons/GPU-CUDA/kernel/N-neighbourhood-N-to-N-element.host.c +3 -0
  171. data/skeletons/GPU-CUDA/kernel/N-neighbourhood-N-to-N-element.kernel.cu +42 -0
  172. data/skeletons/GPU-CUDA/kernel/default.host.c +3 -0
  173. data/skeletons/GPU-CUDA/kernel/default.kernel.cu +28 -0
  174. data/skeletons/GPU-CUDA/skeletons.txt +30 -0
  175. data/skeletons/GPU-OPENCL-AMD/common/epilogue.c +3 -0
  176. data/skeletons/GPU-OPENCL-AMD/common/globals.c +155 -0
  177. data/skeletons/GPU-OPENCL-AMD/common/globals_kernel.c +4 -0
  178. data/skeletons/GPU-OPENCL-AMD/common/header.c +0 -0
  179. data/skeletons/GPU-OPENCL-AMD/common/mem_copy_D2H.c +4 -0
  180. data/skeletons/GPU-OPENCL-AMD/common/mem_copy_H2D.c +4 -0
  181. data/skeletons/GPU-OPENCL-AMD/common/mem_epilogue.c +3 -0
  182. data/skeletons/GPU-OPENCL-AMD/common/mem_prologue.c +3 -0
  183. data/skeletons/GPU-OPENCL-AMD/common/prologue.c +24 -0
  184. data/skeletons/GPU-OPENCL-AMD/common/timer_1_start.c +5 -0
  185. data/skeletons/GPU-OPENCL-AMD/common/timer_1_stop.c +9 -0
  186. data/skeletons/GPU-OPENCL-AMD/common/timer_2_start.c +4 -0
  187. data/skeletons/GPU-OPENCL-AMD/common/timer_2_stop.c +11 -0
  188. data/skeletons/GPU-OPENCL-AMD/kernel/D-element-to-1-shared.host.c +67 -0
  189. data/skeletons/GPU-OPENCL-AMD/kernel/D-element-to-1-shared.kernel.cl +72 -0
  190. data/skeletons/GPU-OPENCL-AMD/kernel/default.host.c +14 -0
  191. data/skeletons/GPU-OPENCL-AMD/kernel/default.kernel.cl +13 -0
  192. data/skeletons/GPU-OPENCL-AMD/skeletons.txt +26 -0
  193. data/skeletons/verification/header.c +2 -0
  194. data/skeletons/verification/timer_start.c +4 -0
  195. data/skeletons/verification/timer_stop.c +6 -0
  196. data/skeletons/verification/verify_results.c +23 -0
  197. data/test/bones/test_algorithm.rb +40 -0
  198. data/test/bones/test_common.rb +54 -0
  199. data/test/bones/test_preprocessor.rb +46 -0
  200. data/test/bones/test_species.rb +21 -0
  201. data/test/bones/test_variable.rb +84 -0
  202. data/test/test_helper.rb +106 -0
  203. metadata +303 -0
@@ -0,0 +1,37 @@
1
+ module C
2
+ # This class provides an extension to the CAST type class. It
3
+ # contains a number of functions applicable to types such as
4
+ # pointers, arrays, structures, floats, integers, etc.
5
+ #
6
+ # The provided methods are just helpers to extend the CAST
7
+ # functionality and to clean-up the Bones classes.
8
class Type

  # Tells whether this type is an array or a pointer, i.e. whether
  # its class is exactly C::Array or C::Pointer. Returns either
  # true or false.
  def array_or_pointer?
    [C::Array, C::Pointer].include?(self.class)
  end

  # Finds the underlying type of a variable. Arrays and pointers
  # delegate to the type they wrap, so the search recurses until
  # it reaches a type that is neither; that type object is
  # returned (according to the original notes this is one of the
  # CAST types void, int, float, char, bool, complex or
  # imaginary).
  def type_name
    return self unless self.array_or_pointer?
    self.type.type_name
  end

  # Returns the variable's dimension as an integer. Every array
  # or pointer level ('[]' or '*') adds one to the running
  # +count+ before recursing into the wrapped type, so a type
  # that is neither an array nor a pointer has a dimension of
  # zero.
  def dimensions(count=0)
    return count unless self.array_or_pointer?
    self.type.dimensions(count+1)
  end

end
35
+
36
+ end
37
+
File without changes
@@ -0,0 +1,17 @@
1
+
2
+ // Multiple iterations for measurements
3
+ #define ITERS 1
4
+
5
+ // Declaration of the original function
6
+ int bones_main(void);
7
+
8
+ // New main function for initialisation and clean-up
9
int main(void) {

  // This target needs no initialisation or clean-up of its own:
  // run the original main function and hand its return value
  // straight back as the exit status.
  return bones_main();
}
17
+
@@ -0,0 +1 @@
1
+ #include <math.h>
File without changes
File without changes
File without changes
File without changes
@@ -0,0 +1,3 @@
1
+
2
+ // Create a pointer to <array> on the device
3
+ <type>* device_<array> = <array><flatten>;
File without changes
File without changes
File without changes
@@ -0,0 +1,20 @@
1
+ // Initialize the timer
2
+ float bones_timer2 = 0;
3
+ struct timeval bones_start_time2;
4
+ struct timeval bones_end_time2;
5
+ for (int bones_iter=0; bones_iter<ITERS; bones_iter++) {
6
+
7
+ // Flush the CPU cache by writing a scratch buffer 10 times over (for measurement purposes only)
8
+ const int bones_flush_size = 4*1024*1024; // 4M chars of one byte each, so 4MB (not 16MB)
9
+ int bones_flush_i;
10
+ int bones_flush_j;
11
+ char *bones_flush_c = (char *)malloc(bones_flush_size);
12
+ for (bones_flush_i=0; bones_flush_i<10; bones_flush_i++) {
13
+ for (bones_flush_j=0; bones_flush_j<bones_flush_size; bones_flush_j++) {
14
+ bones_flush_c[bones_flush_j] = bones_flush_i*bones_flush_j;
15
+ }
16
+ }
17
+ free(bones_flush_c);
18
+
19
+ // Start the timer for the measurement of the kernel execution time
20
+ gettimeofday(&bones_start_time2, NULL);
@@ -0,0 +1,8 @@
1
+
2
+ // Stop the timer for the measurement of the kernel execution time
3
+ gettimeofday(&bones_end_time2, NULL);
4
+ bones_timer2 += 0.001 * (1000000*(bones_end_time2.tv_sec-bones_start_time2.tv_sec)+bones_end_time2.tv_usec-bones_start_time2.tv_usec);
5
+ }
6
+
7
+ // Print the measurement data
8
+ printf(">>>\t\t (<algorithm_basename>): Execution time [kernel ]: %.3lf ms \n", bones_timer2/((float)ITERS));
@@ -0,0 +1,3 @@
1
+
2
+ // Start the kernel
3
+ bones_kernel_<algorithm_name>_0(<devicenames>, <argument_name>);
@@ -0,0 +1,15 @@
1
+ /* STARTDEF
2
+ void bones_kernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>);
3
+ ENDDEF */
4
+ // Start of the <algorithm_name> kernel
5
+ void bones_kernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
6
+ for(int bones_global_id=0; bones_global_id<<parallelism>; bones_global_id++) {
7
+
8
+ // Calculate the global ID(s) based on the thread id
9
+ <ids>
10
+
11
+ // Perform the main computation
12
+ <algorithm_code1>
13
+ }
14
+ }
15
+
@@ -0,0 +1,24 @@
1
+ ###################################################################
2
+ # Each line holds one mapping from species to skeleton
3
+ # The ordering is always ['chunk','neighbourhood','element','shared','void']
4
+ # The pattern 'full' is omitted from matching (will thus always match)
5
+ # 'D' denotes any ranges (e.g. D|element can be any dimension)
6
+ # 'N' denotes any range (e.g. N,N|element must be 2D)
7
+ # '+' denotes one or more of these patterns
8
+ ###################################################################
9
+ D|chunk(D)+ -> D|chunk(D)+ :default :00
10
+ D|chunk(D)+ -> D|chunk(D)+ ^ D|element+ :default :00
11
+ D|chunk(D)+ ^ D|element+ -> D|chunk(D)+ :default :00
12
+ D|chunk(D)+ ^ D|element+ -> D|chunk(D)+ ^ D|element+ :default :00
13
+ D|chunk(D)+ -> D|element+ :default :00
14
+ D|chunk(D)+ ^ D|neighbourhood(D)+ ^ D|element+ -> D|element+ :default :00
15
+ D|chunk(D)+ ^ D|element+ -> D|element+ :default :00
16
+ N|neighbourhood(N)+ -> N|element+ :default :00
17
+ D|neighbourhood(D)+ -> D|element+ :default :00
18
+ D|neighbourhood(D)+ ^ D|element+ -> D|element+ :default :00
19
+ D|element+ -> D|chunk(D)+ :default :00
20
+ D|element+ -> D|element+ :default :00
21
+ D|element -> 1|shared :default :00
22
+ D|element+ -> D|shared+ :default :00
23
+ D|element+ -> D|element+ ^ D|shared+ :default :00
24
+ D|void -> D|element+ :default :00
@@ -0,0 +1,6 @@
1
+
2
+ // Clean-up the OpenCL context
3
+ //clReleaseCommandQueue(bones_queue);
4
+ //clReleaseProgram(bones_program);
5
+ //clReleaseContext(bones_context);
6
+ fflush(stdout);
@@ -0,0 +1,155 @@
1
+ #include <string.h>
2
+ #include <stdio.h>
3
+ #include <stdlib.h>
4
+ #include <math.h>
5
+ #include <sys/time.h>
6
+ #include <CL/cl.h>
7
+
8
// Small arithmetic helpers for the generated code. Every argument
// and every expansion is fully parenthesised so that expression
// arguments (e.g. DIV_FLOOR(n, a+b)) keep their intended grouping.
// NOTE: arguments may still be evaluated more than once, so do not
// pass expressions with side effects.
#define BONES_MIN(a,b) (((a) < (b)) ? (a) : (b))
#define BONES_MAX(a,b) (((a) > (b)) ? (a) : (b))
// Integer division rounded up / down (for non-negative operands)
#define DIV_CEIL(a,b) (((a) + (b) - 1) / (b))
#define DIV_FLOOR(a,b) ((a) / (b))
12
+
13
+ // Multiple iterations for kernel measurements
14
+ #define ITERS 1
15
+
16
+ // Load the OpenCL kernel from file
17
// Read a complete file into a newly allocated, NUL-terminated string.
//
//   bones_filename  path of the file to load (opened in text mode)
//
// Returns a malloc'ed buffer that the caller must free(), or NULL
// (after printing a message to stderr) when the file cannot be
// opened, its size cannot be determined or memory runs out.
char * get_source(const char* bones_filename) {
  if (bones_filename == NULL) {
    fprintf(stderr,"--- Error: No kernel file name given\n");
    return NULL;
  }

  FILE* bones_fp = fopen(bones_filename,"r");
  if (bones_fp == NULL) {
    fprintf(stderr,"--- Error: Could not open '%s'\n",bones_filename);
    return NULL;
  }

  // Determine the file size by seeking to the end
  long bones_size = -1;
  if (fseek(bones_fp,0,SEEK_END) == 0) {
    bones_size = ftell(bones_fp);
  }

  // Allocate room for the contents plus the terminator
  // (the cast keeps the code valid when built by a C++ compiler)
  char *bones_source = NULL;
  if (bones_size >= 0) {
    rewind(bones_fp);
    bones_source = (char *)malloc((size_t)bones_size+1);
  }

  if (bones_source != NULL) {
    // Terminate after the bytes that were actually read: in text mode
    // this can be fewer than the size reported by ftell()
    size_t bones_read = fread(bones_source,1,(size_t)bones_size,bones_fp);
    bones_source[bones_read] = '\0';
  }
  else {
    fprintf(stderr,"--- Error: Could not read '%s'\n",bones_filename);
  }

  fclose(bones_fp);
  return bones_source;
}
28
+
29
+ // Print an error if it occurs
30
+ void error_check(cl_int bones_errors) {
31
+ if(bones_errors != CL_SUCCESS) {
32
+ switch (bones_errors) {
33
+ case CL_DEVICE_NOT_FOUND: printf("--- Error: Device not found.\n"); break;
34
+ case CL_DEVICE_NOT_AVAILABLE: printf("--- Error: Device not available\n"); break;
35
+ case CL_COMPILER_NOT_AVAILABLE: printf("--- Error: Compiler not available\n"); break;
36
+ case CL_MEM_OBJECT_ALLOCATION_FAILURE: printf("--- Error: Memory object allocation failure\n"); break;
37
+ case CL_OUT_OF_RESOURCES: printf("--- Error: Out of resources\n"); break;
38
+ case CL_OUT_OF_HOST_MEMORY: printf("--- Error: Out of host memory\n"); break;
39
+ case CL_PROFILING_INFO_NOT_AVAILABLE: printf("--- Error: Profiling information not available\n"); break;
40
+ case CL_MEM_COPY_OVERLAP: printf("--- Error: Memory copy overlap\n"); break;
41
+ case CL_IMAGE_FORMAT_MISMATCH: printf("--- Error: Image format mismatch\n"); break;
42
+ case CL_IMAGE_FORMAT_NOT_SUPPORTED: printf("--- Error: Image format not supported\n"); break;
43
+ case CL_BUILD_PROGRAM_FAILURE: printf("--- Error: Program build failure\n"); break;
44
+ case CL_MAP_FAILURE: printf("--- Error: Map failure\n"); break;
45
+ case CL_INVALID_VALUE: printf("--- Error: Invalid value\n"); break;
46
+ case CL_INVALID_DEVICE_TYPE: printf("--- Error: Invalid device type\n"); break;
47
+ case CL_INVALID_PLATFORM: printf("--- Error: Invalid platform\n"); break;
48
+ case CL_INVALID_DEVICE: printf("--- Error: Invalid device\n"); break;
49
+ case CL_INVALID_CONTEXT: printf("--- Error: Invalid context\n"); break;
50
+ case CL_INVALID_QUEUE_PROPERTIES: printf("--- Error: Invalid queue properties\n"); break;
51
+ case CL_INVALID_COMMAND_QUEUE: printf("--- Error: Invalid command queue\n"); break;
52
+ case CL_INVALID_HOST_PTR: printf("--- Error: Invalid host pointer\n"); break;
53
+ case CL_INVALID_MEM_OBJECT: printf("--- Error: Invalid memory object\n"); break;
54
+ case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: printf("--- Error: Invalid image format descriptor\n"); break;
55
+ case CL_INVALID_IMAGE_SIZE: printf("--- Error: Invalid image size\n"); break;
56
+ case CL_INVALID_SAMPLER: printf("--- Error: Invalid sampler\n"); break;
57
+ case CL_INVALID_BINARY: printf("--- Error: Invalid binary\n"); break;
58
+ case CL_INVALID_BUILD_OPTIONS: printf("--- Error: Invalid build options\n"); break;
59
+ case CL_INVALID_PROGRAM: printf("--- Error: Invalid program\n"); break;
60
+ case CL_INVALID_PROGRAM_EXECUTABLE: printf("--- Error: Invalid program executable\n"); break;
61
+ case CL_INVALID_KERNEL_NAME: printf("--- Error: Invalid kernel name\n"); break;
62
+ case CL_INVALID_KERNEL_DEFINITION: printf("--- Error: Invalid kernel definition\n"); break;
63
+ case CL_INVALID_KERNEL: printf("--- Error: Invalid kernel\n"); break;
64
+ case CL_INVALID_ARG_INDEX: printf("--- Error: Invalid argument index\n"); break;
65
+ case CL_INVALID_ARG_VALUE: printf("--- Error: Invalid argument value\n"); break;
66
+ case CL_INVALID_ARG_SIZE: printf("--- Error: Invalid argument size\n"); break;
67
+ case CL_INVALID_KERNEL_ARGS: printf("--- Error: Invalid kernel arguments\n"); break;
68
+ case CL_INVALID_WORK_DIMENSION: printf("--- Error: Invalid work dimensionsension\n"); break;
69
+ case CL_INVALID_WORK_GROUP_SIZE: printf("--- Error: Invalid work group size\n"); break;
70
+ case CL_INVALID_WORK_ITEM_SIZE: printf("--- Error: Invalid work item size\n"); break;
71
+ case CL_INVALID_GLOBAL_OFFSET: printf("--- Error: Invalid global offset\n"); break;
72
+ case CL_INVALID_EVENT_WAIT_LIST: printf("--- Error: Invalid event wait list\n"); break;
73
+ case CL_INVALID_EVENT: printf("--- Error: Invalid event\n"); break;
74
+ case CL_INVALID_OPERATION: printf("--- Error: Invalid operation\n"); break;
75
+ case CL_INVALID_GL_OBJECT: printf("--- Error: Invalid OpenGL object\n"); break;
76
+ case CL_INVALID_BUFFER_SIZE: printf("--- Error: Invalid buffer size\n"); break;
77
+ case CL_INVALID_MIP_LEVEL: printf("--- Error: Invalid mip-map level\n"); break;
78
+ default: printf("--- Error: Unknown with code %d\n", bones_errors);
79
+ }
80
+ fflush(stdout); exit(0);
81
+ }
82
+ }
83
+
84
+ // Use a global variable for the device ID, context and command queue
85
+ cl_device_id bones_device;
86
+ cl_context bones_context;
87
+ cl_command_queue bones_queue;
88
+
89
+ // Use a global variable to store the name and the binary for the last program
90
+ char bones_last_program[1024];
91
+ cl_program bones_program;
92
+
93
+ // Function to initialize the OpenCL platform (create to ensure fair measurements afterwards)
94
+ void bones_initialize_target(void) {
95
+ cl_int bones_errors;
96
+
97
+ // Get OpenCL platform count
98
+ cl_uint bones_num_platforms;
99
+ bones_errors = clGetPlatformIDs(0,NULL,&bones_num_platforms); error_check(bones_errors);
100
+ if (bones_num_platforms == 0) { printf("Error: No OpenCL platforms found.\n"); exit(1); }
101
+
102
+ // Get all OpenCL platform IDs
103
+ cl_platform_id bones_platform_ids[10];
104
+ bones_errors = clGetPlatformIDs(bones_num_platforms,bones_platform_ids,NULL); error_check(bones_errors);
105
+
106
+ // Select the AMD APP platform
107
+ char bones_buffer[1024];
108
+ cl_uint bones_platform;
109
+ for(cl_uint bones_platform_id=0; bones_platform_id<bones_num_platforms; bones_platform_id++) {
110
+ clGetPlatformInfo(bones_platform_ids[bones_platform_id], CL_PLATFORM_NAME, 1024, bones_buffer, NULL);
111
+ if(strstr(bones_buffer,"AMD") != NULL) { bones_platform = bones_platform_id; break; }
112
+ }
113
+
114
+ // Get a CPU device on the platform
115
+ bones_errors = clGetDeviceIDs(bones_platform_ids[bones_platform], CL_DEVICE_TYPE_CPU, 1, &bones_device, NULL); error_check(bones_errors);
116
+ bones_errors = clGetDeviceInfo(bones_device, CL_DEVICE_NAME, sizeof(bones_buffer), bones_buffer, NULL); error_check(bones_errors);
117
+
118
+ // Create a context
119
+ bones_context = clCreateContext(0,1,&bones_device,NULL,NULL,&bones_errors); error_check(bones_errors);
120
+
121
+ // Create a command queue
122
+ bones_queue = clCreateCommandQueue(bones_context,bones_device,CL_QUEUE_PROFILING_ENABLE,&bones_errors); error_check(bones_errors);
123
+
124
+ // Create space on the device
125
+ cl_mem bones_device_data = clCreateBuffer(bones_context,CL_MEM_READ_WRITE,4,NULL,&bones_errors); error_check(bones_errors);
126
+
127
+ // Copy something to the device
128
+ bones_device_data = clCreateBuffer(bones_context,CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,4,bones_buffer,NULL);
129
+
130
+ // Clean-up the OpenCL context
131
+ strcpy(bones_last_program,"");
132
+ clReleaseMemObject(bones_device_data);
133
+ clReleaseContext(bones_context);
134
+ fflush(stdout);
135
+ }
136
+
137
+ // Declaration of the original function
138
+ int bones_main(void);
139
+
140
+ // New main function for initialisation and clean-up
141
+ int main(void) {
142
+
143
+ // Initialisation
144
+ bones_initialize_target();
145
+
146
+ // Original main function
147
+ int bones_return = bones_main();
148
+
149
+ // Clean-up
150
+ clReleaseCommandQueue(bones_queue);
151
+ clReleaseProgram(bones_program);
152
+ clReleaseContext(bones_context);
153
+ return bones_return;
154
+ }
155
+
@@ -0,0 +1,4 @@
1
// Small arithmetic helpers for the kernel code. Every argument and
// every expansion is fully parenthesised so that expression
// arguments (e.g. DIV_FLOOR(n, a+b)) keep their intended grouping.
// NOTE: arguments may still be evaluated more than once, so do not
// pass expressions with side effects.
#define BONES_MIN(a,b) (((a) < (b)) ? (a) : (b))
#define BONES_MAX(a,b) (((a) > (b)) ? (a) : (b))
// Integer division rounded up / down (for non-negative operands)
#define DIV_CEIL(a,b) (((a) + (b) - 1) / (b))
#define DIV_FLOOR(a,b) ((a) / (b))
File without changes
@@ -0,0 +1,8 @@
1
+
2
+ // Perform a zero-copy of <array> from device to host
3
+ //void* bones_pointer_to_<array> = clEnqueueMapBuffer(bones_queue,device_<array>,CL_TRUE,CL_MAP_READ,<offset>,<variable_dimensions>*sizeof(<type>),0,NULL,NULL,&bones_errors); error_check(bones_errors);
4
+ //clEnqueueUnmapMemObject(bones_queue,device_<array>,bones_pointer_to_<array>,0,NULL,NULL);
5
+
6
+ // Perform a copy of <array> from device to host
7
+ clEnqueueReadBuffer(bones_queue,device_<array>,CL_TRUE,(<offset>)*sizeof(<type>),<variable_dimensions>*sizeof(<type>),<array><flatten>+<offset>,0,NULL,NULL);
8
+ clFinish(bones_queue);
@@ -0,0 +1,4 @@
1
+
2
+ // Copy <array> to the device
3
+ device_<array> = clCreateBuffer(bones_context,CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,<variable_dimensions>*sizeof(<type>),<array><flatten>,NULL);
4
+ clFinish(bones_queue);
@@ -0,0 +1,3 @@
1
+
2
+ // Clean up device arrays
3
+ clReleaseMemObject(device_<array>);
@@ -0,0 +1,6 @@
1
+
2
+ // Create a device pointer for <array> (zero-copy)
3
+ //cl_mem device_<array> = clCreateBuffer(bones_context,CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR,<variable_dimensions>*sizeof(<type>),<array><flatten>,&bones_errors); error_check(bones_errors);
4
+
5
+ // Create a device pointer for <array>
6
+ cl_mem device_<array> = clCreateBuffer(bones_context,CL_MEM_READ_WRITE,<variable_dimensions>*sizeof(<type>),NULL,&bones_errors); error_check(bones_errors);
@@ -0,0 +1,24 @@
1
+ fflush(stdout);
2
+ cl_int bones_errors;
3
+ cl_event bones_event;
4
+
5
+ // Only compile if this program is different from the last one
6
+ if (strcmp(bones_last_program,"<algorithm_filename>") != 0) {
7
+ strcpy(bones_last_program,"<algorithm_filename>");
8
+
9
+ // Load and compile the kernel
10
+ char *bones_source = get_source("<algorithm_filename>_device.cl");
11
+ bones_program = clCreateProgramWithSource(bones_context,1,(const char **)&bones_source,NULL,&bones_errors); error_check(bones_errors);
12
+ bones_errors = clBuildProgram(bones_program,0,NULL,"-cl-single-precision-constant",NULL,NULL);
13
+
14
+ // Get and print the compiler log
15
+ char* bones_log;
16
+ size_t bones_log_size;
17
+ clGetProgramBuildInfo(bones_program,bones_device,CL_PROGRAM_BUILD_LOG,0,NULL,&bones_log_size);
18
+ bones_log = (char*)malloc((bones_log_size+1)*sizeof(char));
19
+ clGetProgramBuildInfo(bones_program,bones_device,CL_PROGRAM_BUILD_LOG,bones_log_size,bones_log, NULL);
20
+ bones_log[bones_log_size] = '\0';
21
+ //if (strcmp(bones_log,"\n") != 0 && strcmp(bones_log,"") != 0) { printf("--------- \n--- Compilation log:\n--------- \n%s\n",bones_log); }
22
+ free(bones_log);
23
+ error_check(bones_errors);
24
+ }
@@ -0,0 +1,5 @@
1
+
2
+ // Start the timer for the measurement of the kernel and memory copy execution time
3
+ struct timeval bones_start_time1;
4
+ clFinish(bones_queue);
5
+ gettimeofday(&bones_start_time1, NULL);
@@ -0,0 +1,9 @@
1
+
2
+ // End the timer for the measurement of the kernel and memory copy execution time
3
+ #if (ITERS == 1)
4
+ clFinish(bones_queue);
5
+ struct timeval bones_end_time1;
6
+ gettimeofday(&bones_end_time1, NULL);
7
+ float bones_timer1 = 0.001 * (1000000*(bones_end_time1.tv_sec-bones_start_time1.tv_sec)+bones_end_time1.tv_usec-bones_start_time1.tv_usec);
8
+ printf(">>>\t\t (<algorithm_basename>): Execution time [kernel+memcpy]: %.3lf ms \n", bones_timer1);
9
+ #endif
@@ -0,0 +1,16 @@
1
+
2
+ // Start the timer for the measurement of the kernel execution time
3
+ clFinish(bones_queue);
4
+ for (int bones_iter=0; bones_iter<ITERS; bones_iter++) {
5
+
6
+ // Flush the CPU cache by writing a scratch buffer 10 times over (for measurement purposes only)
7
+ const int bones_flush_size = 4*1024*1024; // 4M chars of one byte each, so 4MB (not 16MB)
8
+ int bones_flush_i;
9
+ int bones_flush_j;
10
+ char *bones_flush_c = (char *)malloc(bones_flush_size);
11
+ for (bones_flush_i=0; bones_flush_i<10; bones_flush_i++) {
12
+ for (bones_flush_j=0; bones_flush_j<bones_flush_size; bones_flush_j++) {
13
+ bones_flush_c[bones_flush_j] = bones_flush_i*bones_flush_j;
14
+ }
15
+ }
16
+ free(bones_flush_c);
@@ -0,0 +1,11 @@
1
+
2
+ }
3
+
4
+ // Stop the timer for the measurement of the kernel execution time
5
+ clFinish(bones_queue);
6
+ cl_ulong end2, start2;
7
+ bones_errors = clWaitForEvents(1, &bones_event); error_check(bones_errors);
8
+ bones_errors = clGetEventProfilingInfo(bones_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end2, 0); error_check(bones_errors);
9
+ bones_errors = clGetEventProfilingInfo(bones_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start2, 0); error_check(bones_errors);
10
+ float bones_timer2 = 0.000001 * (end2-start2);
11
+ printf(">>>\t\t (<algorithm_basename>): Execution time [kernel ]: %.3lf ms \n", bones_timer2);
@@ -0,0 +1,67 @@
1
+
2
+ // Store the initial value
3
+ cl_mem bones_initial_value = clCreateBuffer(bones_context,CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,sizeof(<out0_type>),<out0_name>,&bones_errors); error_check(bones_errors);
4
+
5
+ // Create the kernels
6
+ cl_kernel bones_kernel_<algorithm_name>_0 = clCreateKernel(bones_program, "bones_kernel_<algorithm_name>_0", &bones_errors); error_check(bones_errors);
7
+ cl_kernel bones_kernel_<algorithm_name>_1 = clCreateKernel(bones_program, "bones_kernel_<algorithm_name>_1", &bones_errors); error_check(bones_errors);
8
+ cl_kernel bones_kernel_<algorithm_name>_2 = clCreateKernel(bones_program, "bones_kernel_<algorithm_name>_2", &bones_errors); error_check(bones_errors);
9
+
10
+ // Run either one kernel or multiple kernels
11
+ if (<in0_dimensions> <= 512) {
12
+
13
+ // Set all the arguments to the kernel function
14
+ int bones_num_args = 3;
15
+ int bones_dimensions = <in0_dimensions>;
16
+ clSetKernelArg(bones_kernel_<algorithm_name>_0,0,sizeof(bones_dimensions),(void*)&bones_dimensions);
17
+ clSetKernelArg(bones_kernel_<algorithm_name>_0,1,sizeof(<in0_devicename>),(void*)&<in0_devicename>);
18
+ clSetKernelArg(bones_kernel_<algorithm_name>_0,2,sizeof(<out0_devicename>),(void*)&<out0_devicename>);
19
+ <kernel_argument_list_constants>
20
+ // Start only one kernel
21
+ const int bones_num_threads = DIV_CEIL(<in0_dimensions>,2);
22
+ size_t bones_local_worksize1[] = {bones_num_threads};
23
+ size_t bones_global_worksize1[] = {bones_num_threads};
24
+ bones_errors = clEnqueueNDRangeKernel(bones_queue,bones_kernel_<algorithm_name>_0,1,NULL,bones_global_worksize1,bones_local_worksize1,0,NULL,&bones_event); error_check(bones_errors);
25
+
26
+ }
27
+ else {
28
+
29
+ // Allocate space for an intermediate array
30
+ cl_mem bones_device_temp = clCreateBuffer(bones_context,CL_MEM_READ_WRITE,128*sizeof(<out0_type>),NULL,&bones_errors); error_check(bones_errors);
31
+
32
+ // Set all the arguments to the kernel function
33
+ int bones_num_args = 3;
34
+ int bones_dimensions = <in0_dimensions>;
35
+ clSetKernelArg(bones_kernel_<algorithm_name>_0,0,sizeof(bones_dimensions),(void*)&bones_dimensions);
36
+ clSetKernelArg(bones_kernel_<algorithm_name>_0,1,sizeof(<in0_devicename>),(void*)&<in0_devicename>);
37
+ clSetKernelArg(bones_kernel_<algorithm_name>_0,2,sizeof(bones_device_temp),(void*)&bones_device_temp);
38
+ <kernel_argument_list_constants>
39
+ // Start the first kernel
40
+ size_t bones_local_worksize1[] = {256};
41
+ size_t bones_global_worksize1[] = {256*128};
42
+ bones_errors = clEnqueueNDRangeKernel(bones_queue,bones_kernel_<algorithm_name>_0,1,NULL,bones_global_worksize1,bones_local_worksize1,0,NULL,&bones_event); error_check(bones_errors);
43
+
44
+ // Set all the arguments to the kernel function
45
+ clSetKernelArg(bones_kernel_<algorithm_name>_1,0,sizeof(bones_device_temp),(void*)&bones_device_temp);
46
+ clSetKernelArg(bones_kernel_<algorithm_name>_1,1,sizeof(<out0_devicename>),(void*)&<out0_devicename>);
47
+ // Start the second kernel
48
+ size_t bones_local_worksize2[] = {128};
49
+ size_t bones_global_worksize2[] = {128};
50
+ bones_errors = clEnqueueNDRangeKernel(bones_queue,bones_kernel_<algorithm_name>_1,1,NULL,bones_global_worksize2,bones_local_worksize2,0,NULL,&bones_event); error_check(bones_errors);
51
+ clReleaseMemObject(bones_device_temp);
52
+ }
53
+
54
+ // Set all the arguments to the kernel function
55
+ clSetKernelArg(bones_kernel_<algorithm_name>_2,0,sizeof(bones_initial_value),(void*)&bones_initial_value);
56
+ clSetKernelArg(bones_kernel_<algorithm_name>_2,1,sizeof(<out0_devicename>),(void*)&<out0_devicename>);
57
+ // Perform the last computation (only needed if there is an initial value)
58
+ size_t bones_local_worksize3[] = {1};
59
+ size_t bones_global_worksize3[] = {1};
60
+ bones_errors = clEnqueueNDRangeKernel(bones_queue,bones_kernel_<algorithm_name>_2,1,NULL,bones_global_worksize3,bones_local_worksize3,0,NULL,&bones_event); error_check(bones_errors);
61
+ clReleaseMemObject(bones_initial_value);
62
+
63
+ // Synchronize and clean-up the kernels
64
+ clFinish(bones_queue);
65
+ clReleaseKernel(bones_kernel_<algorithm_name>_0);
66
+ clReleaseKernel(bones_kernel_<algorithm_name>_1);
67
+ clReleaseKernel(bones_kernel_<algorithm_name>_2);