bones-compiler 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (203) hide show
  1. data/CHANGELOG +117 -0
  2. data/LICENSE +9 -0
  3. data/README.rdoc +126 -0
  4. data/Rakefile +107 -0
  5. data/VERSION +1 -0
  6. data/bin/bones +20 -0
  7. data/examples/applications/ffos.c +552 -0
  8. data/examples/benchmarks/2mm.c +70 -0
  9. data/examples/benchmarks/3mm.c +81 -0
  10. data/examples/benchmarks/adi.c +81 -0
  11. data/examples/benchmarks/atax.c +65 -0
  12. data/examples/benchmarks/bicg.c +67 -0
  13. data/examples/benchmarks/cholesky.c +64 -0
  14. data/examples/benchmarks/common.h +168 -0
  15. data/examples/benchmarks/correlation.c +97 -0
  16. data/examples/benchmarks/covariance.c +77 -0
  17. data/examples/benchmarks/doitgen.c +63 -0
  18. data/examples/benchmarks/durbin.c +76 -0
  19. data/examples/benchmarks/dynprog.c +67 -0
  20. data/examples/benchmarks/fdtd-2d-apml.c +114 -0
  21. data/examples/benchmarks/fdtd-2d.c +74 -0
  22. data/examples/benchmarks/floyd-warshall.c +50 -0
  23. data/examples/benchmarks/gemm.c +69 -0
  24. data/examples/benchmarks/gemver.c +89 -0
  25. data/examples/benchmarks/gesummv.c +64 -0
  26. data/examples/benchmarks/gramschmidt.c +84 -0
  27. data/examples/benchmarks/jacobi-1d-imper.c +55 -0
  28. data/examples/benchmarks/jacobi-2d-imper.c +61 -0
  29. data/examples/benchmarks/lu.c +57 -0
  30. data/examples/benchmarks/ludcmp.c +91 -0
  31. data/examples/benchmarks/mvt.c +65 -0
  32. data/examples/benchmarks/overview.txt +38 -0
  33. data/examples/benchmarks/reg_detect.c +82 -0
  34. data/examples/benchmarks/saxpy.c +45 -0
  35. data/examples/benchmarks/seidel-2d.c +51 -0
  36. data/examples/benchmarks/symm.c +74 -0
  37. data/examples/benchmarks/syr2k.c +65 -0
  38. data/examples/benchmarks/syrk.c +62 -0
  39. data/examples/benchmarks/trisolv.c +57 -0
  40. data/examples/benchmarks/trmm.c +57 -0
  41. data/examples/chunk/example1.c +54 -0
  42. data/examples/chunk/example2.c +44 -0
  43. data/examples/chunk/example3.c +59 -0
  44. data/examples/chunk/example4.c +55 -0
  45. data/examples/chunk/example5.c +52 -0
  46. data/examples/element/example1.c +46 -0
  47. data/examples/element/example10.c +50 -0
  48. data/examples/element/example11.c +47 -0
  49. data/examples/element/example12.c +56 -0
  50. data/examples/element/example2.c +46 -0
  51. data/examples/element/example3.c +58 -0
  52. data/examples/element/example4.c +49 -0
  53. data/examples/element/example5.c +56 -0
  54. data/examples/element/example6.c +46 -0
  55. data/examples/element/example7.c +54 -0
  56. data/examples/element/example8.c +45 -0
  57. data/examples/element/example9.c +48 -0
  58. data/examples/neighbourhood/example1.c +54 -0
  59. data/examples/neighbourhood/example2.c +55 -0
  60. data/examples/neighbourhood/example3.c +82 -0
  61. data/examples/neighbourhood/example4.c +52 -0
  62. data/examples/shared/example1.c +45 -0
  63. data/examples/shared/example2.c +51 -0
  64. data/examples/shared/example3.c +55 -0
  65. data/examples/shared/example4.c +52 -0
  66. data/examples/shared/example5.c +48 -0
  67. data/lib/bones.rb +266 -0
  68. data/lib/bones/algorithm.rb +541 -0
  69. data/lib/bones/engine.rb +386 -0
  70. data/lib/bones/preprocessor.rb +161 -0
  71. data/lib/bones/species.rb +196 -0
  72. data/lib/bones/structure.rb +94 -0
  73. data/lib/bones/variable.rb +169 -0
  74. data/lib/bones/variablelist.rb +72 -0
  75. data/lib/castaddon.rb +27 -0
  76. data/lib/castaddon/index.rb +40 -0
  77. data/lib/castaddon/node.rb +753 -0
  78. data/lib/castaddon/type.rb +37 -0
  79. data/skeletons/CPU-C/common/epilogue.c +0 -0
  80. data/skeletons/CPU-C/common/globals.c +17 -0
  81. data/skeletons/CPU-C/common/globals_kernel.c +1 -0
  82. data/skeletons/CPU-C/common/header.c +0 -0
  83. data/skeletons/CPU-C/common/mem_copy_D2H.c +0 -0
  84. data/skeletons/CPU-C/common/mem_copy_H2D.c +0 -0
  85. data/skeletons/CPU-C/common/mem_epilogue.c +0 -0
  86. data/skeletons/CPU-C/common/mem_prologue.c +3 -0
  87. data/skeletons/CPU-C/common/prologue.c +0 -0
  88. data/skeletons/CPU-C/common/timer_1_start.c +0 -0
  89. data/skeletons/CPU-C/common/timer_1_stop.c +0 -0
  90. data/skeletons/CPU-C/common/timer_2_start.c +20 -0
  91. data/skeletons/CPU-C/common/timer_2_stop.c +8 -0
  92. data/skeletons/CPU-C/kernel/default.host.c +3 -0
  93. data/skeletons/CPU-C/kernel/default.kernel.c +15 -0
  94. data/skeletons/CPU-C/skeletons.txt +24 -0
  95. data/skeletons/CPU-OPENCL-AMD/common/epilogue.c +6 -0
  96. data/skeletons/CPU-OPENCL-AMD/common/globals.c +155 -0
  97. data/skeletons/CPU-OPENCL-AMD/common/globals_kernel.c +4 -0
  98. data/skeletons/CPU-OPENCL-AMD/common/header.c +0 -0
  99. data/skeletons/CPU-OPENCL-AMD/common/mem_copy_D2H.c +8 -0
  100. data/skeletons/CPU-OPENCL-AMD/common/mem_copy_H2D.c +4 -0
  101. data/skeletons/CPU-OPENCL-AMD/common/mem_epilogue.c +3 -0
  102. data/skeletons/CPU-OPENCL-AMD/common/mem_prologue.c +6 -0
  103. data/skeletons/CPU-OPENCL-AMD/common/prologue.c +24 -0
  104. data/skeletons/CPU-OPENCL-AMD/common/timer_1_start.c +5 -0
  105. data/skeletons/CPU-OPENCL-AMD/common/timer_1_stop.c +9 -0
  106. data/skeletons/CPU-OPENCL-AMD/common/timer_2_start.c +16 -0
  107. data/skeletons/CPU-OPENCL-AMD/common/timer_2_stop.c +11 -0
  108. data/skeletons/CPU-OPENCL-AMD/kernel/D-element-to-1-shared.host.c +67 -0
  109. data/skeletons/CPU-OPENCL-AMD/kernel/D-element-to-1-shared.kernel.cl +72 -0
  110. data/skeletons/CPU-OPENCL-AMD/kernel/default.host.c +14 -0
  111. data/skeletons/CPU-OPENCL-AMD/kernel/default.kernel.cl +13 -0
  112. data/skeletons/CPU-OPENCL-AMD/skeletons.txt +26 -0
  113. data/skeletons/CPU-OPENCL-INTEL/common/epilogue.c +3 -0
  114. data/skeletons/CPU-OPENCL-INTEL/common/globals.c +154 -0
  115. data/skeletons/CPU-OPENCL-INTEL/common/globals_kernel.c +4 -0
  116. data/skeletons/CPU-OPENCL-INTEL/common/header.c +31 -0
  117. data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_D2H.c +5 -0
  118. data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_H2D.c +3 -0
  119. data/skeletons/CPU-OPENCL-INTEL/common/mem_epilogue.c +3 -0
  120. data/skeletons/CPU-OPENCL-INTEL/common/mem_prologue.c +4 -0
  121. data/skeletons/CPU-OPENCL-INTEL/common/prologue.c +24 -0
  122. data/skeletons/CPU-OPENCL-INTEL/common/timer_1_start.c +5 -0
  123. data/skeletons/CPU-OPENCL-INTEL/common/timer_1_stop.c +9 -0
  124. data/skeletons/CPU-OPENCL-INTEL/common/timer_2_start.c +16 -0
  125. data/skeletons/CPU-OPENCL-INTEL/common/timer_2_stop.c +11 -0
  126. data/skeletons/CPU-OPENCL-INTEL/kernel/D-element-to-1-shared.host.c +67 -0
  127. data/skeletons/CPU-OPENCL-INTEL/kernel/D-element-to-1-shared.kernel.cl +72 -0
  128. data/skeletons/CPU-OPENCL-INTEL/kernel/default.host.c +14 -0
  129. data/skeletons/CPU-OPENCL-INTEL/kernel/default.kernel.cl +13 -0
  130. data/skeletons/CPU-OPENCL-INTEL/skeletons.txt +26 -0
  131. data/skeletons/CPU-OPENMP/common/epilogue.c +0 -0
  132. data/skeletons/CPU-OPENMP/common/globals.c +37 -0
  133. data/skeletons/CPU-OPENMP/common/globals_kernel.c +6 -0
  134. data/skeletons/CPU-OPENMP/common/header.c +0 -0
  135. data/skeletons/CPU-OPENMP/common/mem_copy_D2H.c +0 -0
  136. data/skeletons/CPU-OPENMP/common/mem_copy_H2D.c +0 -0
  137. data/skeletons/CPU-OPENMP/common/mem_epilogue.c +0 -0
  138. data/skeletons/CPU-OPENMP/common/mem_prologue.c +3 -0
  139. data/skeletons/CPU-OPENMP/common/prologue.c +0 -0
  140. data/skeletons/CPU-OPENMP/common/timer_1_start.c +12 -0
  141. data/skeletons/CPU-OPENMP/common/timer_1_stop.c +0 -0
  142. data/skeletons/CPU-OPENMP/common/timer_2_start.c +18 -0
  143. data/skeletons/CPU-OPENMP/common/timer_2_stop.c +8 -0
  144. data/skeletons/CPU-OPENMP/kernel/D-element-to-1-shared.host.c +27 -0
  145. data/skeletons/CPU-OPENMP/kernel/D-element-to-1-shared.kernel.c +46 -0
  146. data/skeletons/CPU-OPENMP/kernel/default.host.c +11 -0
  147. data/skeletons/CPU-OPENMP/kernel/default.kernel.c +18 -0
  148. data/skeletons/CPU-OPENMP/skeletons.txt +26 -0
  149. data/skeletons/GPU-CUDA/common/epilogue.c +0 -0
  150. data/skeletons/GPU-CUDA/common/globals.c +31 -0
  151. data/skeletons/GPU-CUDA/common/globals_kernel.c +4 -0
  152. data/skeletons/GPU-CUDA/common/header.c +0 -0
  153. data/skeletons/GPU-CUDA/common/mem_copy_D2H.c +3 -0
  154. data/skeletons/GPU-CUDA/common/mem_copy_H2D.c +3 -0
  155. data/skeletons/GPU-CUDA/common/mem_epilogue.c +3 -0
  156. data/skeletons/GPU-CUDA/common/mem_prologue.c +5 -0
  157. data/skeletons/GPU-CUDA/common/prologue.c +6 -0
  158. data/skeletons/GPU-CUDA/common/timer_1_start.c +6 -0
  159. data/skeletons/GPU-CUDA/common/timer_1_stop.c +10 -0
  160. data/skeletons/GPU-CUDA/common/timer_2_start.c +6 -0
  161. data/skeletons/GPU-CUDA/common/timer_2_stop.c +10 -0
  162. data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.host.c +3 -0
  163. data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.kernel.cu +105 -0
  164. data/skeletons/GPU-CUDA/kernel/D-element-to-1-shared.host.c +3 -0
  165. data/skeletons/GPU-CUDA/kernel/D-element-to-1-shared.kernel.cu +119 -0
  166. data/skeletons/GPU-CUDA/kernel/D-element-to-N-shared.host.c +3 -0
  167. data/skeletons/GPU-CUDA/kernel/D-element-to-N-shared.kernel.cu +166 -0
  168. data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.host.c +3 -0
  169. data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.kernel.cu +69 -0
  170. data/skeletons/GPU-CUDA/kernel/N-neighbourhood-N-to-N-element.host.c +3 -0
  171. data/skeletons/GPU-CUDA/kernel/N-neighbourhood-N-to-N-element.kernel.cu +42 -0
  172. data/skeletons/GPU-CUDA/kernel/default.host.c +3 -0
  173. data/skeletons/GPU-CUDA/kernel/default.kernel.cu +28 -0
  174. data/skeletons/GPU-CUDA/skeletons.txt +30 -0
  175. data/skeletons/GPU-OPENCL-AMD/common/epilogue.c +3 -0
  176. data/skeletons/GPU-OPENCL-AMD/common/globals.c +155 -0
  177. data/skeletons/GPU-OPENCL-AMD/common/globals_kernel.c +4 -0
  178. data/skeletons/GPU-OPENCL-AMD/common/header.c +0 -0
  179. data/skeletons/GPU-OPENCL-AMD/common/mem_copy_D2H.c +4 -0
  180. data/skeletons/GPU-OPENCL-AMD/common/mem_copy_H2D.c +4 -0
  181. data/skeletons/GPU-OPENCL-AMD/common/mem_epilogue.c +3 -0
  182. data/skeletons/GPU-OPENCL-AMD/common/mem_prologue.c +3 -0
  183. data/skeletons/GPU-OPENCL-AMD/common/prologue.c +24 -0
  184. data/skeletons/GPU-OPENCL-AMD/common/timer_1_start.c +5 -0
  185. data/skeletons/GPU-OPENCL-AMD/common/timer_1_stop.c +9 -0
  186. data/skeletons/GPU-OPENCL-AMD/common/timer_2_start.c +4 -0
  187. data/skeletons/GPU-OPENCL-AMD/common/timer_2_stop.c +11 -0
  188. data/skeletons/GPU-OPENCL-AMD/kernel/D-element-to-1-shared.host.c +67 -0
  189. data/skeletons/GPU-OPENCL-AMD/kernel/D-element-to-1-shared.kernel.cl +72 -0
  190. data/skeletons/GPU-OPENCL-AMD/kernel/default.host.c +14 -0
  191. data/skeletons/GPU-OPENCL-AMD/kernel/default.kernel.cl +13 -0
  192. data/skeletons/GPU-OPENCL-AMD/skeletons.txt +26 -0
  193. data/skeletons/verification/header.c +2 -0
  194. data/skeletons/verification/timer_start.c +4 -0
  195. data/skeletons/verification/timer_stop.c +6 -0
  196. data/skeletons/verification/verify_results.c +23 -0
  197. data/test/bones/test_algorithm.rb +40 -0
  198. data/test/bones/test_common.rb +54 -0
  199. data/test/bones/test_preprocessor.rb +46 -0
  200. data/test/bones/test_species.rb +21 -0
  201. data/test/bones/test_variable.rb +84 -0
  202. data/test/test_helper.rb +106 -0
  203. metadata +303 -0
@@ -0,0 +1,37 @@
1
module C
  # Extension of the CAST type class. It bundles a few helper
  # methods that apply to types such as pointers, arrays,
  # structures, floats and integers.
  #
  # The helpers only complement the CAST functionality and keep
  # the Bones classes tidy.
  class Type

    # Tells whether this type is an array or a pointer.
    # Returns either true or false.
    def array_or_pointer?
      klass = self.class
      klass == C::Array || klass == C::Pointer
    end

    # Resolves the underlying type of a variable. Arrays and
    # pointers are unwrapped recursively through their +type+
    # until an object that is neither is reached; that object is
    # returned (in CAST presumably one of void, int, float, char,
    # bool, complex or imaginary).
    def type_name
      return self unless array_or_pointer?
      self.type.type_name
    end

    # Returns the dimension of the variable as an integer. Every
    # array or pointer level (each '*' or '[]') adds one to
    # +count+ before recursing into the wrapped type. A type that
    # is neither an array nor a pointer returns +count+ unchanged,
    # which is zero when called without an argument.
    def dimensions(count=0)
      if array_or_pointer?
        self.type.dimensions(count+1)
      else
        count
      end
    end

  end

end
37
+
File without changes
@@ -0,0 +1,17 @@
1
+
2
+ // Multiple iterations for measurements
3
+ #define ITERS 1
4
+
5
// Forward declaration of the original main function
int bones_main(void);

// Replacement entry point. This target needs no initialisation or clean-up,
// so the result of the original main function is handed back unchanged.
int main(void) {
  return bones_main();
}
17
+
@@ -0,0 +1 @@
1
+ #include <math.h>
File without changes
File without changes
File without changes
File without changes
@@ -0,0 +1,3 @@
1
+
2
+ // Create a pointer to <array> on the device
3
+ <type>* device_<array> = <array><flatten>;
File without changes
File without changes
File without changes
@@ -0,0 +1,20 @@
1
+ // Initialize the timer
2
+ float bones_timer2 = 0;
3
+ struct timeval bones_start_time2;
4
+ struct timeval bones_end_time2;
5
+ for (int bones_iter=0; bones_iter<ITERS; bones_iter++) {
6
+
7
+ // Flush the CPU cache (for measurement purposes only)
8
+ const int bones_flush_size = 4*1024*1024; // (4MB)
9
+ int bones_flush_i;
10
+ int bones_flush_j;
11
+ char *bones_flush_c = (char *)malloc(bones_flush_size);
12
+ for (bones_flush_i=0; bones_flush_i<10; bones_flush_i++) {
13
+ for (bones_flush_j=0; bones_flush_j<bones_flush_size; bones_flush_j++) {
14
+ bones_flush_c[bones_flush_j] = bones_flush_i*bones_flush_j;
15
+ }
16
+ }
17
+ free(bones_flush_c);
18
+
19
+ // Start the timer for the measurement of the kernel execution time
20
+ gettimeofday(&bones_start_time2, NULL);
@@ -0,0 +1,8 @@
1
+
2
+ // Stop the timer for the measurement of the kernel execution time
3
+ gettimeofday(&bones_end_time2, NULL);
4
+ bones_timer2 += 0.001 * (1000000*(bones_end_time2.tv_sec-bones_start_time2.tv_sec)+bones_end_time2.tv_usec-bones_start_time2.tv_usec);
5
+ }
6
+
7
+ // Print the measurement data
8
+ printf(">>>\t\t (<algorithm_basename>): Execution time [kernel ]: %.3lf ms \n", bones_timer2/((float)ITERS));
@@ -0,0 +1,3 @@
1
+
2
+ // Start the kernel
3
+ bones_kernel_<algorithm_name>_0(<devicenames>, <argument_name>);
@@ -0,0 +1,15 @@
1
+ /* STARTDEF
2
+ void bones_kernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>);
3
+ ENDDEF */
4
+ // Start of the <algorithm_name> kernel
5
+ void bones_kernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
6
+ for(int bones_global_id=0; bones_global_id<<parallelism>; bones_global_id++) {
7
+
8
+ // Calculate the global ID(s) based on the thread id
9
+ <ids>
10
+
11
+ // Perform the main computation
12
+ <algorithm_code1>
13
+ }
14
+ }
15
+
@@ -0,0 +1,24 @@
1
+ ###################################################################
2
+ # Each line holds one mapping from species to skeleton
3
+ # The ordering is always ['chunk','neighbourhood','element','shared','void']
4
+ # The pattern 'full' is omitted from matching (will thus always match)
5
+ # 'D' denotes any number of ranges (e.g. D|element can have any number of dimensions)
6
+ # 'N' denotes any single range (e.g. N,N|element must be 2D)
7
+ # '+' denotes one or more of these patterns
8
+ ###################################################################
9
+ D|chunk(D)+ -> D|chunk(D)+ :default :00
10
+ D|chunk(D)+ -> D|chunk(D)+ ^ D|element+ :default :00
11
+ D|chunk(D)+ ^ D|element+ -> D|chunk(D)+ :default :00
12
+ D|chunk(D)+ ^ D|element+ -> D|chunk(D)+ ^ D|element+ :default :00
13
+ D|chunk(D)+ -> D|element+ :default :00
14
+ D|chunk(D)+ ^ D|neighbourhood(D)+ ^ D|element+ -> D|element+ :default :00
15
+ D|chunk(D)+ ^ D|element+ -> D|element+ :default :00
16
+ N|neighbourhood(N)+ -> N|element+ :default :00
17
+ D|neighbourhood(D)+ -> D|element+ :default :00
18
+ D|neighbourhood(D)+ ^ D|element+ -> D|element+ :default :00
19
+ D|element+ -> D|chunk(D)+ :default :00
20
+ D|element+ -> D|element+ :default :00
21
+ D|element -> 1|shared :default :00
22
+ D|element+ -> D|shared+ :default :00
23
+ D|element+ -> D|element+ ^ D|shared+ :default :00
24
+ D|void -> D|element+ :default :00
@@ -0,0 +1,6 @@
1
+
2
+ // Clean-up the OpenCL context
3
+ //clReleaseCommandQueue(bones_queue);
4
+ //clReleaseProgram(bones_program);
5
+ //clReleaseContext(bones_context);
6
+ fflush(stdout);
@@ -0,0 +1,155 @@
1
+ #include <string.h>
2
+ #include <stdio.h>
3
+ #include <stdlib.h>
4
+ #include <math.h>
5
+ #include <sys/time.h>
6
+ #include <CL/cl.h>
7
+
8
+ #define BONES_MIN(a,b) ((a<b) ? a : b)
9
+ #define BONES_MAX(a,b) ((a>b) ? a : b)
10
+ #define DIV_CEIL(a,b) ((a+b-1)/b)
11
+ #define DIV_FLOOR(a,b) (a/b)
12
+
13
+ // Multiple iterations for kernel measurements
14
+ #define ITERS 1
15
+
16
// Load the OpenCL kernel from file.
//
// Reads the complete contents of the file named by bones_filename into a
// newly allocated, NUL-terminated buffer and returns it. The caller owns the
// buffer and releases it with free(). If the file cannot be opened, its size
// cannot be determined, or the buffer cannot be allocated, a diagnostic is
// written to stderr and NULL is returned.
char * get_source(const char* bones_filename) {
  FILE* bones_fp = fopen(bones_filename,"r");
  if (bones_fp == NULL) {
    fprintf(stderr,"--- Error: Could not open kernel file '%s'\n",bones_filename);
    return NULL;
  }

  // Determine the size of the file by seeking to its end
  long bones_size = -1;
  if (fseek(bones_fp,0,SEEK_END) == 0) { bones_size = ftell(bones_fp); }
  if (bones_size < 0) {
    fprintf(stderr,"--- Error: Could not determine the size of kernel file '%s'\n",bones_filename);
    fclose(bones_fp);
    return NULL;
  }
  rewind(bones_fp);

  // Reserve one extra byte for the terminating NUL character
  char *bones_source = (char *)malloc(sizeof(char)*((size_t)bones_size+1));
  if (bones_source == NULL) {
    fprintf(stderr,"--- Error: Out of host memory while loading kernel file '%s'\n",bones_filename);
    fclose(bones_fp);
    return NULL;
  }

  // Terminate the string after the bytes that were actually read; in text mode
  // this can be fewer than the size reported by ftell
  size_t bones_read = fread(bones_source,1,sizeof(char)*(size_t)bones_size,bones_fp);
  bones_source[bones_read] = '\0';
  fclose(bones_fp);
  return bones_source;
}
28
+
29
+ // Print an error if it occurs
30
+ void error_check(cl_int bones_errors) {
31
+ if(bones_errors != CL_SUCCESS) {
32
+ switch (bones_errors) {
33
+ case CL_DEVICE_NOT_FOUND: printf("--- Error: Device not found.\n"); break;
34
+ case CL_DEVICE_NOT_AVAILABLE: printf("--- Error: Device not available\n"); break;
35
+ case CL_COMPILER_NOT_AVAILABLE: printf("--- Error: Compiler not available\n"); break;
36
+ case CL_MEM_OBJECT_ALLOCATION_FAILURE: printf("--- Error: Memory object allocation failure\n"); break;
37
+ case CL_OUT_OF_RESOURCES: printf("--- Error: Out of resources\n"); break;
38
+ case CL_OUT_OF_HOST_MEMORY: printf("--- Error: Out of host memory\n"); break;
39
+ case CL_PROFILING_INFO_NOT_AVAILABLE: printf("--- Error: Profiling information not available\n"); break;
40
+ case CL_MEM_COPY_OVERLAP: printf("--- Error: Memory copy overlap\n"); break;
41
+ case CL_IMAGE_FORMAT_MISMATCH: printf("--- Error: Image format mismatch\n"); break;
42
+ case CL_IMAGE_FORMAT_NOT_SUPPORTED: printf("--- Error: Image format not supported\n"); break;
43
+ case CL_BUILD_PROGRAM_FAILURE: printf("--- Error: Program build failure\n"); break;
44
+ case CL_MAP_FAILURE: printf("--- Error: Map failure\n"); break;
45
+ case CL_INVALID_VALUE: printf("--- Error: Invalid value\n"); break;
46
+ case CL_INVALID_DEVICE_TYPE: printf("--- Error: Invalid device type\n"); break;
47
+ case CL_INVALID_PLATFORM: printf("--- Error: Invalid platform\n"); break;
48
+ case CL_INVALID_DEVICE: printf("--- Error: Invalid device\n"); break;
49
+ case CL_INVALID_CONTEXT: printf("--- Error: Invalid context\n"); break;
50
+ case CL_INVALID_QUEUE_PROPERTIES: printf("--- Error: Invalid queue properties\n"); break;
51
+ case CL_INVALID_COMMAND_QUEUE: printf("--- Error: Invalid command queue\n"); break;
52
+ case CL_INVALID_HOST_PTR: printf("--- Error: Invalid host pointer\n"); break;
53
+ case CL_INVALID_MEM_OBJECT: printf("--- Error: Invalid memory object\n"); break;
54
+ case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: printf("--- Error: Invalid image format descriptor\n"); break;
55
+ case CL_INVALID_IMAGE_SIZE: printf("--- Error: Invalid image size\n"); break;
56
+ case CL_INVALID_SAMPLER: printf("--- Error: Invalid sampler\n"); break;
57
+ case CL_INVALID_BINARY: printf("--- Error: Invalid binary\n"); break;
58
+ case CL_INVALID_BUILD_OPTIONS: printf("--- Error: Invalid build options\n"); break;
59
+ case CL_INVALID_PROGRAM: printf("--- Error: Invalid program\n"); break;
60
+ case CL_INVALID_PROGRAM_EXECUTABLE: printf("--- Error: Invalid program executable\n"); break;
61
+ case CL_INVALID_KERNEL_NAME: printf("--- Error: Invalid kernel name\n"); break;
62
+ case CL_INVALID_KERNEL_DEFINITION: printf("--- Error: Invalid kernel definition\n"); break;
63
+ case CL_INVALID_KERNEL: printf("--- Error: Invalid kernel\n"); break;
64
+ case CL_INVALID_ARG_INDEX: printf("--- Error: Invalid argument index\n"); break;
65
+ case CL_INVALID_ARG_VALUE: printf("--- Error: Invalid argument value\n"); break;
66
+ case CL_INVALID_ARG_SIZE: printf("--- Error: Invalid argument size\n"); break;
67
+ case CL_INVALID_KERNEL_ARGS: printf("--- Error: Invalid kernel arguments\n"); break;
68
+ case CL_INVALID_WORK_DIMENSION: printf("--- Error: Invalid work dimensionsension\n"); break;
69
+ case CL_INVALID_WORK_GROUP_SIZE: printf("--- Error: Invalid work group size\n"); break;
70
+ case CL_INVALID_WORK_ITEM_SIZE: printf("--- Error: Invalid work item size\n"); break;
71
+ case CL_INVALID_GLOBAL_OFFSET: printf("--- Error: Invalid global offset\n"); break;
72
+ case CL_INVALID_EVENT_WAIT_LIST: printf("--- Error: Invalid event wait list\n"); break;
73
+ case CL_INVALID_EVENT: printf("--- Error: Invalid event\n"); break;
74
+ case CL_INVALID_OPERATION: printf("--- Error: Invalid operation\n"); break;
75
+ case CL_INVALID_GL_OBJECT: printf("--- Error: Invalid OpenGL object\n"); break;
76
+ case CL_INVALID_BUFFER_SIZE: printf("--- Error: Invalid buffer size\n"); break;
77
+ case CL_INVALID_MIP_LEVEL: printf("--- Error: Invalid mip-map level\n"); break;
78
+ default: printf("--- Error: Unknown with code %d\n", bones_errors);
79
+ }
80
+ fflush(stdout); exit(0);
81
+ }
82
+ }
83
+
84
+ // Use a global variable for the device ID, context and command queue
85
+ cl_device_id bones_device;
86
+ cl_context bones_context;
87
+ cl_command_queue bones_queue;
88
+
89
+ // Use a global variable to store the name and the binary for the last program
90
+ char bones_last_program[1024];
91
+ cl_program bones_program;
92
+
93
+ // Function to initialize the OpenCL platform (create to ensure fair measurements afterwards)
94
+ void bones_initialize_target(void) {
95
+ cl_int bones_errors;
96
+
97
+ // Get OpenCL platform count
98
+ cl_uint bones_num_platforms;
99
+ bones_errors = clGetPlatformIDs(0,NULL,&bones_num_platforms); error_check(bones_errors);
100
+ if (bones_num_platforms == 0) { printf("Error: No OpenCL platforms found.\n"); exit(1); }
101
+
102
+ // Get all OpenCL platform IDs
103
+ cl_platform_id bones_platform_ids[10];
104
+ bones_errors = clGetPlatformIDs(bones_num_platforms,bones_platform_ids,NULL); error_check(bones_errors);
105
+
106
+ // Select the AMD APP platform
107
+ char bones_buffer[1024];
108
+ cl_uint bones_platform;
109
+ for(cl_uint bones_platform_id=0; bones_platform_id<bones_num_platforms; bones_platform_id++) {
110
+ clGetPlatformInfo(bones_platform_ids[bones_platform_id], CL_PLATFORM_NAME, 1024, bones_buffer, NULL);
111
+ if(strstr(bones_buffer,"AMD") != NULL) { bones_platform = bones_platform_id; break; }
112
+ }
113
+
114
+ // Get a CPU device on the platform
115
+ bones_errors = clGetDeviceIDs(bones_platform_ids[bones_platform], CL_DEVICE_TYPE_CPU, 1, &bones_device, NULL); error_check(bones_errors);
116
+ bones_errors = clGetDeviceInfo(bones_device, CL_DEVICE_NAME, sizeof(bones_buffer), bones_buffer, NULL); error_check(bones_errors);
117
+
118
+ // Create a context
119
+ bones_context = clCreateContext(0,1,&bones_device,NULL,NULL,&bones_errors); error_check(bones_errors);
120
+
121
+ // Create a command queue
122
+ bones_queue = clCreateCommandQueue(bones_context,bones_device,CL_QUEUE_PROFILING_ENABLE,&bones_errors); error_check(bones_errors);
123
+
124
+ // Create space on the device
125
+ cl_mem bones_device_data = clCreateBuffer(bones_context,CL_MEM_READ_WRITE,4,NULL,&bones_errors); error_check(bones_errors);
126
+
127
+ // Copy something to the device
128
+ bones_device_data = clCreateBuffer(bones_context,CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,4,bones_buffer,NULL);
129
+
130
+ // Clean-up the OpenCL context
131
+ strcpy(bones_last_program,"");
132
+ clReleaseMemObject(bones_device_data);
133
+ clReleaseContext(bones_context);
134
+ fflush(stdout);
135
+ }
136
+
137
+ // Declaration of the original function
138
+ int bones_main(void);
139
+
140
+ // New main function for initialisation and clean-up
141
+ int main(void) {
142
+
143
+ // Initialisation
144
+ bones_initialize_target();
145
+
146
+ // Original main function
147
+ int bones_return = bones_main();
148
+
149
+ // Clean-up
150
+ clReleaseCommandQueue(bones_queue);
151
+ clReleaseProgram(bones_program);
152
+ clReleaseContext(bones_context);
153
+ return bones_return;
154
+ }
155
+
@@ -0,0 +1,4 @@
1
+ #define BONES_MIN(a,b) ((a<b) ? a : b)
2
+ #define BONES_MAX(a,b) ((a>b) ? a : b)
3
+ #define DIV_CEIL(a,b) ((a+b-1)/b)
4
+ #define DIV_FLOOR(a,b) (a/b)
File without changes
@@ -0,0 +1,8 @@
1
+
2
+ // Perform a zero-copy of <array> from device to host
3
+ //void* bones_pointer_to_<array> = clEnqueueMapBuffer(bones_queue,device_<array>,CL_TRUE,CL_MAP_READ,<offset>,<variable_dimensions>*sizeof(<type>),0,NULL,NULL,&bones_errors); error_check(bones_errors);
4
+ //clEnqueueUnmapMemObject(bones_queue,device_<array>,bones_pointer_to_<array>,0,NULL,NULL);
5
+
6
+ // Perform a copy of <array> from device to host
7
+ clEnqueueReadBuffer(bones_queue,device_<array>,CL_TRUE,(<offset>)*sizeof(<type>),<variable_dimensions>*sizeof(<type>),<array><flatten>+<offset>,0,NULL,NULL);
8
+ clFinish(bones_queue);
@@ -0,0 +1,4 @@
1
+
2
+ // Copy <array> to the device
3
+ device_<array> = clCreateBuffer(bones_context,CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,<variable_dimensions>*sizeof(<type>),<array><flatten>,NULL);
4
+ clFinish(bones_queue);
@@ -0,0 +1,3 @@
1
+
2
+ // Clean up GPU arrays
3
+ clReleaseMemObject(device_<array>);
@@ -0,0 +1,6 @@
1
+
2
+ // Create a device pointer for <array> (zero-copy)
3
+ //cl_mem device_<array> = clCreateBuffer(bones_context,CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR,<variable_dimensions>*sizeof(<type>),<array><flatten>,&bones_errors); error_check(bones_errors);
4
+
5
+ // Create a device pointer for <array>
6
+ cl_mem device_<array> = clCreateBuffer(bones_context,CL_MEM_READ_WRITE,<variable_dimensions>*sizeof(<type>),NULL,&bones_errors); error_check(bones_errors);
@@ -0,0 +1,24 @@
1
+ fflush(stdout);
2
+ cl_int bones_errors;
3
+ cl_event bones_event;
4
+
5
+ // Only compile if this program is different from the last one
6
+ if (strcmp(bones_last_program,"<algorithm_filename>") != 0) {
7
+ strcpy(bones_last_program,"<algorithm_filename>");
8
+
9
+ // Load and compile the kernel
10
+ char *bones_source = get_source("<algorithm_filename>_device.cl");
11
+ bones_program = clCreateProgramWithSource(bones_context,1,(const char **)&bones_source,NULL,&bones_errors); error_check(bones_errors);
12
+ bones_errors = clBuildProgram(bones_program,0,NULL,"-cl-single-precision-constant",NULL,NULL);
13
+
14
+ // Get and print the compiler log
15
+ char* bones_log;
16
+ size_t bones_log_size;
17
+ clGetProgramBuildInfo(bones_program,bones_device,CL_PROGRAM_BUILD_LOG,0,NULL,&bones_log_size);
18
+ bones_log = (char*)malloc((bones_log_size+1)*sizeof(char));
19
+ clGetProgramBuildInfo(bones_program,bones_device,CL_PROGRAM_BUILD_LOG,bones_log_size,bones_log, NULL);
20
+ bones_log[bones_log_size] = '\0';
21
+ //if (strcmp(bones_log,"\n") != 0 && strcmp(bones_log,"") != 0) { printf("--------- \n--- Compilation log:\n--------- \n%s\n",bones_log); }
22
+ free(bones_log);
23
+ error_check(bones_errors);
24
+ }
@@ -0,0 +1,5 @@
1
+
2
+ // Start the timer for the measurement of the kernel and memory copy execution time
3
+ struct timeval bones_start_time1;
4
+ clFinish(bones_queue);
5
+ gettimeofday(&bones_start_time1, NULL);
@@ -0,0 +1,9 @@
1
+
2
+ // End the timer for the measurement of the kernel and memory copy execution time
3
+ #if (ITERS == 1)
4
+ clFinish(bones_queue);
5
+ struct timeval bones_end_time1;
6
+ gettimeofday(&bones_end_time1, NULL);
7
+ float bones_timer1 = 0.001 * (1000000*(bones_end_time1.tv_sec-bones_start_time1.tv_sec)+bones_end_time1.tv_usec-bones_start_time1.tv_usec);
8
+ printf(">>>\t\t (<algorithm_basename>): Execution time [kernel+memcpy]: %.3lf ms \n", bones_timer1);
9
+ #endif
@@ -0,0 +1,16 @@
1
+
2
+ // Start the timer for the measurement of the kernel execution time
3
+ clFinish(bones_queue);
4
+ for (int bones_iter=0; bones_iter<ITERS; bones_iter++) {
5
+
6
+ // Flush the CPU cache (for measurement purposes only)
7
+ const int bones_flush_size = 4*1024*1024; // (4MB)
8
+ int bones_flush_i;
9
+ int bones_flush_j;
10
+ char *bones_flush_c = (char *)malloc(bones_flush_size);
11
+ for (bones_flush_i=0; bones_flush_i<10; bones_flush_i++) {
12
+ for (bones_flush_j=0; bones_flush_j<bones_flush_size; bones_flush_j++) {
13
+ bones_flush_c[bones_flush_j] = bones_flush_i*bones_flush_j;
14
+ }
15
+ }
16
+ free(bones_flush_c);
@@ -0,0 +1,11 @@
1
+
2
+ }
3
+
4
+ // Stop the timer for the measurement of the kernel execution time
5
+ clFinish(bones_queue);
6
+ cl_ulong end2, start2;
7
+ bones_errors = clWaitForEvents(1, &bones_event); error_check(bones_errors);
8
+ bones_errors = clGetEventProfilingInfo(bones_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end2, 0); error_check(bones_errors);
9
+ bones_errors = clGetEventProfilingInfo(bones_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start2, 0); error_check(bones_errors);
10
+ float bones_timer2 = 0.000001 * (end2-start2);
11
+ printf(">>>\t\t (<algorithm_basename>): Execution time [kernel ]: %.3lf ms \n", bones_timer2);
@@ -0,0 +1,67 @@
1
+
2
+ // Store the initial value
3
+ cl_mem bones_initial_value = clCreateBuffer(bones_context,CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,sizeof(<out0_type>),<out0_name>,&bones_errors); error_check(bones_errors);
4
+
5
+ // Create the kernels
6
+ cl_kernel bones_kernel_<algorithm_name>_0 = clCreateKernel(bones_program, "bones_kernel_<algorithm_name>_0", &bones_errors); error_check(bones_errors);
7
+ cl_kernel bones_kernel_<algorithm_name>_1 = clCreateKernel(bones_program, "bones_kernel_<algorithm_name>_1", &bones_errors); error_check(bones_errors);
8
+ cl_kernel bones_kernel_<algorithm_name>_2 = clCreateKernel(bones_program, "bones_kernel_<algorithm_name>_2", &bones_errors); error_check(bones_errors);
9
+
10
+ // Run either one kernel or multiple kernels
11
+ if (<in0_dimensions> <= 512) {
12
+
13
+ // Set all the arguments to the kernel function
14
+ int bones_num_args = 3;
15
+ int bones_dimensions = <in0_dimensions>;
16
+ clSetKernelArg(bones_kernel_<algorithm_name>_0,0,sizeof(bones_dimensions),(void*)&bones_dimensions);
17
+ clSetKernelArg(bones_kernel_<algorithm_name>_0,1,sizeof(<in0_devicename>),(void*)&<in0_devicename>);
18
+ clSetKernelArg(bones_kernel_<algorithm_name>_0,2,sizeof(<out0_devicename>),(void*)&<out0_devicename>);
19
+ <kernel_argument_list_constants>
20
+ // Start only one kernel
21
+ const int bones_num_threads = DIV_CEIL(<in0_dimensions>,2);
22
+ size_t bones_local_worksize1[] = {bones_num_threads};
23
+ size_t bones_global_worksize1[] = {bones_num_threads};
24
+ bones_errors = clEnqueueNDRangeKernel(bones_queue,bones_kernel_<algorithm_name>_0,1,NULL,bones_global_worksize1,bones_local_worksize1,0,NULL,&bones_event); error_check(bones_errors);
25
+
26
+ }
27
+ else {
28
+
29
+ // Allocate space for an intermediate array
30
+ cl_mem bones_device_temp = clCreateBuffer(bones_context,CL_MEM_READ_WRITE,128*sizeof(<out0_type>),NULL,&bones_errors); error_check(bones_errors);
31
+
32
+ // Set all the arguments to the kernel function
33
+ int bones_num_args = 3;
34
+ int bones_dimensions = <in0_dimensions>;
35
+ clSetKernelArg(bones_kernel_<algorithm_name>_0,0,sizeof(bones_dimensions),(void*)&bones_dimensions);
36
+ clSetKernelArg(bones_kernel_<algorithm_name>_0,1,sizeof(<in0_devicename>),(void*)&<in0_devicename>);
37
+ clSetKernelArg(bones_kernel_<algorithm_name>_0,2,sizeof(bones_device_temp),(void*)&bones_device_temp);
38
+ <kernel_argument_list_constants>
39
+ // Start the first kernel
40
+ size_t bones_local_worksize1[] = {256};
41
+ size_t bones_global_worksize1[] = {256*128};
42
+ bones_errors = clEnqueueNDRangeKernel(bones_queue,bones_kernel_<algorithm_name>_0,1,NULL,bones_global_worksize1,bones_local_worksize1,0,NULL,&bones_event); error_check(bones_errors);
43
+
44
+ // Set all the arguments to the kernel function
45
+ clSetKernelArg(bones_kernel_<algorithm_name>_1,0,sizeof(bones_device_temp),(void*)&bones_device_temp);
46
+ clSetKernelArg(bones_kernel_<algorithm_name>_1,1,sizeof(<out0_devicename>),(void*)&<out0_devicename>);
47
+ // Start the second kernel
48
+ size_t bones_local_worksize2[] = {128};
49
+ size_t bones_global_worksize2[] = {128};
50
+ bones_errors = clEnqueueNDRangeKernel(bones_queue,bones_kernel_<algorithm_name>_1,1,NULL,bones_global_worksize2,bones_local_worksize2,0,NULL,&bones_event); error_check(bones_errors);
51
+ clReleaseMemObject(bones_device_temp);
52
+ }
53
+
54
+ // Set all the arguments to the kernel function
55
+ clSetKernelArg(bones_kernel_<algorithm_name>_2,0,sizeof(bones_initial_value),(void*)&bones_initial_value);
56
+ clSetKernelArg(bones_kernel_<algorithm_name>_2,1,sizeof(<out0_devicename>),(void*)&<out0_devicename>);
57
+ // Perform the last computation (only needed if there is an initial value)
58
+ size_t bones_local_worksize3[] = {1};
59
+ size_t bones_global_worksize3[] = {1};
60
+ bones_errors = clEnqueueNDRangeKernel(bones_queue,bones_kernel_<algorithm_name>_2,1,NULL,bones_global_worksize3,bones_local_worksize3,0,NULL,&bones_event); error_check(bones_errors);
61
+ clReleaseMemObject(bones_initial_value);
62
+
63
+ // Synchronize and clean-up the kernels
64
+ clFinish(bones_queue);
65
+ clReleaseKernel(bones_kernel_<algorithm_name>_0);
66
+ clReleaseKernel(bones_kernel_<algorithm_name>_1);
67
+ clReleaseKernel(bones_kernel_<algorithm_name>_2);