bones-compiler 1.1.0 → 1.3.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (211)
  1. checksums.yaml +15 -0
  2. data/CHANGELOG +37 -0
  3. data/LICENSE +1 -1
  4. data/README.rdoc +95 -70
  5. data/Rakefile +78 -3
  6. data/VERSION +1 -1
  7. data/bin/adarwin +17 -0
  8. data/examples/benchmarks/PolyBench/2mm.c +104 -0
  9. data/examples/benchmarks/{3mm.c → PolyBench/3mm.c} +5 -2
  10. data/examples/benchmarks/{adi.c → PolyBench/adi.c} +6 -3
  11. data/examples/benchmarks/{atax.c → PolyBench/atax.c} +5 -2
  12. data/examples/benchmarks/{bicg.c → PolyBench/bicg.c} +5 -2
  13. data/examples/benchmarks/{cholesky.c → PolyBench/cholesky.c} +3 -0
  14. data/examples/benchmarks/{common.h → PolyBench/common.h} +2 -2
  15. data/examples/benchmarks/{correlation.c → PolyBench/correlation.c} +16 -7
  16. data/examples/benchmarks/{covariance.c → PolyBench/covariance.c} +7 -2
  17. data/examples/benchmarks/{doitgen.c → PolyBench/doitgen.c} +5 -2
  18. data/examples/benchmarks/{durbin.c → PolyBench/durbin.c} +3 -0
  19. data/examples/benchmarks/{dynprog.c → PolyBench/dynprog.c} +3 -0
  20. data/examples/benchmarks/{fdtd-2d-apml.c → PolyBench/fdtd-2d-apml.c} +3 -0
  21. data/examples/benchmarks/{fdtd-2d.c → PolyBench/fdtd-2d.c} +5 -2
  22. data/examples/benchmarks/{floyd-warshall.c → PolyBench/floyd-warshall.c} +3 -0
  23. data/examples/benchmarks/{gemm.c → PolyBench/gemm.c} +5 -2
  24. data/examples/benchmarks/{gemver.c → PolyBench/gemver.c} +5 -2
  25. data/examples/benchmarks/{gesummv.c → PolyBench/gesummv.c} +5 -2
  26. data/examples/benchmarks/{gramschmidt.c → PolyBench/gramschmidt.c} +3 -0
  27. data/examples/benchmarks/{jacobi-1d-imper.c → PolyBench/jacobi-1d-imper.c} +10 -2
  28. data/examples/benchmarks/{jacobi-2d-imper.c → PolyBench/jacobi-2d-imper.c} +8 -3
  29. data/examples/benchmarks/{lu.c → PolyBench/lu.c} +3 -0
  30. data/examples/benchmarks/{ludcmp.c → PolyBench/ludcmp.c} +3 -0
  31. data/examples/benchmarks/{mvt.c → PolyBench/mvt.c} +6 -2
  32. data/examples/benchmarks/{reg_detect.c → PolyBench/reg_detect.c} +3 -0
  33. data/examples/benchmarks/{seidel-2d.c → PolyBench/seidel-2d.c} +3 -0
  34. data/examples/benchmarks/{symm.c → PolyBench/symm.c} +3 -0
  35. data/examples/benchmarks/{syr2k.c → PolyBench/syr2k.c} +5 -2
  36. data/examples/benchmarks/{syrk.c → PolyBench/syrk.c} +7 -4
  37. data/examples/benchmarks/{trisolv.c → PolyBench/trisolv.c} +3 -0
  38. data/examples/benchmarks/{trmm.c → PolyBench/trmm.c} +3 -0
  39. data/examples/benchmarks/Rodinia/cfd.c +180 -0
  40. data/examples/benchmarks/Rodinia/hotspot.c +228 -0
  41. data/examples/benchmarks/Rodinia/kmeans.c +164 -0
  42. data/examples/benchmarks/Rodinia/srad.c +188 -0
  43. data/examples/benchmarks/other/common.h +0 -0
  44. data/examples/benchmarks/other/dct.c +58 -0
  45. data/examples/benchmarks/other/mm.c +50 -0
  46. data/examples/benchmarks/{saxpy.c → other/saxpy.c} +11 -7
  47. data/examples/chunk/{example1.c → example01.c} +0 -0
  48. data/examples/chunk/{example2.c → example02.c} +0 -0
  49. data/examples/chunk/{example3.c → example03.c} +0 -0
  50. data/examples/chunk/{example4.c → example04.c} +0 -0
  51. data/examples/chunk/{example5.c → example05.c} +0 -0
  52. data/examples/chunk/example06.c +45 -0
  53. data/examples/chunk/example07.c +49 -0
  54. data/examples/dependences/example01.c +42 -0
  55. data/examples/dependences/example02.c +40 -0
  56. data/examples/dependences/example03.c +43 -0
  57. data/examples/dependences/example04.c +44 -0
  58. data/examples/dependences/example05.c +42 -0
  59. data/examples/element/{example1.c → example01.c} +0 -0
  60. data/examples/element/{example2.c → example02.c} +2 -2
  61. data/examples/element/{example3.c → example03.c} +0 -0
  62. data/examples/element/{example4.c → example04.c} +0 -0
  63. data/examples/element/{example5.c → example05.c} +0 -0
  64. data/examples/element/{example6.c → example06.c} +0 -0
  65. data/examples/element/{example7.c → example07.c} +0 -0
  66. data/examples/element/{example8.c → example08.c} +0 -0
  67. data/examples/element/{example9.c → example09.c} +0 -0
  68. data/examples/element/example13.c +73 -0
  69. data/examples/fusion/example01.c +68 -0
  70. data/examples/fusion/example02.c +73 -0
  71. data/examples/fusion/example03.c +72 -0
  72. data/examples/fusion/example04.c +61 -0
  73. data/examples/fusion/example05.c +55 -0
  74. data/examples/neighbourhood/{example1.c → example01.c} +0 -0
  75. data/examples/neighbourhood/{example2.c → example02.c} +0 -0
  76. data/examples/neighbourhood/{example3.c → example03.c} +0 -0
  77. data/examples/neighbourhood/{example4.c → example04.c} +0 -0
  78. data/examples/neighbourhood/example05.c +44 -0
  79. data/examples/shared/{example1.c → example01.c} +0 -0
  80. data/examples/shared/{example2.c → example02.c} +0 -0
  81. data/examples/shared/{example3.c → example03.c} +0 -0
  82. data/examples/shared/{example4.c → example04.c} +0 -0
  83. data/examples/shared/{example5.c → example05.c} +0 -0
  84. data/lib/adarwin.rb +62 -0
  85. data/lib/adarwin/dependences.rb +268 -0
  86. data/lib/adarwin/engine.rb +277 -0
  87. data/lib/adarwin/fusion.rb +174 -0
  88. data/lib/adarwin/interval.rb +57 -0
  89. data/lib/adarwin/memorycopies.rb +153 -0
  90. data/lib/adarwin/nest.rb +225 -0
  91. data/lib/adarwin/preprocessor.rb +76 -0
  92. data/lib/adarwin/reference.rb +261 -0
  93. data/lib/bones.rb +4 -55
  94. data/lib/bones/algorithm.rb +77 -40
  95. data/lib/bones/copy.rb +26 -0
  96. data/lib/bones/engine.rb +147 -31
  97. data/lib/bones/preprocessor.rb +92 -12
  98. data/lib/bones/species.rb +4 -3
  99. data/lib/bones/structure.rb +14 -4
  100. data/lib/castaddon.rb +11 -6
  101. data/lib/castaddon/node_adarwin.rb +245 -0
  102. data/lib/castaddon/node_bones.rb +316 -0
  103. data/lib/castaddon/node_common.rb +289 -0
  104. data/lib/castaddon/transformations.rb +236 -0
  105. data/lib/common.rb +216 -0
  106. data/skeletons/CPU-C/common/header.c +3 -0
  107. data/skeletons/CPU-C/common/mem_global.c +0 -0
  108. data/skeletons/CPU-C/common/timer_2_start.c +11 -13
  109. data/skeletons/CPU-C/common/timer_2_stop.c +1 -1
  110. data/skeletons/CPU-C/common/timer_globals.c +29 -0
  111. data/skeletons/CPU-OPENCL-INTEL/common/globals.c +1 -1
  112. data/skeletons/CPU-OPENCL-INTEL/common/header.c +3 -0
  113. data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_D2H.c +7 -2
  114. data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_H2D.c +4 -2
  115. data/skeletons/CPU-OPENCL-INTEL/common/mem_global.c +0 -0
  116. data/skeletons/CPU-OPENCL-INTEL/common/mem_prologue.c +6 -3
  117. data/skeletons/CPU-OPENCL-INTEL/common/timer_2_stop.c +1 -1
  118. data/skeletons/CPU-OPENCL-INTEL/common/timer_globals.c +24 -0
  119. data/skeletons/CPU-OPENMP/common/globals.c +1 -0
  120. data/skeletons/CPU-OPENMP/common/header.c +3 -0
  121. data/skeletons/CPU-OPENMP/common/mem_global.c +0 -0
  122. data/skeletons/CPU-OPENMP/common/timer_1_start.c +0 -12
  123. data/skeletons/CPU-OPENMP/common/timer_2_stop.c +1 -1
  124. data/skeletons/CPU-OPENMP/common/timer_globals.c +33 -0
  125. data/skeletons/GPU-CUDA/common/globals.c +27 -3
  126. data/skeletons/GPU-CUDA/common/header.c +2 -0
  127. data/skeletons/GPU-CUDA/common/mem_async_alloc.c +6 -0
  128. data/skeletons/GPU-CUDA/common/mem_async_copyin.c +6 -0
  129. data/skeletons/GPU-CUDA/common/mem_async_copyout.c +6 -0
  130. data/skeletons/GPU-CUDA/common/mem_async_free.c +6 -0
  131. data/skeletons/GPU-CUDA/common/mem_copy_D2H.c +2 -1
  132. data/skeletons/GPU-CUDA/common/mem_copy_H2D.c +2 -1
  133. data/skeletons/GPU-CUDA/common/mem_global.c +1 -0
  134. data/skeletons/GPU-CUDA/common/mem_prologue.c +1 -2
  135. data/skeletons/GPU-CUDA/common/scheduler.c +86 -0
  136. data/skeletons/GPU-CUDA/common/timer_2_start.c +2 -4
  137. data/skeletons/GPU-CUDA/common/timer_2_stop.c +3 -5
  138. data/skeletons/GPU-CUDA/common/timer_globals.c +26 -0
  139. data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.kernel.cu +5 -7
  140. data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.kernel.cu +4 -6
  141. data/skeletons/GPU-CUDA/kernel/default.host.c +1 -1
  142. data/skeletons/GPU-CUDA/kernel/default.kernel.cu +6 -8
  143. data/skeletons/GPU-CUDA/skeletons.txt +6 -5
  144. data/{examples/benchmarks/2mm.c → test/examples/benchmarks/PolyBench/2mm_species.c} +19 -15
  145. data/test/examples/benchmarks/PolyBench/3mm_species.c +82 -0
  146. data/test/examples/benchmarks/PolyBench/adi_species.c +89 -0
  147. data/test/examples/benchmarks/PolyBench/atax_species.c +69 -0
  148. data/test/examples/benchmarks/PolyBench/bicg_species.c +71 -0
  149. data/test/examples/benchmarks/PolyBench/cholesky_species.c +68 -0
  150. data/test/examples/benchmarks/PolyBench/correlation_species.c +97 -0
  151. data/test/examples/benchmarks/PolyBench/covariance_species.c +78 -0
  152. data/test/examples/benchmarks/PolyBench/doitgen_species.c +67 -0
  153. data/test/examples/benchmarks/PolyBench/durbin_species.c +80 -0
  154. data/test/examples/benchmarks/PolyBench/dynprog_species.c +71 -0
  155. data/test/examples/benchmarks/PolyBench/fdtd-2d-apml_species.c +112 -0
  156. data/test/examples/benchmarks/PolyBench/fdtd-2d_species.c +78 -0
  157. data/test/examples/benchmarks/PolyBench/floyd-warshall_species.c +54 -0
  158. data/test/examples/benchmarks/PolyBench/gemm_species.c +73 -0
  159. data/test/examples/benchmarks/PolyBench/gemver_species.c +93 -0
  160. data/test/examples/benchmarks/PolyBench/gesummv_species.c +68 -0
  161. data/test/examples/benchmarks/PolyBench/gramschmidt_species.c +78 -0
  162. data/test/examples/benchmarks/PolyBench/jacobi-1d-imper_species.c +59 -0
  163. data/test/examples/benchmarks/PolyBench/jacobi-2d-imper_species.c +65 -0
  164. data/test/examples/benchmarks/PolyBench/lu_species.c +57 -0
  165. data/test/examples/benchmarks/PolyBench/ludcmp_species.c +89 -0
  166. data/test/examples/benchmarks/PolyBench/mvt_species.c +69 -0
  167. data/test/examples/benchmarks/PolyBench/reg_detect_species.c +86 -0
  168. data/test/examples/benchmarks/PolyBench/seidel-2d_species.c +53 -0
  169. data/test/examples/benchmarks/PolyBench/symm_species.c +74 -0
  170. data/test/examples/benchmarks/PolyBench/syr2k_species.c +69 -0
  171. data/test/examples/benchmarks/PolyBench/syrk_species.c +66 -0
  172. data/test/examples/benchmarks/PolyBench/trisolv_species.c +61 -0
  173. data/test/examples/benchmarks/PolyBench/trmm_species.c +61 -0
  174. data/test/examples/chunk/example01_species.c +58 -0
  175. data/test/examples/chunk/example02_species.c +48 -0
  176. data/test/examples/chunk/example03_species.c +63 -0
  177. data/test/examples/chunk/example04_species.c +58 -0
  178. data/test/examples/chunk/example05_species.c +56 -0
  179. data/test/examples/chunk/example06_species.c +49 -0
  180. data/test/examples/chunk/example07_species.c +53 -0
  181. data/test/examples/dependences/example01_species.c +46 -0
  182. data/test/examples/dependences/example02_species.c +44 -0
  183. data/test/examples/dependences/example03_species.c +47 -0
  184. data/test/examples/dependences/example04_species.c +48 -0
  185. data/test/examples/dependences/example05_species.c +46 -0
  186. data/test/examples/element/example01_species.c +50 -0
  187. data/test/examples/element/example02_species.c +50 -0
  188. data/test/examples/element/example03_species.c +62 -0
  189. data/test/examples/element/example04_species.c +53 -0
  190. data/test/examples/element/example05_species.c +59 -0
  191. data/test/examples/element/example06_species.c +50 -0
  192. data/test/examples/element/example07_species.c +58 -0
  193. data/test/examples/element/example08_species.c +49 -0
  194. data/test/examples/element/example09_species.c +52 -0
  195. data/test/examples/element/example10_species.c +54 -0
  196. data/test/examples/element/example11_species.c +51 -0
  197. data/test/examples/element/example12_species.c +60 -0
  198. data/test/examples/element/example13_species.c +77 -0
  199. data/test/examples/neighbourhood/example01_species.c +57 -0
  200. data/test/examples/neighbourhood/example02_species.c +56 -0
  201. data/test/examples/neighbourhood/example03_species.c +83 -0
  202. data/test/examples/neighbourhood/example04_species.c +55 -0
  203. data/test/examples/neighbourhood/example05_species.c +48 -0
  204. data/test/examples/shared/example01_species.c +49 -0
  205. data/test/examples/shared/example02_species.c +55 -0
  206. data/test/examples/shared/example03_species.c +59 -0
  207. data/test/examples/shared/example04_species.c +56 -0
  208. data/test/examples/shared/example05_species.c +52 -0
  209. metadata +193 -73
  210. data/examples/benchmarks/overview.txt +0 -38
  211. data/lib/castaddon/node.rb +0 -753
@@ -1,24 +1,42 @@
1
- #include <stdio.h>
2
- #include <cuda_runtime.h>
1
+
2
+ ////////////////////////////////////////
3
+ //////////// Globals ///////////////////
4
+ ////////////////////////////////////////
3
5
 
4
6
  #define BONES_MIN(a,b) ((a<b) ? a : b)
5
7
  #define BONES_MAX(a,b) ((a>b) ? a : b)
6
8
  #define DIV_CEIL(a,b) ((a+b-1)/b)
7
9
  #define DIV_FLOOR(a,b) (a/b)
8
10
 
9
- // Function to initialize the GPU (for fair measurements)
11
+ // CUDA timers
12
+ cudaEvent_t bones_start2;
13
+ cudaEvent_t bones_stop2;
14
+
15
+ // Function to initialize the GPU (for fair measurements, streams, timers)
10
16
  void bones_initialize_target(void) {
11
17
  int* bones_temporary = 0;
12
18
  cudaMalloc((void**)&bones_temporary, sizeof(int));
13
19
  cudaFree(bones_temporary);
20
+ cudaStreamCreate(&kernel_stream);
21
+ cudaEventCreate(&bones_start2);
22
+ cudaEventCreate(&bones_stop2);
14
23
  }
15
24
 
16
25
  // Declaration of the original function
17
26
  int bones_main(void);
18
27
 
28
+ ////////////////////////////////////////
29
+ //////////// Main function /////////////
30
+ ////////////////////////////////////////
31
+
19
32
  // New main function for initialisation and clean-up
20
33
  int main(void) {
21
34
 
35
+ // Initialisation of the scheduler
36
+ bones_initialize_scheduler();
37
+ pthread_t bones_scheduler_thread;
38
+ pthread_create(&bones_scheduler_thread, NULL, bones_scheduler, NULL);
39
+
22
40
  // Initialisation of the target
23
41
  bones_initialize_target();
24
42
 
@@ -26,6 +44,12 @@ int main(void) {
26
44
  int bones_return = bones_main();
27
45
 
28
46
  // Clean-up
47
+ bones_scheduler_done = 1;
48
+ pthread_join(bones_scheduler_thread, NULL);
49
+ cudaStreamDestroy(kernel_stream);
29
50
  return bones_return;
30
51
  }
31
52
 
53
+ ////////////////////////////////////////
54
+ ////////// Accelerated functions ///////
55
+ ////////////////////////////////////////
@@ -0,0 +1,2 @@
1
+ void bones_timer_start();
2
+ void bones_timer_stop();
@@ -0,0 +1,6 @@
1
+
2
+ // Create space for <array> on the device
3
+ void bones_alloc_<id>_<array>(void) {
4
+ cudaMalloc((void**)&device_<array>, <variable_dimensions>*sizeof(<type>));
5
+ cudaMemset((void*)device_<array>, 0, <variable_dimensions>*sizeof(<type>));
6
+ }
@@ -0,0 +1,6 @@
1
+
2
+ // Copy <array> to the device
3
+ void bones_copy<direction>_<id>_<array>(<definition>) {
4
+ cudaStreamSynchronize(kernel_stream);
5
+ bones_memcpy(device_<array>, <array><flatten>, <variable_dimensions>*sizeof(<type>), cudaMemcpyHostToDevice, <state>, <index>);
6
+ }
@@ -0,0 +1,6 @@
1
+
2
+ // Copy <array> from device to host
3
+ void bones_copy<direction>_<id>_<array>(<definition>) {
4
+ cudaStreamSynchronize(kernel_stream);
5
+ bones_memcpy(<array><flatten>+<offset>, device_<array>+<offset>, <variable_dimensions>*sizeof(<type>), cudaMemcpyDeviceToHost, <state>, <index>);
6
+ }
@@ -0,0 +1,6 @@
1
+
2
+ // Clean up array <array> from the device
3
+ void bones_free_<id>_<array>(void) {
4
+ cudaStreamSynchronize(kernel_stream);
5
+ cudaFree(device_<array>);
6
+ }
@@ -1,3 +1,4 @@
1
1
 
2
2
  // Copy <array> from device to host
3
- cudaMemcpy(<array><flatten>+<offset>, device_<array>+<offset>, <variable_dimensions>*sizeof(<type>), cudaMemcpyDeviceToHost);
3
+ bones_memcpy(<array><flatten>+<offset>, device_<array>+<offset>, <variable_dimensions>*sizeof(<type>), cudaMemcpyDeviceToHost, <state>, <state>);
4
+ bones_synchronize(<state>);
@@ -1,3 +1,4 @@
1
1
 
2
2
  // Copy <array> to the device
3
- cudaMemcpy(device_<array>, <array><flatten>, <variable_dimensions>*sizeof(<type>), cudaMemcpyHostToDevice);
3
+ bones_memcpy(device_<array>, <array><flatten>, <variable_dimensions>*sizeof(<type>), cudaMemcpyHostToDevice, <state>, <state>);
4
+ bones_synchronize(<state>);
@@ -0,0 +1 @@
1
+ <type>* device_<array>;
@@ -1,5 +1,4 @@
1
1
 
2
2
  // Create space for <array> on the device
3
- <type>* device_<array> = 0;
4
3
  cudaMalloc((void**)&device_<array>, <variable_dimensions>*sizeof(<type>));
5
- cudaMemset((void*)device_<array>, 0, <variable_dimensions>*sizeof(<type>));
4
+ //cudaMemset((void*)device_<array>, 0, <variable_dimensions>*sizeof(<type>));
@@ -0,0 +1,86 @@
1
+
2
+ #include <stdio.h>
3
+ #include <pthread.h>
4
+
5
+ ////////////////////////////////////////
6
+ ////////// Thread scheduler ////////////
7
+ ////////////////////////////////////////
8
+
9
+ // Memory copy and kernel streams
10
+ cudaStream_t kernel_stream;
11
+ cudaStream_t memory_stream;
12
+
13
+ // Task structure
14
+ typedef struct {
15
+ void *dst;
16
+ void *src;
17
+ int size;
18
+ enum cudaMemcpyKind direction;
19
+ int deadline;
20
+ volatile int status;
21
+ } Task;
22
+
23
+ // Task list
24
+ #define BONES_MAX_TASKS 100
25
+ Task tasks[BONES_MAX_TASKS];
26
+
27
+ // Scheduler status
28
+ volatile int bones_scheduler_done;
29
+
30
+ // Create synchronisation points
31
+ void bones_synchronize(int deadline) {
32
+ cudaStreamSynchronize(kernel_stream);
33
+ printf("Reached: syncpoint %d [worker]\n",deadline); fflush(stdout);
34
+ for (int t = 0; t <= BONES_MAX_TASKS; t++) {
35
+ if (tasks[t].deadline == deadline && tasks[t].status == 1) {
36
+ while(tasks[t].status != 2) { }
37
+ }
38
+ }
39
+ printf("Reached: syncpoint %d [all]\n",deadline); fflush(stdout);
40
+ }
41
+
42
+ // Add a new task
43
+ void bones_memcpy(void *dst, void *src, int size, enum cudaMemcpyKind direction, int deadline, int task_id) {
44
+ Task new_task = { .dst = dst, .src = src, .size = size, .direction = direction, .deadline = deadline, .status = 1 };
45
+ tasks[task_id] = new_task;
46
+ }
47
+
48
+ // Perform a task (CUDA memory copy)
49
+ void bones_scheduler_copy(Task current_task) {
50
+ usleep(400);
51
+ cudaMemcpyAsync(current_task.dst, current_task.src, current_task.size, current_task.direction, memory_stream);
52
+ cudaStreamSynchronize(memory_stream);
53
+ }
54
+
55
+ // Initialize the scheduler
56
+ void bones_initialize_scheduler(void) {
57
+ bones_scheduler_done = 0;
58
+ }
59
+
60
+ // The scheduler (infinite loop)
61
+ #define LARGE_INT 1000
62
+ void* bones_scheduler(void* ptr) {
63
+ cudaStreamCreate(&memory_stream);
64
+ while (bones_scheduler_done != 1) {
65
+
66
+ // Find the ready task with the earliest deadline
67
+ int found_deadline = LARGE_INT;
68
+ int found_task = LARGE_INT;
69
+ for (int t = 0; t <= BONES_MAX_TASKS; t++) {
70
+ if (tasks[t].status == 1) {
71
+ if (tasks[t].deadline < found_deadline) {
72
+ found_task = t;
73
+ found_deadline = tasks[t].deadline;
74
+ }
75
+ }
76
+ }
77
+
78
+ // Perform the found task
79
+ if (found_task != LARGE_INT) {
80
+ printf("Performing task %d, dl %d [scheduler]\n",found_task,tasks[found_task].deadline);
81
+ bones_scheduler_copy(tasks[found_task]);
82
+ tasks[found_task].status = 2;
83
+ }
84
+ }
85
+ cudaStreamDestroy(memory_stream);
86
+ }
@@ -1,6 +1,4 @@
1
1
 
2
2
  // Start the timer for the measurement of the kernel execution time
3
- cudaThreadSynchronize();
4
- cudaEvent_t bones_start2;
5
- cudaEventCreate(&bones_start2);
6
- cudaEventRecord(bones_start2,0);
3
+ //cudaStreamSynchronize(kernel_stream);
4
+ cudaEventRecord(bones_start2,kernel_stream);
@@ -1,10 +1,8 @@
1
1
 
2
2
  // Stop the timer for the measurement of the kernel execution time
3
- cudaThreadSynchronize();
4
- cudaEvent_t bones_stop2;
5
- cudaEventCreate(&bones_stop2);
6
- cudaEventRecord(bones_stop2,0);
3
+ //cudaStreamSynchronize(kernel_stream);
4
+ cudaEventRecord(bones_stop2,kernel_stream);
7
5
  cudaEventSynchronize(bones_stop2);
8
6
  float bones_timer2 = 0;
9
7
  cudaEventElapsedTime(&bones_timer2,bones_start2,bones_stop2);
10
- printf(">>>\t\t (<algorithm_basename>): Execution time [kernel ]: %.3lf ms \n", bones_timer2);
8
+ printf(">>>\t\t Execution time [kernel <algorithm_basename>]: %.3lf ms \n", bones_timer2);
@@ -0,0 +1,26 @@
1
+
2
+ ////////////////////////////////////////
3
+ //////////// Timers ////////////////////
4
+ ////////////////////////////////////////
5
+
6
+ // Timer
7
+ cudaEvent_t bones_start1;
8
+
9
+ // Start the timer for the measurement of the whole scop
10
+ void bones_timer_start() {
11
+ cudaDeviceSynchronize();
12
+ cudaEventCreate(&bones_start1);
13
+ cudaEventRecord(bones_start1,kernel_stream);
14
+ }
15
+
16
+ // End the timer for the measurement of the whole scop
17
+ void bones_timer_stop() {
18
+ cudaDeviceSynchronize();
19
+ cudaEvent_t bones_stop1;
20
+ cudaEventCreate(&bones_stop1);
21
+ cudaEventRecord(bones_stop1,kernel_stream);
22
+ cudaEventSynchronize(bones_stop1);
23
+ float bones_timer1 = 0;
24
+ cudaEventElapsedTime(&bones_timer1,bones_start1,bones_stop1);
25
+ printf(">>>\t\t Execution time [full scop]: %.3lf ms \n", bones_timer1);
26
+ }
@@ -72,11 +72,9 @@ __global__ void bones_kernel_<algorithm_name>_2(<in1_type><in1_devicepointer> <i
72
72
  // Function to start the kernel
73
73
  extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
74
74
  int bones_block_size;
75
- if (<parallelism> >= 64*512) { bones_block_size = 512;}
76
- else if (<parallelism> >= 64*256) { bones_block_size = 256;}
77
- else if (<parallelism> >= 64*128) { bones_block_size = 128;}
78
- else if (<parallelism> >= 64*64 ) { bones_block_size = 64; }
79
- else { bones_block_size = 32; }
75
+ if (<parallelism> >= 64*512 ) { bones_block_size = 512; }
76
+ else if (<parallelism> >= 64*256 ) { bones_block_size = 256; }
77
+ else { bones_block_size = 128; }
80
78
 
81
79
  // First perform some pre-shuffling (for the first input)
82
80
  <in0_type>* shuffled_<in0_name> = 0;
@@ -86,7 +84,7 @@ extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argumen
86
84
  bones_kernel_<algorithm_name>_1<<< bones_grid1, bones_threads1 >>>(<in0_name>, shuffled_<in0_name>, <argument_name>);
87
85
  <in0_type>* temp_<in0_name> = <in0_name>;
88
86
  <in0_name> = shuffled_<in0_name>;
89
- cudaFree(temp_<in0_name>);
87
+ //cudaFree(temp_<in0_name>);
90
88
 
91
89
  // First perform some pre-shuffling (for the second input)
92
90
  <in0_type>* shuffled_<in1_name> = 0;
@@ -96,7 +94,7 @@ extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argumen
96
94
  bones_kernel_<algorithm_name>_2<<< bones_grid2, bones_threads2 >>>(<in1_name>, shuffled_<in1_name>, <argument_name>);
97
95
  <in1_type>* temp_<in1_name> = <in1_name>;
98
96
  <in1_name> = shuffled_<in1_name>;
99
- cudaFree(temp_<in1_name>);
97
+ //cudaFree(temp_<in1_name>);
100
98
 
101
99
  // Then run the original kernel
102
100
  dim3 bones_threads0(bones_block_size);
@@ -46,11 +46,9 @@ __global__ void bones_kernel_<algorithm_name>_1(<in0_type><in0_devicepointer> <i
46
46
  // Function to start the kernel
47
47
  extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
48
48
  int bones_block_size;
49
- if (<parallelism> >= 64*512) { bones_block_size = 512;}
50
- else if (<parallelism> >= 64*256) { bones_block_size = 256;}
51
- else if (<parallelism> >= 64*128) { bones_block_size = 128;}
52
- else if (<parallelism> >= 64*64 ) { bones_block_size = 64; }
53
- else { bones_block_size = 32; }
49
+ if (<parallelism> >= 64*512 ) { bones_block_size = 512; }
50
+ else if (<parallelism> >= 64*256 ) { bones_block_size = 256; }
51
+ else { bones_block_size = 128; }
54
52
 
55
53
  // First perform some pre-shuffling
56
54
  <in0_type>* shuffled_<in0_name> = 0;
@@ -60,7 +58,7 @@ extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argumen
60
58
  bones_kernel_<algorithm_name>_1<<< bones_grid1, bones_threads1 >>>(<in0_name>, shuffled_<in0_name>, <argument_name>);
61
59
  <in0_type>* temp_<in0_name> = <in0_name>;
62
60
  <in0_name> = shuffled_<in0_name>;
63
- cudaFree(temp_<in0_name>);
61
+ //cudaFree(temp_<in0_name>);
64
62
 
65
63
  // Then run the original kernel
66
64
  dim3 bones_threads0(bones_block_size);
@@ -1,3 +1,3 @@
1
1
 
2
2
  // Start the CUDA function
3
- bones_prekernel_<algorithm_name>_0(<devicenames>, <argument_name>);
3
+ bones_prekernel_<algorithm_name>_0(kernel_stream, <devicenames>, <argument_name>);
@@ -1,5 +1,5 @@
1
1
  /* STARTDEF
2
- void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>);
2
+ void bones_prekernel_<algorithm_name>_0(cudaStream_t kernel_stream, <devicedefinitions>, <argument_definition>);
3
3
  ENDDEF */
4
4
  // Start of the <algorithm_name> kernel
5
5
  __global__ void bones_kernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
@@ -15,14 +15,12 @@ __global__ void bones_kernel_<algorithm_name>_0(<devicedefinitions>, <argument_d
15
15
  }
16
16
 
17
17
  // Function to start the kernel
18
- extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
18
+ extern "C" void bones_prekernel_<algorithm_name>_0(cudaStream_t kernel_stream, <devicedefinitions>, <argument_definition>) {
19
19
  int bones_block_size;
20
- if (<parallelism> >= 64*512) { bones_block_size = 512;}
21
- else if (<parallelism> >= 64*256) { bones_block_size = 256;}
22
- else if (<parallelism> >= 64*128) { bones_block_size = 128;}
23
- else if (<parallelism> >= 64*64 ) { bones_block_size = 64; }
24
- else { bones_block_size = 32; }
20
+ if (<parallelism> >= 64*512 ) { bones_block_size = 512; }
21
+ else if (<parallelism> >= 64*256 ) { bones_block_size = 256; }
22
+ else { bones_block_size = 128; }
25
23
  dim3 bones_threads(bones_block_size);
26
24
  dim3 bones_grid(DIV_CEIL(<parallelism>,bones_block_size));
27
- bones_kernel_<algorithm_name>_0<<< bones_grid, bones_threads >>>(<names>, <argument_name>);
25
+ bones_kernel_<algorithm_name>_0<<< bones_grid, bones_threads, 0, kernel_stream >>>(<names>, <argument_name>);
28
26
  }
@@ -19,12 +19,13 @@ N,N|chunk(D)+ ^ N,N|element+ -> N,N|element+ :defa
19
19
  D|chunk(D)+ -> D|element+ :default :00
20
20
  D|chunk(D)+ ^ D|neighbourhood(D)+ ^ D|element+ -> D|element+ :default :00
21
21
  D|chunk(D)+ ^ D|element+ -> D|element+ :default :00
22
- N|neighbourhood(N)+ -> N|element+ :N-neighbourhood-N-to-N-element :10
23
- D|neighbourhood(D)+ -> D|element+ :default :00
24
- D|neighbourhood(D)+ ^ D|element+ -> D|element+ :default :00
22
+ D|neighbourhood(D)+ -> D|element+ :default :40
23
+ D|neighbourhood(D)+ ^ D|element+ -> D|element+ :default :40
25
24
  D|element+ -> D|chunk(D)+ :default :00
26
- D|element+ -> D|element+ :default :00
25
+ D|element+ -> D|element+ :default :40
27
26
  D|element -> 1|shared :D-element-to-1-shared :02 03 04 05
28
27
  D|element+ -> D|shared+ :default :08
29
28
  D|element+ -> D|element+ ^ D|shared+ :default :08
30
- D|void -> D|element+ :default :00
29
+ D|void -> D|element+ :default :40
30
+
31
+ N|neighbourhood(N)+ -> N|element+ :N-neighbourhood-N-to-N-element :10
@@ -42,26 +42,30 @@ int main(void) {
42
42
  for (i=0; i<NI; i++) { for (j=0; j<NL; j++) { D[i][j] = ((float) i*(j+2)) / NK; } }
43
43
 
44
44
  // Perform the computation (E := alpha*A*B*C + beta*D)
45
- #pragma species kernel 0:NI-1,0:NK-1|chunk(0:0,0:NK-1) ^ 0:NK-1,0:NJ-1|chunk(0:NK-1,0:0) -> 0:NI-1,0:NJ-1|element
46
- for (i=0; i<NI; i++) {
47
- for (j=0; j<NJ; j++) {
48
- tmp[i][j] = 0;
49
- for (k=0; k<NK; k++) {
50
- tmp[i][j] += alpha * A[i][k] * B[k][j];
45
+ #pragma scop
46
+ {
47
+ #pragma species kernel A[0:NI-1,0:NK-1]|chunk(0:0,0:NK-1) ^ B[0:NK-1,0:NJ-1]|chunk(0:NK-1,0:0) -> tmp[0:NI-1,0:NJ-1]|element
48
+ for (i = 0; i < NI; i++) {
49
+ for (j = 0; j < NJ; j++) {
50
+ tmp[i][j] = 0;
51
+ for (k = 0; k < NK; k++) {
52
+ tmp[i][j] += alpha * A[i][k] * B[k][j];
53
+ }
51
54
  }
52
55
  }
53
- }
54
- #pragma species endkernel 2mm-part1
55
- #pragma species kernel 0:NI-1,0:NL-1|element ^ 0:NI-1,0:NJ-1|chunk(0:0,0:NJ-1) ^ 0:NJ-1,0:NL-1|chunk(0:NJ-1,0:0) -> 0:NI-1,0:NL-1|element
56
- for (i=0; i<NI; i++) {
57
- for (j=0; j<NL; j++) {
58
- D[i][j] *= beta;
59
- for (k=0; k<NJ; k++) {
60
- D[i][j] += tmp[i][k] * C[k][j];
56
+ #pragma species endkernel 2mm_k1
57
+ #pragma species kernel D[0:NI-1,0:NL-1]|element ^ tmp[0:NI-1,0:NJ-1]|chunk(0:0,0:NJ-1) ^ C[0:NJ-1,0:NL-1]|chunk(0:NJ-1,0:0) -> D[0:NI-1,0:NL-1]|element
58
+ for (i = 0; i < NI; i++) {
59
+ for (j = 0; j < NL; j++) {
60
+ D[i][j] *= beta;
61
+ for (k = 0; k < NJ; k++) {
62
+ D[i][j] += tmp[i][k] * C[k][j];
63
+ }
61
64
  }
62
65
  }
66
+ #pragma species endkernel 2mm_k2
63
67
  }
64
- #pragma species endkernel 2mm-part2
68
+ #pragma endscop
65
69
 
66
70
  // Clean-up and exit the function
67
71
  fflush(stdout);
@@ -0,0 +1,82 @@
1
+ //
2
+ // This file is part of the Bones source-to-source compiler examples. The C-code
3
+ // is largely identical in terms of functionality and variable naming to the code
4
+ // found in PolyBench/C version 3.2. For more information on PolyBench/C or Bones
5
+ // please use the contact information below.
6
+ //
7
+ // == More information on PolyBench/C
8
+ // Contact............Louis-Noel Pouchet <pouchet@cse.ohio-state.edu>
9
+ // Web address........http://polybench.sourceforge.net/
10
+ //
11
+ // == More information on Bones
12
+ // Contact............Cedric Nugteren <c.nugteren@tue.nl>
13
+ // Web address........http://parse.ele.tue.nl/bones/
14
+ //
15
+ // == File information
16
+ // Filename...........benchmark/3mm.c
17
+ // Author.............Cedric Nugteren
18
+ // Last modified on...03-April-2012
19
+ //
20
+
21
+ #include "common.h"
22
+
23
+ // This is '3mm', a 3 matrix multiply kernel
24
+ int main(void) {
25
+ int i,j,k;
26
+
27
+ // Declare arrays on the stack
28
+ float A[NI][NK];
29
+ float B[NK][NJ];
30
+ float C[NJ][NM];
31
+ float D[NM][NL];
32
+ float E[NI][NJ];
33
+ float F[NJ][NL];
34
+ float G[NI][NL];
35
+
36
+ // Set the input data
37
+ for (i=0; i<NI; i++) { for (j=0; j<NK; j++) { A[i][j] = ((float) i*j) / NI; } }
38
+ for (i=0; i<NK; i++) { for (j=0; j<NJ; j++) { B[i][j] = ((float) i*(j+1)) / NJ; } }
39
+ for (i=0; i<NL; i++) { for (j=0; j<NJ; j++) { C[i][j] = ((float) i*(j+3)) / NL; } }
40
+ for (i=0; i<NI; i++) { for (j=0; j<NL; j++) { D[i][j] = ((float) i*(j+2)) / NK; } }
41
+
42
+ // Perform the computation (G := E*F, with E := A*B and F := C*D)
43
+ #pragma scop
44
+ {
45
+ #pragma species kernel A[0:NI-1,0:NK-1]|chunk(0:0,0:NK-1) ^ B[0:NK-1,0:NJ-1]|chunk(0:NK-1,0:0) -> E[0:NI-1,0:NJ-1]|element
46
+ for (i = 0; i < NI; i++) {
47
+ for (j = 0; j < NJ; j++) {
48
+ E[i][j] = 0;
49
+ for (k = 0; k < NK; k++) {
50
+ E[i][j] += A[i][k] * B[k][j];
51
+ }
52
+ }
53
+ }
54
+ #pragma species endkernel 3mm_k1
55
+ #pragma species kernel C[0:NJ-1,0:NM-1]|chunk(0:0,0:NM-1) ^ D[0:NM-1,0:NL-1]|chunk(0:NM-1,0:0) -> F[0:NJ-1,0:NL-1]|element
56
+ for (i = 0; i < NJ; i++) {
57
+ for (j = 0; j < NL; j++) {
58
+ F[i][j] = 0;
59
+ for (k = 0; k < NM; k++) {
60
+ F[i][j] += C[i][k] * D[k][j];
61
+ }
62
+ }
63
+ }
64
+ #pragma species endkernel 3mm_k2
65
+ #pragma species kernel E[0:NI-1,0:NJ-1]|chunk(0:0,0:NJ-1) ^ F[0:NJ-1,0:NL-1]|chunk(0:NJ-1,0:0) -> G[0:NI-1,0:NL-1]|element
66
+ for (i = 0; i < NI; i++) {
67
+ for (j = 0; j < NL; j++) {
68
+ G[i][j] = 0;
69
+ for (k = 0; k < NJ; k++) {
70
+ G[i][j] += E[i][k] * F[k][j];
71
+ }
72
+ }
73
+ }
74
+ #pragma species endkernel 3mm_k3
75
+ }
76
+ #pragma endscop
77
+
78
+ // Clean-up and exit the function
79
+ fflush(stdout);
80
+ return 0;
81
+ }
82
+