bones-compiler 1.1.0 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (211)
  1. checksums.yaml +15 -0
  2. data/CHANGELOG +37 -0
  3. data/LICENSE +1 -1
  4. data/README.rdoc +95 -70
  5. data/Rakefile +78 -3
  6. data/VERSION +1 -1
  7. data/bin/adarwin +17 -0
  8. data/examples/benchmarks/PolyBench/2mm.c +104 -0
  9. data/examples/benchmarks/{3mm.c → PolyBench/3mm.c} +5 -2
  10. data/examples/benchmarks/{adi.c → PolyBench/adi.c} +6 -3
  11. data/examples/benchmarks/{atax.c → PolyBench/atax.c} +5 -2
  12. data/examples/benchmarks/{bicg.c → PolyBench/bicg.c} +5 -2
  13. data/examples/benchmarks/{cholesky.c → PolyBench/cholesky.c} +3 -0
  14. data/examples/benchmarks/{common.h → PolyBench/common.h} +2 -2
  15. data/examples/benchmarks/{correlation.c → PolyBench/correlation.c} +16 -7
  16. data/examples/benchmarks/{covariance.c → PolyBench/covariance.c} +7 -2
  17. data/examples/benchmarks/{doitgen.c → PolyBench/doitgen.c} +5 -2
  18. data/examples/benchmarks/{durbin.c → PolyBench/durbin.c} +3 -0
  19. data/examples/benchmarks/{dynprog.c → PolyBench/dynprog.c} +3 -0
  20. data/examples/benchmarks/{fdtd-2d-apml.c → PolyBench/fdtd-2d-apml.c} +3 -0
  21. data/examples/benchmarks/{fdtd-2d.c → PolyBench/fdtd-2d.c} +5 -2
  22. data/examples/benchmarks/{floyd-warshall.c → PolyBench/floyd-warshall.c} +3 -0
  23. data/examples/benchmarks/{gemm.c → PolyBench/gemm.c} +5 -2
  24. data/examples/benchmarks/{gemver.c → PolyBench/gemver.c} +5 -2
  25. data/examples/benchmarks/{gesummv.c → PolyBench/gesummv.c} +5 -2
  26. data/examples/benchmarks/{gramschmidt.c → PolyBench/gramschmidt.c} +3 -0
  27. data/examples/benchmarks/{jacobi-1d-imper.c → PolyBench/jacobi-1d-imper.c} +10 -2
  28. data/examples/benchmarks/{jacobi-2d-imper.c → PolyBench/jacobi-2d-imper.c} +8 -3
  29. data/examples/benchmarks/{lu.c → PolyBench/lu.c} +3 -0
  30. data/examples/benchmarks/{ludcmp.c → PolyBench/ludcmp.c} +3 -0
  31. data/examples/benchmarks/{mvt.c → PolyBench/mvt.c} +6 -2
  32. data/examples/benchmarks/{reg_detect.c → PolyBench/reg_detect.c} +3 -0
  33. data/examples/benchmarks/{seidel-2d.c → PolyBench/seidel-2d.c} +3 -0
  34. data/examples/benchmarks/{symm.c → PolyBench/symm.c} +3 -0
  35. data/examples/benchmarks/{syr2k.c → PolyBench/syr2k.c} +5 -2
  36. data/examples/benchmarks/{syrk.c → PolyBench/syrk.c} +7 -4
  37. data/examples/benchmarks/{trisolv.c → PolyBench/trisolv.c} +3 -0
  38. data/examples/benchmarks/{trmm.c → PolyBench/trmm.c} +3 -0
  39. data/examples/benchmarks/Rodinia/cfd.c +180 -0
  40. data/examples/benchmarks/Rodinia/hotspot.c +228 -0
  41. data/examples/benchmarks/Rodinia/kmeans.c +164 -0
  42. data/examples/benchmarks/Rodinia/srad.c +188 -0
  43. data/examples/benchmarks/other/common.h +0 -0
  44. data/examples/benchmarks/other/dct.c +58 -0
  45. data/examples/benchmarks/other/mm.c +50 -0
  46. data/examples/benchmarks/{saxpy.c → other/saxpy.c} +11 -7
  47. data/examples/chunk/{example1.c → example01.c} +0 -0
  48. data/examples/chunk/{example2.c → example02.c} +0 -0
  49. data/examples/chunk/{example3.c → example03.c} +0 -0
  50. data/examples/chunk/{example4.c → example04.c} +0 -0
  51. data/examples/chunk/{example5.c → example05.c} +0 -0
  52. data/examples/chunk/example06.c +45 -0
  53. data/examples/chunk/example07.c +49 -0
  54. data/examples/dependences/example01.c +42 -0
  55. data/examples/dependences/example02.c +40 -0
  56. data/examples/dependences/example03.c +43 -0
  57. data/examples/dependences/example04.c +44 -0
  58. data/examples/dependences/example05.c +42 -0
  59. data/examples/element/{example1.c → example01.c} +0 -0
  60. data/examples/element/{example2.c → example02.c} +2 -2
  61. data/examples/element/{example3.c → example03.c} +0 -0
  62. data/examples/element/{example4.c → example04.c} +0 -0
  63. data/examples/element/{example5.c → example05.c} +0 -0
  64. data/examples/element/{example6.c → example06.c} +0 -0
  65. data/examples/element/{example7.c → example07.c} +0 -0
  66. data/examples/element/{example8.c → example08.c} +0 -0
  67. data/examples/element/{example9.c → example09.c} +0 -0
  68. data/examples/element/example13.c +73 -0
  69. data/examples/fusion/example01.c +68 -0
  70. data/examples/fusion/example02.c +73 -0
  71. data/examples/fusion/example03.c +72 -0
  72. data/examples/fusion/example04.c +61 -0
  73. data/examples/fusion/example05.c +55 -0
  74. data/examples/neighbourhood/{example1.c → example01.c} +0 -0
  75. data/examples/neighbourhood/{example2.c → example02.c} +0 -0
  76. data/examples/neighbourhood/{example3.c → example03.c} +0 -0
  77. data/examples/neighbourhood/{example4.c → example04.c} +0 -0
  78. data/examples/neighbourhood/example05.c +44 -0
  79. data/examples/shared/{example1.c → example01.c} +0 -0
  80. data/examples/shared/{example2.c → example02.c} +0 -0
  81. data/examples/shared/{example3.c → example03.c} +0 -0
  82. data/examples/shared/{example4.c → example04.c} +0 -0
  83. data/examples/shared/{example5.c → example05.c} +0 -0
  84. data/lib/adarwin.rb +62 -0
  85. data/lib/adarwin/dependences.rb +268 -0
  86. data/lib/adarwin/engine.rb +277 -0
  87. data/lib/adarwin/fusion.rb +174 -0
  88. data/lib/adarwin/interval.rb +57 -0
  89. data/lib/adarwin/memorycopies.rb +153 -0
  90. data/lib/adarwin/nest.rb +225 -0
  91. data/lib/adarwin/preprocessor.rb +76 -0
  92. data/lib/adarwin/reference.rb +261 -0
  93. data/lib/bones.rb +4 -55
  94. data/lib/bones/algorithm.rb +77 -40
  95. data/lib/bones/copy.rb +26 -0
  96. data/lib/bones/engine.rb +147 -31
  97. data/lib/bones/preprocessor.rb +92 -12
  98. data/lib/bones/species.rb +4 -3
  99. data/lib/bones/structure.rb +14 -4
  100. data/lib/castaddon.rb +11 -6
  101. data/lib/castaddon/node_adarwin.rb +245 -0
  102. data/lib/castaddon/node_bones.rb +316 -0
  103. data/lib/castaddon/node_common.rb +289 -0
  104. data/lib/castaddon/transformations.rb +236 -0
  105. data/lib/common.rb +216 -0
  106. data/skeletons/CPU-C/common/header.c +3 -0
  107. data/skeletons/CPU-C/common/mem_global.c +0 -0
  108. data/skeletons/CPU-C/common/timer_2_start.c +11 -13
  109. data/skeletons/CPU-C/common/timer_2_stop.c +1 -1
  110. data/skeletons/CPU-C/common/timer_globals.c +29 -0
  111. data/skeletons/CPU-OPENCL-INTEL/common/globals.c +1 -1
  112. data/skeletons/CPU-OPENCL-INTEL/common/header.c +3 -0
  113. data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_D2H.c +7 -2
  114. data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_H2D.c +4 -2
  115. data/skeletons/CPU-OPENCL-INTEL/common/mem_global.c +0 -0
  116. data/skeletons/CPU-OPENCL-INTEL/common/mem_prologue.c +6 -3
  117. data/skeletons/CPU-OPENCL-INTEL/common/timer_2_stop.c +1 -1
  118. data/skeletons/CPU-OPENCL-INTEL/common/timer_globals.c +24 -0
  119. data/skeletons/CPU-OPENMP/common/globals.c +1 -0
  120. data/skeletons/CPU-OPENMP/common/header.c +3 -0
  121. data/skeletons/CPU-OPENMP/common/mem_global.c +0 -0
  122. data/skeletons/CPU-OPENMP/common/timer_1_start.c +0 -12
  123. data/skeletons/CPU-OPENMP/common/timer_2_stop.c +1 -1
  124. data/skeletons/CPU-OPENMP/common/timer_globals.c +33 -0
  125. data/skeletons/GPU-CUDA/common/globals.c +27 -3
  126. data/skeletons/GPU-CUDA/common/header.c +2 -0
  127. data/skeletons/GPU-CUDA/common/mem_async_alloc.c +6 -0
  128. data/skeletons/GPU-CUDA/common/mem_async_copyin.c +6 -0
  129. data/skeletons/GPU-CUDA/common/mem_async_copyout.c +6 -0
  130. data/skeletons/GPU-CUDA/common/mem_async_free.c +6 -0
  131. data/skeletons/GPU-CUDA/common/mem_copy_D2H.c +2 -1
  132. data/skeletons/GPU-CUDA/common/mem_copy_H2D.c +2 -1
  133. data/skeletons/GPU-CUDA/common/mem_global.c +1 -0
  134. data/skeletons/GPU-CUDA/common/mem_prologue.c +1 -2
  135. data/skeletons/GPU-CUDA/common/scheduler.c +86 -0
  136. data/skeletons/GPU-CUDA/common/timer_2_start.c +2 -4
  137. data/skeletons/GPU-CUDA/common/timer_2_stop.c +3 -5
  138. data/skeletons/GPU-CUDA/common/timer_globals.c +26 -0
  139. data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.kernel.cu +5 -7
  140. data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.kernel.cu +4 -6
  141. data/skeletons/GPU-CUDA/kernel/default.host.c +1 -1
  142. data/skeletons/GPU-CUDA/kernel/default.kernel.cu +6 -8
  143. data/skeletons/GPU-CUDA/skeletons.txt +6 -5
  144. data/{examples/benchmarks/2mm.c → test/examples/benchmarks/PolyBench/2mm_species.c} +19 -15
  145. data/test/examples/benchmarks/PolyBench/3mm_species.c +82 -0
  146. data/test/examples/benchmarks/PolyBench/adi_species.c +89 -0
  147. data/test/examples/benchmarks/PolyBench/atax_species.c +69 -0
  148. data/test/examples/benchmarks/PolyBench/bicg_species.c +71 -0
  149. data/test/examples/benchmarks/PolyBench/cholesky_species.c +68 -0
  150. data/test/examples/benchmarks/PolyBench/correlation_species.c +97 -0
  151. data/test/examples/benchmarks/PolyBench/covariance_species.c +78 -0
  152. data/test/examples/benchmarks/PolyBench/doitgen_species.c +67 -0
  153. data/test/examples/benchmarks/PolyBench/durbin_species.c +80 -0
  154. data/test/examples/benchmarks/PolyBench/dynprog_species.c +71 -0
  155. data/test/examples/benchmarks/PolyBench/fdtd-2d-apml_species.c +112 -0
  156. data/test/examples/benchmarks/PolyBench/fdtd-2d_species.c +78 -0
  157. data/test/examples/benchmarks/PolyBench/floyd-warshall_species.c +54 -0
  158. data/test/examples/benchmarks/PolyBench/gemm_species.c +73 -0
  159. data/test/examples/benchmarks/PolyBench/gemver_species.c +93 -0
  160. data/test/examples/benchmarks/PolyBench/gesummv_species.c +68 -0
  161. data/test/examples/benchmarks/PolyBench/gramschmidt_species.c +78 -0
  162. data/test/examples/benchmarks/PolyBench/jacobi-1d-imper_species.c +59 -0
  163. data/test/examples/benchmarks/PolyBench/jacobi-2d-imper_species.c +65 -0
  164. data/test/examples/benchmarks/PolyBench/lu_species.c +57 -0
  165. data/test/examples/benchmarks/PolyBench/ludcmp_species.c +89 -0
  166. data/test/examples/benchmarks/PolyBench/mvt_species.c +69 -0
  167. data/test/examples/benchmarks/PolyBench/reg_detect_species.c +86 -0
  168. data/test/examples/benchmarks/PolyBench/seidel-2d_species.c +53 -0
  169. data/test/examples/benchmarks/PolyBench/symm_species.c +74 -0
  170. data/test/examples/benchmarks/PolyBench/syr2k_species.c +69 -0
  171. data/test/examples/benchmarks/PolyBench/syrk_species.c +66 -0
  172. data/test/examples/benchmarks/PolyBench/trisolv_species.c +61 -0
  173. data/test/examples/benchmarks/PolyBench/trmm_species.c +61 -0
  174. data/test/examples/chunk/example01_species.c +58 -0
  175. data/test/examples/chunk/example02_species.c +48 -0
  176. data/test/examples/chunk/example03_species.c +63 -0
  177. data/test/examples/chunk/example04_species.c +58 -0
  178. data/test/examples/chunk/example05_species.c +56 -0
  179. data/test/examples/chunk/example06_species.c +49 -0
  180. data/test/examples/chunk/example07_species.c +53 -0
  181. data/test/examples/dependences/example01_species.c +46 -0
  182. data/test/examples/dependences/example02_species.c +44 -0
  183. data/test/examples/dependences/example03_species.c +47 -0
  184. data/test/examples/dependences/example04_species.c +48 -0
  185. data/test/examples/dependences/example05_species.c +46 -0
  186. data/test/examples/element/example01_species.c +50 -0
  187. data/test/examples/element/example02_species.c +50 -0
  188. data/test/examples/element/example03_species.c +62 -0
  189. data/test/examples/element/example04_species.c +53 -0
  190. data/test/examples/element/example05_species.c +59 -0
  191. data/test/examples/element/example06_species.c +50 -0
  192. data/test/examples/element/example07_species.c +58 -0
  193. data/test/examples/element/example08_species.c +49 -0
  194. data/test/examples/element/example09_species.c +52 -0
  195. data/test/examples/element/example10_species.c +54 -0
  196. data/test/examples/element/example11_species.c +51 -0
  197. data/test/examples/element/example12_species.c +60 -0
  198. data/test/examples/element/example13_species.c +77 -0
  199. data/test/examples/neighbourhood/example01_species.c +57 -0
  200. data/test/examples/neighbourhood/example02_species.c +56 -0
  201. data/test/examples/neighbourhood/example03_species.c +83 -0
  202. data/test/examples/neighbourhood/example04_species.c +55 -0
  203. data/test/examples/neighbourhood/example05_species.c +48 -0
  204. data/test/examples/shared/example01_species.c +49 -0
  205. data/test/examples/shared/example02_species.c +55 -0
  206. data/test/examples/shared/example03_species.c +59 -0
  207. data/test/examples/shared/example04_species.c +56 -0
  208. data/test/examples/shared/example05_species.c +52 -0
  209. metadata +193 -73
  210. data/examples/benchmarks/overview.txt +0 -38
  211. data/lib/castaddon/node.rb +0 -753
@@ -1,24 +1,42 @@
1
- #include <stdio.h>
2
- #include <cuda_runtime.h>
1
+
2
+ ////////////////////////////////////////
3
+ //////////// Globals ///////////////////
4
+ ////////////////////////////////////////
3
5
 
4
6
  #define BONES_MIN(a,b) ((a<b) ? a : b)
5
7
  #define BONES_MAX(a,b) ((a>b) ? a : b)
6
8
  #define DIV_CEIL(a,b) ((a+b-1)/b)
7
9
  #define DIV_FLOOR(a,b) (a/b)
8
10
 
9
- // Function to initialize the GPU (for fair measurements)
11
+ // CUDA timers
12
+ cudaEvent_t bones_start2;
13
+ cudaEvent_t bones_stop2;
14
+
15
+ // Function to initialize the GPU (for fair measurements, streams, timers)
10
16
  void bones_initialize_target(void) {
11
17
  int* bones_temporary = 0;
12
18
  cudaMalloc((void**)&bones_temporary, sizeof(int));
13
19
  cudaFree(bones_temporary);
20
+ cudaStreamCreate(&kernel_stream);
21
+ cudaEventCreate(&bones_start2);
22
+ cudaEventCreate(&bones_stop2);
14
23
  }
15
24
 
16
25
  // Declaration of the original function
17
26
  int bones_main(void);
18
27
 
28
+ ////////////////////////////////////////
29
+ //////////// Main function /////////////
30
+ ////////////////////////////////////////
31
+
19
32
  // New main function for initialisation and clean-up
20
33
  int main(void) {
21
34
 
35
+ // Initialisation of the scheduler
36
+ bones_initialize_scheduler();
37
+ pthread_t bones_scheduler_thread;
38
+ pthread_create(&bones_scheduler_thread, NULL, bones_scheduler, NULL);
39
+
22
40
  // Initialisation of the target
23
41
  bones_initialize_target();
24
42
 
@@ -26,6 +44,12 @@ int main(void) {
26
44
  int bones_return = bones_main();
27
45
 
28
46
  // Clean-up
47
+ bones_scheduler_done = 1;
48
+ pthread_join(bones_scheduler_thread, NULL);
49
+ cudaStreamDestroy(kernel_stream);
29
50
  return bones_return;
30
51
  }
31
52
 
53
+ ////////////////////////////////////////
54
+ ////////// Accelerated functions ///////
55
+ ////////////////////////////////////////
@@ -0,0 +1,2 @@
1
+ void bones_timer_start();
2
+ void bones_timer_stop();
@@ -0,0 +1,6 @@
1
+
2
+ // Create space for <array> on the device
3
+ void bones_alloc_<id>_<array>(void) {
4
+ cudaMalloc((void**)&device_<array>, <variable_dimensions>*sizeof(<type>));
5
+ cudaMemset((void*)device_<array>, 0, <variable_dimensions>*sizeof(<type>));
6
+ }
@@ -0,0 +1,6 @@
1
+
2
+ // Copy <array> to the device
3
+ void bones_copy<direction>_<id>_<array>(<definition>) {
4
+ cudaStreamSynchronize(kernel_stream);
5
+ bones_memcpy(device_<array>, <array><flatten>, <variable_dimensions>*sizeof(<type>), cudaMemcpyHostToDevice, <state>, <index>);
6
+ }
@@ -0,0 +1,6 @@
1
+
2
+ // Copy <array> from device to host
3
+ void bones_copy<direction>_<id>_<array>(<definition>) {
4
+ cudaStreamSynchronize(kernel_stream);
5
+ bones_memcpy(<array><flatten>+<offset>, device_<array>+<offset>, <variable_dimensions>*sizeof(<type>), cudaMemcpyDeviceToHost, <state>, <index>);
6
+ }
@@ -0,0 +1,6 @@
1
+
2
+ // Clean up array <array> from the device
3
+ void bones_free_<id>_<array>(void) {
4
+ cudaStreamSynchronize(kernel_stream);
5
+ cudaFree(device_<array>);
6
+ }
@@ -1,3 +1,4 @@
1
1
 
2
2
  // Copy <array> from device to host
3
- cudaMemcpy(<array><flatten>+<offset>, device_<array>+<offset>, <variable_dimensions>*sizeof(<type>), cudaMemcpyDeviceToHost);
3
+ bones_memcpy(<array><flatten>+<offset>, device_<array>+<offset>, <variable_dimensions>*sizeof(<type>), cudaMemcpyDeviceToHost, <state>, <state>);
4
+ bones_synchronize(<state>);
@@ -1,3 +1,4 @@
1
1
 
2
2
  // Copy <array> to the device
3
- cudaMemcpy(device_<array>, <array><flatten>, <variable_dimensions>*sizeof(<type>), cudaMemcpyHostToDevice);
3
+ bones_memcpy(device_<array>, <array><flatten>, <variable_dimensions>*sizeof(<type>), cudaMemcpyHostToDevice, <state>, <state>);
4
+ bones_synchronize(<state>);
@@ -0,0 +1 @@
1
+ <type>* device_<array>;
@@ -1,5 +1,4 @@
1
1
 
2
2
  // Create space for <array> on the device
3
- <type>* device_<array> = 0;
4
3
  cudaMalloc((void**)&device_<array>, <variable_dimensions>*sizeof(<type>));
5
- cudaMemset((void*)device_<array>, 0, <variable_dimensions>*sizeof(<type>));
4
+ //cudaMemset((void*)device_<array>, 0, <variable_dimensions>*sizeof(<type>));
@@ -0,0 +1,86 @@
1
+
2
+ #include <stdio.h>
3
+ #include <pthread.h>
4
+
5
+ ////////////////////////////////////////
6
+ ////////// Thread scheduler ////////////
7
+ ////////////////////////////////////////
8
+
9
+ // Memory copy and kernel streams
10
+ cudaStream_t kernel_stream;
11
+ cudaStream_t memory_stream;
12
+
13
+ // Task structure
14
+ typedef struct {
15
+ void *dst;
16
+ void *src;
17
+ int size;
18
+ enum cudaMemcpyKind direction;
19
+ int deadline;
20
+ volatile int status;
21
+ } Task;
22
+
23
+ // Task list
24
+ #define BONES_MAX_TASKS 100
25
+ Task tasks[BONES_MAX_TASKS];
26
+
27
+ // Scheduler status
28
+ volatile int bones_scheduler_done;
29
+
30
+ // Create synchronisation points
31
+ void bones_synchronize(int deadline) {
32
+ cudaStreamSynchronize(kernel_stream);
33
+ printf("Reached: syncpoint %d [worker]\n",deadline); fflush(stdout);
34
+ for (int t = 0; t <= BONES_MAX_TASKS; t++) {
35
+ if (tasks[t].deadline == deadline && tasks[t].status == 1) {
36
+ while(tasks[t].status != 2) { }
37
+ }
38
+ }
39
+ printf("Reached: syncpoint %d [all]\n",deadline); fflush(stdout);
40
+ }
41
+
42
+ // Add a new task
43
+ void bones_memcpy(void *dst, void *src, int size, enum cudaMemcpyKind direction, int deadline, int task_id) {
44
+ Task new_task = { .dst = dst, .src = src, .size = size, .direction = direction, .deadline = deadline, .status = 1 };
45
+ tasks[task_id] = new_task;
46
+ }
47
+
48
+ // Perform a task (CUDA memory copy)
49
+ void bones_scheduler_copy(Task current_task) {
50
+ usleep(400);
51
+ cudaMemcpyAsync(current_task.dst, current_task.src, current_task.size, current_task.direction, memory_stream);
52
+ cudaStreamSynchronize(memory_stream);
53
+ }
54
+
55
+ // Initialize the scheduler
56
+ void bones_initialize_scheduler(void) {
57
+ bones_scheduler_done = 0;
58
+ }
59
+
60
+ // The scheduler (infinite loop)
61
+ #define LARGE_INT 1000
62
+ void* bones_scheduler(void* ptr) {
63
+ cudaStreamCreate(&memory_stream);
64
+ while (bones_scheduler_done != 1) {
65
+
66
+ // Find the ready task with the earliest deadline
67
+ int found_deadline = LARGE_INT;
68
+ int found_task = LARGE_INT;
69
+ for (int t = 0; t <= BONES_MAX_TASKS; t++) {
70
+ if (tasks[t].status == 1) {
71
+ if (tasks[t].deadline < found_deadline) {
72
+ found_task = t;
73
+ found_deadline = tasks[t].deadline;
74
+ }
75
+ }
76
+ }
77
+
78
+ // Perform the found task
79
+ if (found_task != LARGE_INT) {
80
+ printf("Performing task %d, dl %d [scheduler]\n",found_task,tasks[found_task].deadline);
81
+ bones_scheduler_copy(tasks[found_task]);
82
+ tasks[found_task].status = 2;
83
+ }
84
+ }
85
+ cudaStreamDestroy(memory_stream);
86
+ }
@@ -1,6 +1,4 @@
1
1
 
2
2
  // Start the timer for the measurement of the kernel execution time
3
- cudaThreadSynchronize();
4
- cudaEvent_t bones_start2;
5
- cudaEventCreate(&bones_start2);
6
- cudaEventRecord(bones_start2,0);
3
+ //cudaStreamSynchronize(kernel_stream);
4
+ cudaEventRecord(bones_start2,kernel_stream);
@@ -1,10 +1,8 @@
1
1
 
2
2
  // Stop the timer for the measurement of the kernel execution time
3
- cudaThreadSynchronize();
4
- cudaEvent_t bones_stop2;
5
- cudaEventCreate(&bones_stop2);
6
- cudaEventRecord(bones_stop2,0);
3
+ //cudaStreamSynchronize(kernel_stream);
4
+ cudaEventRecord(bones_stop2,kernel_stream);
7
5
  cudaEventSynchronize(bones_stop2);
8
6
  float bones_timer2 = 0;
9
7
  cudaEventElapsedTime(&bones_timer2,bones_start2,bones_stop2);
10
- printf(">>>\t\t (<algorithm_basename>): Execution time [kernel ]: %.3lf ms \n", bones_timer2);
8
+ printf(">>>\t\t Execution time [kernel <algorithm_basename>]: %.3lf ms \n", bones_timer2);
@@ -0,0 +1,26 @@
1
+
2
+ ////////////////////////////////////////
3
+ //////////// Timers ////////////////////
4
+ ////////////////////////////////////////
5
+
6
+ // Timer
7
+ cudaEvent_t bones_start1;
8
+
9
+ // Start the timer for the measurement of the whole scop
10
+ void bones_timer_start() {
11
+ cudaDeviceSynchronize();
12
+ cudaEventCreate(&bones_start1);
13
+ cudaEventRecord(bones_start1,kernel_stream);
14
+ }
15
+
16
+ // End the timer for the measurement of the whole scop
17
+ void bones_timer_stop() {
18
+ cudaDeviceSynchronize();
19
+ cudaEvent_t bones_stop1;
20
+ cudaEventCreate(&bones_stop1);
21
+ cudaEventRecord(bones_stop1,kernel_stream);
22
+ cudaEventSynchronize(bones_stop1);
23
+ float bones_timer1 = 0;
24
+ cudaEventElapsedTime(&bones_timer1,bones_start1,bones_stop1);
25
+ printf(">>>\t\t Execution time [full scop]: %.3lf ms \n", bones_timer1);
26
+ }
@@ -72,11 +72,9 @@ __global__ void bones_kernel_<algorithm_name>_2(<in1_type><in1_devicepointer> <i
72
72
  // Function to start the kernel
73
73
  extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
74
74
  int bones_block_size;
75
- if (<parallelism> >= 64*512) { bones_block_size = 512;}
76
- else if (<parallelism> >= 64*256) { bones_block_size = 256;}
77
- else if (<parallelism> >= 64*128) { bones_block_size = 128;}
78
- else if (<parallelism> >= 64*64 ) { bones_block_size = 64; }
79
- else { bones_block_size = 32; }
75
+ if (<parallelism> >= 64*512 ) { bones_block_size = 512; }
76
+ else if (<parallelism> >= 64*256 ) { bones_block_size = 256; }
77
+ else { bones_block_size = 128; }
80
78
 
81
79
  // First perform some pre-shuffling (for the first input)
82
80
  <in0_type>* shuffled_<in0_name> = 0;
@@ -86,7 +84,7 @@ extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argumen
86
84
  bones_kernel_<algorithm_name>_1<<< bones_grid1, bones_threads1 >>>(<in0_name>, shuffled_<in0_name>, <argument_name>);
87
85
  <in0_type>* temp_<in0_name> = <in0_name>;
88
86
  <in0_name> = shuffled_<in0_name>;
89
- cudaFree(temp_<in0_name>);
87
+ //cudaFree(temp_<in0_name>);
90
88
 
91
89
  // First perform some pre-shuffling (for the second input)
92
90
  <in0_type>* shuffled_<in1_name> = 0;
@@ -96,7 +94,7 @@ extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argumen
96
94
  bones_kernel_<algorithm_name>_2<<< bones_grid2, bones_threads2 >>>(<in1_name>, shuffled_<in1_name>, <argument_name>);
97
95
  <in1_type>* temp_<in1_name> = <in1_name>;
98
96
  <in1_name> = shuffled_<in1_name>;
99
- cudaFree(temp_<in1_name>);
97
+ //cudaFree(temp_<in1_name>);
100
98
 
101
99
  // Then run the original kernel
102
100
  dim3 bones_threads0(bones_block_size);
@@ -46,11 +46,9 @@ __global__ void bones_kernel_<algorithm_name>_1(<in0_type><in0_devicepointer> <i
46
46
  // Function to start the kernel
47
47
  extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
48
48
  int bones_block_size;
49
- if (<parallelism> >= 64*512) { bones_block_size = 512;}
50
- else if (<parallelism> >= 64*256) { bones_block_size = 256;}
51
- else if (<parallelism> >= 64*128) { bones_block_size = 128;}
52
- else if (<parallelism> >= 64*64 ) { bones_block_size = 64; }
53
- else { bones_block_size = 32; }
49
+ if (<parallelism> >= 64*512 ) { bones_block_size = 512; }
50
+ else if (<parallelism> >= 64*256 ) { bones_block_size = 256; }
51
+ else { bones_block_size = 128; }
54
52
 
55
53
  // First perform some pre-shuffling
56
54
  <in0_type>* shuffled_<in0_name> = 0;
@@ -60,7 +58,7 @@ extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argumen
60
58
  bones_kernel_<algorithm_name>_1<<< bones_grid1, bones_threads1 >>>(<in0_name>, shuffled_<in0_name>, <argument_name>);
61
59
  <in0_type>* temp_<in0_name> = <in0_name>;
62
60
  <in0_name> = shuffled_<in0_name>;
63
- cudaFree(temp_<in0_name>);
61
+ //cudaFree(temp_<in0_name>);
64
62
 
65
63
  // Then run the original kernel
66
64
  dim3 bones_threads0(bones_block_size);
@@ -1,3 +1,3 @@
1
1
 
2
2
  // Start the CUDA function
3
- bones_prekernel_<algorithm_name>_0(<devicenames>, <argument_name>);
3
+ bones_prekernel_<algorithm_name>_0(kernel_stream, <devicenames>, <argument_name>);
@@ -1,5 +1,5 @@
1
1
  /* STARTDEF
2
- void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>);
2
+ void bones_prekernel_<algorithm_name>_0(cudaStream_t kernel_stream, <devicedefinitions>, <argument_definition>);
3
3
  ENDDEF */
4
4
  // Start of the <algorithm_name> kernel
5
5
  __global__ void bones_kernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
@@ -15,14 +15,12 @@ __global__ void bones_kernel_<algorithm_name>_0(<devicedefinitions>, <argument_d
15
15
  }
16
16
 
17
17
  // Function to start the kernel
18
- extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
18
+ extern "C" void bones_prekernel_<algorithm_name>_0(cudaStream_t kernel_stream, <devicedefinitions>, <argument_definition>) {
19
19
  int bones_block_size;
20
- if (<parallelism> >= 64*512) { bones_block_size = 512;}
21
- else if (<parallelism> >= 64*256) { bones_block_size = 256;}
22
- else if (<parallelism> >= 64*128) { bones_block_size = 128;}
23
- else if (<parallelism> >= 64*64 ) { bones_block_size = 64; }
24
- else { bones_block_size = 32; }
20
+ if (<parallelism> >= 64*512 ) { bones_block_size = 512; }
21
+ else if (<parallelism> >= 64*256 ) { bones_block_size = 256; }
22
+ else { bones_block_size = 128; }
25
23
  dim3 bones_threads(bones_block_size);
26
24
  dim3 bones_grid(DIV_CEIL(<parallelism>,bones_block_size));
27
- bones_kernel_<algorithm_name>_0<<< bones_grid, bones_threads >>>(<names>, <argument_name>);
25
+ bones_kernel_<algorithm_name>_0<<< bones_grid, bones_threads, 0, kernel_stream >>>(<names>, <argument_name>);
28
26
  }
@@ -19,12 +19,13 @@ N,N|chunk(D)+ ^ N,N|element+ -> N,N|element+ :defa
19
19
  D|chunk(D)+ -> D|element+ :default :00
20
20
  D|chunk(D)+ ^ D|neighbourhood(D)+ ^ D|element+ -> D|element+ :default :00
21
21
  D|chunk(D)+ ^ D|element+ -> D|element+ :default :00
22
- N|neighbourhood(N)+ -> N|element+ :N-neighbourhood-N-to-N-element :10
23
- D|neighbourhood(D)+ -> D|element+ :default :00
24
- D|neighbourhood(D)+ ^ D|element+ -> D|element+ :default :00
22
+ D|neighbourhood(D)+ -> D|element+ :default :40
23
+ D|neighbourhood(D)+ ^ D|element+ -> D|element+ :default :40
25
24
  D|element+ -> D|chunk(D)+ :default :00
26
- D|element+ -> D|element+ :default :00
25
+ D|element+ -> D|element+ :default :40
27
26
  D|element -> 1|shared :D-element-to-1-shared :02 03 04 05
28
27
  D|element+ -> D|shared+ :default :08
29
28
  D|element+ -> D|element+ ^ D|shared+ :default :08
30
- D|void -> D|element+ :default :00
29
+ D|void -> D|element+ :default :40
30
+
31
+ N|neighbourhood(N)+ -> N|element+ :N-neighbourhood-N-to-N-element :10
@@ -42,26 +42,30 @@ int main(void) {
42
42
  for (i=0; i<NI; i++) { for (j=0; j<NL; j++) { D[i][j] = ((float) i*(j+2)) / NK; } }
43
43
 
44
44
  // Perform the computation (E := alpha*A*B*C + beta*D)
45
- #pragma species kernel 0:NI-1,0:NK-1|chunk(0:0,0:NK-1) ^ 0:NK-1,0:NJ-1|chunk(0:NK-1,0:0) -> 0:NI-1,0:NJ-1|element
46
- for (i=0; i<NI; i++) {
47
- for (j=0; j<NJ; j++) {
48
- tmp[i][j] = 0;
49
- for (k=0; k<NK; k++) {
50
- tmp[i][j] += alpha * A[i][k] * B[k][j];
45
+ #pragma scop
46
+ {
47
+ #pragma species kernel A[0:NI-1,0:NK-1]|chunk(0:0,0:NK-1) ^ B[0:NK-1,0:NJ-1]|chunk(0:NK-1,0:0) -> tmp[0:NI-1,0:NJ-1]|element
48
+ for (i = 0; i < NI; i++) {
49
+ for (j = 0; j < NJ; j++) {
50
+ tmp[i][j] = 0;
51
+ for (k = 0; k < NK; k++) {
52
+ tmp[i][j] += alpha * A[i][k] * B[k][j];
53
+ }
51
54
  }
52
55
  }
53
- }
54
- #pragma species endkernel 2mm-part1
55
- #pragma species kernel 0:NI-1,0:NL-1|element ^ 0:NI-1,0:NJ-1|chunk(0:0,0:NJ-1) ^ 0:NJ-1,0:NL-1|chunk(0:NJ-1,0:0) -> 0:NI-1,0:NL-1|element
56
- for (i=0; i<NI; i++) {
57
- for (j=0; j<NL; j++) {
58
- D[i][j] *= beta;
59
- for (k=0; k<NJ; k++) {
60
- D[i][j] += tmp[i][k] * C[k][j];
56
+ #pragma species endkernel 2mm_k1
57
+ #pragma species kernel D[0:NI-1,0:NL-1]|element ^ tmp[0:NI-1,0:NJ-1]|chunk(0:0,0:NJ-1) ^ C[0:NJ-1,0:NL-1]|chunk(0:NJ-1,0:0) -> D[0:NI-1,0:NL-1]|element
58
+ for (i = 0; i < NI; i++) {
59
+ for (j = 0; j < NL; j++) {
60
+ D[i][j] *= beta;
61
+ for (k = 0; k < NJ; k++) {
62
+ D[i][j] += tmp[i][k] * C[k][j];
63
+ }
61
64
  }
62
65
  }
66
+ #pragma species endkernel 2mm_k2
63
67
  }
64
- #pragma species endkernel 2mm-part2
68
+ #pragma endscop
65
69
 
66
70
  // Clean-up and exit the function
67
71
  fflush(stdout);
@@ -0,0 +1,82 @@
1
+ //
2
+ // This file is part of the Bones source-to-source compiler examples. The C-code
3
+ // is largely identical in terms of functionality and variable naming to the code
4
+ // found in PolyBench/C version 3.2. For more information on PolyBench/C or Bones
5
+ // please use the contact information below.
6
+ //
7
+ // == More information on PolyBench/C
8
+ // Contact............Louis-Noel Pouchet <pouchet@cse.ohio-state.edu>
9
+ // Web address........http://polybench.sourceforge.net/
10
+ //
11
+ // == More information on Bones
12
+ // Contact............Cedric Nugteren <c.nugteren@tue.nl>
13
+ // Web address........http://parse.ele.tue.nl/bones/
14
+ //
15
+ // == File information
16
+ // Filename...........benchmark/3mm.c
17
+ // Author.............Cedric Nugteren
18
+ // Last modified on...03-April-2012
19
+ //
20
+
21
+ #include "common.h"
22
+
23
+ // This is '3mm', a 3 matrix multiply kernel
24
+ int main(void) {
25
+ int i,j,k;
26
+
27
+ // Declare arrays on the stack
28
+ float A[NI][NK];
29
+ float B[NK][NJ];
30
+ float C[NJ][NM];
31
+ float D[NM][NL];
32
+ float E[NI][NJ];
33
+ float F[NJ][NL];
34
+ float G[NI][NL];
35
+
36
+ // Set the input data
37
+ for (i=0; i<NI; i++) { for (j=0; j<NK; j++) { A[i][j] = ((float) i*j) / NI; } }
38
+ for (i=0; i<NK; i++) { for (j=0; j<NJ; j++) { B[i][j] = ((float) i*(j+1)) / NJ; } }
39
+ for (i=0; i<NL; i++) { for (j=0; j<NJ; j++) { C[i][j] = ((float) i*(j+3)) / NL; } }
40
+ for (i=0; i<NI; i++) { for (j=0; j<NL; j++) { D[i][j] = ((float) i*(j+2)) / NK; } }
41
+
42
+ // Perform the computation (G := E*F, with E := A*B and F := C*D)
43
+ #pragma scop
44
+ {
45
+ #pragma species kernel A[0:NI-1,0:NK-1]|chunk(0:0,0:NK-1) ^ B[0:NK-1,0:NJ-1]|chunk(0:NK-1,0:0) -> E[0:NI-1,0:NJ-1]|element
46
+ for (i = 0; i < NI; i++) {
47
+ for (j = 0; j < NJ; j++) {
48
+ E[i][j] = 0;
49
+ for (k = 0; k < NK; k++) {
50
+ E[i][j] += A[i][k] * B[k][j];
51
+ }
52
+ }
53
+ }
54
+ #pragma species endkernel 3mm_k1
55
+ #pragma species kernel C[0:NJ-1,0:NM-1]|chunk(0:0,0:NM-1) ^ D[0:NM-1,0:NL-1]|chunk(0:NM-1,0:0) -> F[0:NJ-1,0:NL-1]|element
56
+ for (i = 0; i < NJ; i++) {
57
+ for (j = 0; j < NL; j++) {
58
+ F[i][j] = 0;
59
+ for (k = 0; k < NM; k++) {
60
+ F[i][j] += C[i][k] * D[k][j];
61
+ }
62
+ }
63
+ }
64
+ #pragma species endkernel 3mm_k2
65
+ #pragma species kernel E[0:NI-1,0:NJ-1]|chunk(0:0,0:NJ-1) ^ F[0:NJ-1,0:NL-1]|chunk(0:NJ-1,0:0) -> G[0:NI-1,0:NL-1]|element
66
+ for (i = 0; i < NI; i++) {
67
+ for (j = 0; j < NL; j++) {
68
+ G[i][j] = 0;
69
+ for (k = 0; k < NJ; k++) {
70
+ G[i][j] += E[i][k] * F[k][j];
71
+ }
72
+ }
73
+ }
74
+ #pragma species endkernel 3mm_k3
75
+ }
76
+ #pragma endscop
77
+
78
+ // Clean-up and exit the function
79
+ fflush(stdout);
80
+ return 0;
81
+ }
82
+