bones-compiler 1.1.0 → 1.3.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (211) hide show
  1. checksums.yaml +15 -0
  2. data/CHANGELOG +37 -0
  3. data/LICENSE +1 -1
  4. data/README.rdoc +95 -70
  5. data/Rakefile +78 -3
  6. data/VERSION +1 -1
  7. data/bin/adarwin +17 -0
  8. data/examples/benchmarks/PolyBench/2mm.c +104 -0
  9. data/examples/benchmarks/{3mm.c → PolyBench/3mm.c} +5 -2
  10. data/examples/benchmarks/{adi.c → PolyBench/adi.c} +6 -3
  11. data/examples/benchmarks/{atax.c → PolyBench/atax.c} +5 -2
  12. data/examples/benchmarks/{bicg.c → PolyBench/bicg.c} +5 -2
  13. data/examples/benchmarks/{cholesky.c → PolyBench/cholesky.c} +3 -0
  14. data/examples/benchmarks/{common.h → PolyBench/common.h} +2 -2
  15. data/examples/benchmarks/{correlation.c → PolyBench/correlation.c} +16 -7
  16. data/examples/benchmarks/{covariance.c → PolyBench/covariance.c} +7 -2
  17. data/examples/benchmarks/{doitgen.c → PolyBench/doitgen.c} +5 -2
  18. data/examples/benchmarks/{durbin.c → PolyBench/durbin.c} +3 -0
  19. data/examples/benchmarks/{dynprog.c → PolyBench/dynprog.c} +3 -0
  20. data/examples/benchmarks/{fdtd-2d-apml.c → PolyBench/fdtd-2d-apml.c} +3 -0
  21. data/examples/benchmarks/{fdtd-2d.c → PolyBench/fdtd-2d.c} +5 -2
  22. data/examples/benchmarks/{floyd-warshall.c → PolyBench/floyd-warshall.c} +3 -0
  23. data/examples/benchmarks/{gemm.c → PolyBench/gemm.c} +5 -2
  24. data/examples/benchmarks/{gemver.c → PolyBench/gemver.c} +5 -2
  25. data/examples/benchmarks/{gesummv.c → PolyBench/gesummv.c} +5 -2
  26. data/examples/benchmarks/{gramschmidt.c → PolyBench/gramschmidt.c} +3 -0
  27. data/examples/benchmarks/{jacobi-1d-imper.c → PolyBench/jacobi-1d-imper.c} +10 -2
  28. data/examples/benchmarks/{jacobi-2d-imper.c → PolyBench/jacobi-2d-imper.c} +8 -3
  29. data/examples/benchmarks/{lu.c → PolyBench/lu.c} +3 -0
  30. data/examples/benchmarks/{ludcmp.c → PolyBench/ludcmp.c} +3 -0
  31. data/examples/benchmarks/{mvt.c → PolyBench/mvt.c} +6 -2
  32. data/examples/benchmarks/{reg_detect.c → PolyBench/reg_detect.c} +3 -0
  33. data/examples/benchmarks/{seidel-2d.c → PolyBench/seidel-2d.c} +3 -0
  34. data/examples/benchmarks/{symm.c → PolyBench/symm.c} +3 -0
  35. data/examples/benchmarks/{syr2k.c → PolyBench/syr2k.c} +5 -2
  36. data/examples/benchmarks/{syrk.c → PolyBench/syrk.c} +7 -4
  37. data/examples/benchmarks/{trisolv.c → PolyBench/trisolv.c} +3 -0
  38. data/examples/benchmarks/{trmm.c → PolyBench/trmm.c} +3 -0
  39. data/examples/benchmarks/Rodinia/cfd.c +180 -0
  40. data/examples/benchmarks/Rodinia/hotspot.c +228 -0
  41. data/examples/benchmarks/Rodinia/kmeans.c +164 -0
  42. data/examples/benchmarks/Rodinia/srad.c +188 -0
  43. data/examples/benchmarks/other/common.h +0 -0
  44. data/examples/benchmarks/other/dct.c +58 -0
  45. data/examples/benchmarks/other/mm.c +50 -0
  46. data/examples/benchmarks/{saxpy.c → other/saxpy.c} +11 -7
  47. data/examples/chunk/{example1.c → example01.c} +0 -0
  48. data/examples/chunk/{example2.c → example02.c} +0 -0
  49. data/examples/chunk/{example3.c → example03.c} +0 -0
  50. data/examples/chunk/{example4.c → example04.c} +0 -0
  51. data/examples/chunk/{example5.c → example05.c} +0 -0
  52. data/examples/chunk/example06.c +45 -0
  53. data/examples/chunk/example07.c +49 -0
  54. data/examples/dependences/example01.c +42 -0
  55. data/examples/dependences/example02.c +40 -0
  56. data/examples/dependences/example03.c +43 -0
  57. data/examples/dependences/example04.c +44 -0
  58. data/examples/dependences/example05.c +42 -0
  59. data/examples/element/{example1.c → example01.c} +0 -0
  60. data/examples/element/{example2.c → example02.c} +2 -2
  61. data/examples/element/{example3.c → example03.c} +0 -0
  62. data/examples/element/{example4.c → example04.c} +0 -0
  63. data/examples/element/{example5.c → example05.c} +0 -0
  64. data/examples/element/{example6.c → example06.c} +0 -0
  65. data/examples/element/{example7.c → example07.c} +0 -0
  66. data/examples/element/{example8.c → example08.c} +0 -0
  67. data/examples/element/{example9.c → example09.c} +0 -0
  68. data/examples/element/example13.c +73 -0
  69. data/examples/fusion/example01.c +68 -0
  70. data/examples/fusion/example02.c +73 -0
  71. data/examples/fusion/example03.c +72 -0
  72. data/examples/fusion/example04.c +61 -0
  73. data/examples/fusion/example05.c +55 -0
  74. data/examples/neighbourhood/{example1.c → example01.c} +0 -0
  75. data/examples/neighbourhood/{example2.c → example02.c} +0 -0
  76. data/examples/neighbourhood/{example3.c → example03.c} +0 -0
  77. data/examples/neighbourhood/{example4.c → example04.c} +0 -0
  78. data/examples/neighbourhood/example05.c +44 -0
  79. data/examples/shared/{example1.c → example01.c} +0 -0
  80. data/examples/shared/{example2.c → example02.c} +0 -0
  81. data/examples/shared/{example3.c → example03.c} +0 -0
  82. data/examples/shared/{example4.c → example04.c} +0 -0
  83. data/examples/shared/{example5.c → example05.c} +0 -0
  84. data/lib/adarwin.rb +62 -0
  85. data/lib/adarwin/dependences.rb +268 -0
  86. data/lib/adarwin/engine.rb +277 -0
  87. data/lib/adarwin/fusion.rb +174 -0
  88. data/lib/adarwin/interval.rb +57 -0
  89. data/lib/adarwin/memorycopies.rb +153 -0
  90. data/lib/adarwin/nest.rb +225 -0
  91. data/lib/adarwin/preprocessor.rb +76 -0
  92. data/lib/adarwin/reference.rb +261 -0
  93. data/lib/bones.rb +4 -55
  94. data/lib/bones/algorithm.rb +77 -40
  95. data/lib/bones/copy.rb +26 -0
  96. data/lib/bones/engine.rb +147 -31
  97. data/lib/bones/preprocessor.rb +92 -12
  98. data/lib/bones/species.rb +4 -3
  99. data/lib/bones/structure.rb +14 -4
  100. data/lib/castaddon.rb +11 -6
  101. data/lib/castaddon/node_adarwin.rb +245 -0
  102. data/lib/castaddon/node_bones.rb +316 -0
  103. data/lib/castaddon/node_common.rb +289 -0
  104. data/lib/castaddon/transformations.rb +236 -0
  105. data/lib/common.rb +216 -0
  106. data/skeletons/CPU-C/common/header.c +3 -0
  107. data/skeletons/CPU-C/common/mem_global.c +0 -0
  108. data/skeletons/CPU-C/common/timer_2_start.c +11 -13
  109. data/skeletons/CPU-C/common/timer_2_stop.c +1 -1
  110. data/skeletons/CPU-C/common/timer_globals.c +29 -0
  111. data/skeletons/CPU-OPENCL-INTEL/common/globals.c +1 -1
  112. data/skeletons/CPU-OPENCL-INTEL/common/header.c +3 -0
  113. data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_D2H.c +7 -2
  114. data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_H2D.c +4 -2
  115. data/skeletons/CPU-OPENCL-INTEL/common/mem_global.c +0 -0
  116. data/skeletons/CPU-OPENCL-INTEL/common/mem_prologue.c +6 -3
  117. data/skeletons/CPU-OPENCL-INTEL/common/timer_2_stop.c +1 -1
  118. data/skeletons/CPU-OPENCL-INTEL/common/timer_globals.c +24 -0
  119. data/skeletons/CPU-OPENMP/common/globals.c +1 -0
  120. data/skeletons/CPU-OPENMP/common/header.c +3 -0
  121. data/skeletons/CPU-OPENMP/common/mem_global.c +0 -0
  122. data/skeletons/CPU-OPENMP/common/timer_1_start.c +0 -12
  123. data/skeletons/CPU-OPENMP/common/timer_2_stop.c +1 -1
  124. data/skeletons/CPU-OPENMP/common/timer_globals.c +33 -0
  125. data/skeletons/GPU-CUDA/common/globals.c +27 -3
  126. data/skeletons/GPU-CUDA/common/header.c +2 -0
  127. data/skeletons/GPU-CUDA/common/mem_async_alloc.c +6 -0
  128. data/skeletons/GPU-CUDA/common/mem_async_copyin.c +6 -0
  129. data/skeletons/GPU-CUDA/common/mem_async_copyout.c +6 -0
  130. data/skeletons/GPU-CUDA/common/mem_async_free.c +6 -0
  131. data/skeletons/GPU-CUDA/common/mem_copy_D2H.c +2 -1
  132. data/skeletons/GPU-CUDA/common/mem_copy_H2D.c +2 -1
  133. data/skeletons/GPU-CUDA/common/mem_global.c +1 -0
  134. data/skeletons/GPU-CUDA/common/mem_prologue.c +1 -2
  135. data/skeletons/GPU-CUDA/common/scheduler.c +86 -0
  136. data/skeletons/GPU-CUDA/common/timer_2_start.c +2 -4
  137. data/skeletons/GPU-CUDA/common/timer_2_stop.c +3 -5
  138. data/skeletons/GPU-CUDA/common/timer_globals.c +26 -0
  139. data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.kernel.cu +5 -7
  140. data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.kernel.cu +4 -6
  141. data/skeletons/GPU-CUDA/kernel/default.host.c +1 -1
  142. data/skeletons/GPU-CUDA/kernel/default.kernel.cu +6 -8
  143. data/skeletons/GPU-CUDA/skeletons.txt +6 -5
  144. data/{examples/benchmarks/2mm.c → test/examples/benchmarks/PolyBench/2mm_species.c} +19 -15
  145. data/test/examples/benchmarks/PolyBench/3mm_species.c +82 -0
  146. data/test/examples/benchmarks/PolyBench/adi_species.c +89 -0
  147. data/test/examples/benchmarks/PolyBench/atax_species.c +69 -0
  148. data/test/examples/benchmarks/PolyBench/bicg_species.c +71 -0
  149. data/test/examples/benchmarks/PolyBench/cholesky_species.c +68 -0
  150. data/test/examples/benchmarks/PolyBench/correlation_species.c +97 -0
  151. data/test/examples/benchmarks/PolyBench/covariance_species.c +78 -0
  152. data/test/examples/benchmarks/PolyBench/doitgen_species.c +67 -0
  153. data/test/examples/benchmarks/PolyBench/durbin_species.c +80 -0
  154. data/test/examples/benchmarks/PolyBench/dynprog_species.c +71 -0
  155. data/test/examples/benchmarks/PolyBench/fdtd-2d-apml_species.c +112 -0
  156. data/test/examples/benchmarks/PolyBench/fdtd-2d_species.c +78 -0
  157. data/test/examples/benchmarks/PolyBench/floyd-warshall_species.c +54 -0
  158. data/test/examples/benchmarks/PolyBench/gemm_species.c +73 -0
  159. data/test/examples/benchmarks/PolyBench/gemver_species.c +93 -0
  160. data/test/examples/benchmarks/PolyBench/gesummv_species.c +68 -0
  161. data/test/examples/benchmarks/PolyBench/gramschmidt_species.c +78 -0
  162. data/test/examples/benchmarks/PolyBench/jacobi-1d-imper_species.c +59 -0
  163. data/test/examples/benchmarks/PolyBench/jacobi-2d-imper_species.c +65 -0
  164. data/test/examples/benchmarks/PolyBench/lu_species.c +57 -0
  165. data/test/examples/benchmarks/PolyBench/ludcmp_species.c +89 -0
  166. data/test/examples/benchmarks/PolyBench/mvt_species.c +69 -0
  167. data/test/examples/benchmarks/PolyBench/reg_detect_species.c +86 -0
  168. data/test/examples/benchmarks/PolyBench/seidel-2d_species.c +53 -0
  169. data/test/examples/benchmarks/PolyBench/symm_species.c +74 -0
  170. data/test/examples/benchmarks/PolyBench/syr2k_species.c +69 -0
  171. data/test/examples/benchmarks/PolyBench/syrk_species.c +66 -0
  172. data/test/examples/benchmarks/PolyBench/trisolv_species.c +61 -0
  173. data/test/examples/benchmarks/PolyBench/trmm_species.c +61 -0
  174. data/test/examples/chunk/example01_species.c +58 -0
  175. data/test/examples/chunk/example02_species.c +48 -0
  176. data/test/examples/chunk/example03_species.c +63 -0
  177. data/test/examples/chunk/example04_species.c +58 -0
  178. data/test/examples/chunk/example05_species.c +56 -0
  179. data/test/examples/chunk/example06_species.c +49 -0
  180. data/test/examples/chunk/example07_species.c +53 -0
  181. data/test/examples/dependences/example01_species.c +46 -0
  182. data/test/examples/dependences/example02_species.c +44 -0
  183. data/test/examples/dependences/example03_species.c +47 -0
  184. data/test/examples/dependences/example04_species.c +48 -0
  185. data/test/examples/dependences/example05_species.c +46 -0
  186. data/test/examples/element/example01_species.c +50 -0
  187. data/test/examples/element/example02_species.c +50 -0
  188. data/test/examples/element/example03_species.c +62 -0
  189. data/test/examples/element/example04_species.c +53 -0
  190. data/test/examples/element/example05_species.c +59 -0
  191. data/test/examples/element/example06_species.c +50 -0
  192. data/test/examples/element/example07_species.c +58 -0
  193. data/test/examples/element/example08_species.c +49 -0
  194. data/test/examples/element/example09_species.c +52 -0
  195. data/test/examples/element/example10_species.c +54 -0
  196. data/test/examples/element/example11_species.c +51 -0
  197. data/test/examples/element/example12_species.c +60 -0
  198. data/test/examples/element/example13_species.c +77 -0
  199. data/test/examples/neighbourhood/example01_species.c +57 -0
  200. data/test/examples/neighbourhood/example02_species.c +56 -0
  201. data/test/examples/neighbourhood/example03_species.c +83 -0
  202. data/test/examples/neighbourhood/example04_species.c +55 -0
  203. data/test/examples/neighbourhood/example05_species.c +48 -0
  204. data/test/examples/shared/example01_species.c +49 -0
  205. data/test/examples/shared/example02_species.c +55 -0
  206. data/test/examples/shared/example03_species.c +59 -0
  207. data/test/examples/shared/example04_species.c +56 -0
  208. data/test/examples/shared/example05_species.c +52 -0
  209. metadata +193 -73
  210. data/examples/benchmarks/overview.txt +0 -38
  211. data/lib/castaddon/node.rb +0 -753
checksums.yaml ADDED
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ NzNkZmYxNTA0YzhiYmU3NmVhY2QxYjU2MTNjNzIzOGQ3MzYwMjA1Yw==
5
+ data.tar.gz: !binary |-
6
+ ZjFjMzJmMjUzMDg5Y2UyZjRhZDU0MjgyMzkwY2NlM2EzYmE5OWJkZA==
7
+ SHA512:
8
+ metadata.gz: !binary |-
9
+ OWU3ODM5MzVhZjE1ZTY5M2VhYzdkNmE0MDQwNDZkNzRjNTE4NWQyMDk5N2Uw
10
+ NWM2NTM3NWFkNmMyYjM2M2NiYzVlOWZhZTQ2MDgwNzVmYzU0NzYxOGM0MDNj
11
+ YzljMDkzMTZiNDgxMjUzZWU4YzA4M2RiYTEyMGFlMGM4ZWI1OWQ=
12
+ data.tar.gz: !binary |-
13
+ MmI1YjBiOGUwMDJiN2MxNTE1MDgwOGExMTc1MjkxMWM3NzdmODFjNzk5MGY3
14
+ MGIzODVmMzExNTQ2MDc1OTA4M2E2NDc4YWNjYjEwYjRkMGJhNzM4OTZmMmZh
15
+ YTI0M2RkN2U1YWEyYWI4MTllYzgxOTQ5ODVjMjk3ZWEwOTc4NmU=
data/CHANGELOG CHANGED
@@ -1,8 +1,45 @@
1
+ ###################
2
+ ### v1.3 ###
3
+ ###################
4
+
5
+ General:
6
+ - Implemented kernel fusion in A-Darwin/Bones
7
+ - Added command-line options to test individual optimisations
8
+ - Enabled thread-merging (coarsening) for additional skeletons
9
+
10
+ Skeletons:
11
+ - Optimised the CUDA skeletons for threadblock sizing
12
+ - Fixed various bugs in the CPU-targets
13
+
14
+ Bug fixes:
15
+ - Synchronisation statements are now correctly inserted as pragmas
16
+ - Re-enabled the memory optimisations (-rfbl)
17
+ - Minor bug fixes in A-Darwin
18
+
19
+ ###################
20
+ ### v1.2 ###
21
+ ###################
22
+
23
+ General:
24
+ - Integrated A-Darwin (a tool to automatically extract algorithmic species) into the Bones source-tree.
25
+ - Added memory transfer optimisations (e.g. CPU-GPU copy optimisations).
26
+
27
+ A-Darwin:
28
+ - Initial test release of A-Darwin (with many limitations)
29
+ - Added a 'golden' reference set to test the extraction of species
30
+ - Added dependence-test examples
31
+
32
+ Miscellaneous:
33
+ - Substituted the simplify method by a symbolic math library gem ('symbolic').
34
+ - Updated the readme file
35
+ - Updated the examples
36
+
1
37
  ###################
2
38
  ### v1.1 ###
3
39
  ###################
4
40
 
5
41
  General:
42
+ - Initial release to RubyGems and initial commit to github.
6
43
  - Added support for a compiler optimisation pass: array substitution with a local register copy in the case of chunk to element species.
7
44
  - Added support for a compiler optimisation pass: thread-merging, potentially improving re-use through locality at the cost of parallelism.
8
45
  - Updated and added examples.
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2012 Cedric Nugteren, Eindhoven University of Technology, The Netherlands
1
+ Copyright (c) 2013 Cedric Nugteren, Eindhoven University of Technology, The Netherlands
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
4
 
data/README.rdoc CHANGED
@@ -1,4 +1,11 @@
1
+
2
+ = Bones and A-Darwin
3
+ This README covers both the source-to-source compiler Bones and the species extraction tool A-Darwin. Please refer to the corresponding sections for documentation.
4
+
5
+
1
6
  = Bones
7
+
8
+ == Introduction
2
9
  Recent advances in multi-core and many-core processors require programmers to exploit an increasing amount of parallelism from their applications. Data parallel languages such as CUDA and OpenCL make it possible to take advantage of such processors, but still require a large amount of effort from programmers. To address the challenge of parallel programming, we introduce Bones.
3
10
 
4
11
  Bones is a source-to-source compiler based on algorithmic skeletons and a new algorithm classification (named 'algorithmic species'). The compiler takes C-code annotated with class information as input and generates parallelized target code. Targets include NVIDIA GPUs (through CUDA), AMD GPUs (through OpenCL) and CPUs (through OpenCL and OpenMP). Bones is open-source, written in the Ruby programming language, and is available through our website. The compiler is based on the C-parser CAST, which is used to parse the input code into an abstract syntax tree (AST) and to generate the target code from a transformed AST.
@@ -7,53 +14,26 @@ Bones is a source-to-source compiler based on algorithmic skeletons and a new al
7
14
  The usage is as follows:
8
15
  bones --application <input> --target <target> [OPTIONS]
9
16
  With the following flags:
10
- --application, -a <s>: Input application file
11
- --target, -t <s>: Target processor (choose from: GPU-CUDA, GPU-OPENCL-AMD,
12
- CPU-OPENCL-INTEL, CPU-OPENCL-AMD, CPU-OPENMP, CPU-C)
13
- --measurements, -m: Enable/disable timers
14
- --verify, -c: Verify correctness of the generated code
15
- --version, -v: Print version and exit
16
- --help, -h: Show this message
17
+ --application, -a <s>: Input application file
18
+ --target, -t <s>: Target processor (choose from: CPU-C, CPU-OPENCL-AMD,
19
+ CPU-OPENCL-INTEL, CPU-OPENMP, GPU-CUDA, GPU-OPENCL-AMD)
20
+ --measurements, -m: Enable/disable timers
21
+ --verify, -c: Verify correctness of the generated code
22
+ --only-alg-number, -o <i>: Only generate code for the x-th species (99 -> all)
23
+ --merge-factor, -f <i>: Thread merge factor, default is 1 (==disabled)
24
+ --register-caching, -r <i>: Enable register caching: 1:enabled (default), 0:disabled
25
+ --zero-copy, -z <i>: Enable OpenCL zero-copy: 1:enabled (default), 0:disabled
26
+ --skeletons, -s <i>: Enable non-default skeletons: 1:enabled (default), 0:disabled
27
+ --version, -v: Print version and exit
28
+ --help, -h: Show this message
17
29
 
18
30
  Bones can be invoked from the command-line. Two arguments (-a and -t) are mandatory, others are optional. This is an example of the usage of Bones assuming the file '+example.c+' to be present:
19
31
  bones -a example.c -t GPU-CUDA -c
20
32
 
21
33
  == Examples
22
- The best place to start experimenting with Bones is the '+examples+' directory. A large number of examples are available in this folder, grouped by algorithmic species (either element, neighbourhood, shared or chunk). The examples illustrate different kinds of coding styles and give a large number of different classes to work with. The folder '+benchmarks+' gives more examples, taken from the PolyBench/C benchmark set. Additionally, a folder '+applications+' is included, containing example complete applications. Currently, the following examples are available:
23
- |-- element |-- applications
24
- | |-- example1.c | \-- ffos.c
25
- | |-- example2.c \-- benchmarks
26
- | |-- example3.c |-- 2mm.c
27
- | |-- example4.c |-- 3mm.c
28
- | |-- example5.c |-- adi.c
29
- | |-- example6.c |-- atax.c
30
- | |-- example7.c |-- bicg.c
31
- | |-- example8.c |-- cholesky.c
32
- | |-- example9.c |-- correlation.c
33
- | |-- example10.c |-- covariance.c
34
- | |-- example11.c |-- doitgen.c
35
- | \-- example12.c |-- durbin.c
36
- |-- neighbourhood |-- dynprog.c
37
- | |-- example1.c |-- fdtd-2d-apml.c
38
- | |-- example2.c |-- fdtd-2d.c
39
- | |-- example3.c |-- floyd-warshall.c
40
- | \-- example4.c |-- gemm.c
41
- |-- shared |-- gemver.c
42
- | |-- example1.c |-- gesummv.c
43
- | |-- example2.c |-- jacobi-1d-imper.c
44
- | |-- example3.c |-- jacobi-2d-imper.c
45
- | |-- example4.c |-- lu.c
46
- | \-- example5.c |-- ludcmp.c
47
- |-- chunk |-- mvt.c
48
- | |-- example1.c |-- reg_detect.c
49
- | |-- example2.c |-- saxpy.c
50
- | |-- example3.c |-- seidel-2d.c
51
- | |-- example4.c |-- syr2k.c
52
- | \-- example5.c |-- syrk.c
53
- |-- trisolv.c
54
- \-- trmm.c
55
-
56
- All examples can be ran through Bones for a specific target using an automated Rake task. Executing '<tt>rake examples:generate</tt>' or simply '+rake+' will execute Bones for all examples for a given target. The target can be changed in the '+Rakefile+' found in the root directory of Bones.
34
+ The best place to start experimenting with Bones is the '+examples+' directory. A large number of examples are available in this folder, grouped by algorithmic species (either element, neighbourhood, shared or chunk). The examples illustrate different kinds of coding styles and give a large number of different classes to work with. The folder '+benchmarks+' gives more examples, taken from the PolyBench/C benchmark set. Additionally, a folder '+applications+' is included, containing example complete applications.
35
+
36
+ All examples can be run through Bones for a specific target using an automated Rake task. Executing '<tt>rake examples:generate</tt>' or simply '+rake+' will execute Bones for all examples for a given target. The target can be changed in the '+Rakefile+' found in the root directory of Bones.
57
37
 
58
38
  == Limitations
59
39
  Bones takes C99 source code as input. However, several coding styles are unsupported as of now or might yield worse performance compared to others. The numerous examples provided should give the user an idea of the possibilities and limitations of the tool. A complete list of coding guidelines and limitations will follow in the future. Currently, an initial list of major limitations and guidelines is given below. In this list, we use '+algorithm+' to denote an algorithm captured by an algorithmic species.
@@ -65,12 +45,47 @@ Bones takes C99 source code as input. However, several coding styles are unsuppo
65
45
  * Bones is designed to work on a single input file with at least a function called 'main'. If your (to-be-accelerated) code spawns over multiple C-files, Bones could either be applied multiple times, or the code could be merged into a single file.
66
46
 
67
47
 
48
+ = A-Darwin
49
+
50
+ == Introduction
51
+ The original algorithmic species theory included ASET, a polyhedral based algorithmic species extraction tool. Along with a new non-polyhedral theory, we present a new automatic extraction tool named A-Darwin (short for `automatic Darwin').
52
+
53
+ The new tool is largely equal to ASET in terms of functionality, but is different internally. The tool is based on CAST, a C99 parser which allows analysis on an abstract syntax tree (AST). From the AST, the tool extracts the array references and constructs a 5 or 6-tuple for each loop nest. Next, merging is applied and the species are extracted. Finally, the species are inserted as pragmas in the original source code. To perform the dependence tests in A-Darwin, we make use of a combination of the GCD and Banerjee tests. Together, these tests are conservative, i.e. we might not find all species.
54
+
55
+ == Usage
56
+ The usage is as follows:
57
+ adarwin --application <input> [OPTIONS]
58
+ With the following flags:
59
+ --application, -a <s>: Input application file
60
+ --no-memory-annotations, -m: Disable the printing of memory annotations
61
+ --mem-remove-spurious, -r: Memcopy optimisation: remove spurious copies
62
+ --mem-copyin-to-front, -f: Memcopy optimisation: move copyins to front
63
+ --mem-copyout-to-back, -b: Memcopy optimisation: move copyouts to back
64
+ --mem-to-outer-loop, -l: Memcopy optimisation: move copies to outer loops
65
+ --fusion, -k <i>: Type of kernel fusion to perform (0 -> disable)
66
+ --print-arc, -c: Print array reference characterisations (ARC) instead of species
67
+ --silent, -s: Become silent (no message printing)
68
+ --only-alg-number, -o <i>: Only generate code for the x-th species (99 -> all)
69
+ --version, -v: Print version and exit
70
+ --help, -h: Show this message
71
+
72
+ A-Darwin can be invoked from the command-line. One argument (-a) is mandatory, others are optional. This is an example of the usage of A-Darwin assuming the file '+example.c+' to be present:
73
+ adarwin -a example.c -m -s
74
+
75
+ For now, it is recommended to use the '-m' flag. The memory optimisation flags ('-rfbl') are not fully tested yet. For a more fine-grained classification, A-Darwin is able to print the internal array reference characterisations (ARC) instead (use the '-c' flag).
76
+
77
+ == Known limitations
78
+ * The dependence test is not reliable yet
79
+ * Code similar to the failing examples is not supported yet
80
+ * Multi-line comments with pre-processor directives inside will not be considered commented out.
81
+ * Custom defined types are not supported. Apart from the default C99 types, FILE and size_t are supported.
82
+
68
83
 
69
84
  = Installation procedure
70
- Installation of Bones is a simple matter of extracting the Bones package to a directory of your choice or installing the gem ('<tt>gem install bones-compiler</tt>'). However, there are a number of prerequisites.
85
+ Installation of Bones and A-Darwin is a simple matter of extracting the Bones/A-Darwin package to a directory of your choice. Bones can also be installed as a gem ('<tt>gem install bones-compiler</tt>'). However, there are a number of prerequisites before doing this.
71
86
 
72
87
  == Prerequisites
73
- Bones requires the installation of Ruby, the Rubygems gem package manager and two gems:
88
+ Bones/A-Darwin requires the installation of Ruby, the Rubygems gem package manager and several gems:
74
89
  1. Any version of *Ruby* *1.8* or *1.9*. Information on Ruby is found at http://www.ruby-lang.org
75
90
  * [OS X]: Ruby is pre-installed on any OS X system since Tiger (10.4).
76
91
  * [Linux]: Ruby is pre-installed on some Linux based systems. Most Linux package managers (yum, apt-get) will be able to provide a Ruby installation. Make sure that the ruby development package ('+ruby-devel+') is also installed, as it is required by one of the gems.
@@ -79,48 +94,58 @@ Bones requires the installation of Ruby, the Rubygems gem package manager and tw
79
94
  * [OS X]: Rubygems is pre-installed on any OS X system since Tiger (10.4).
80
95
  * [Linux]: Most Linux package managers will be able to provide a Rubygems installation by installing the package '+rubygems+'.
81
96
  * [Windows]: Rubygems for Windows is obtained automatically when installing from http://rubyinstaller.org/
82
- 3. Bones requires two gems, *trollop* and *cast*. Both gems can be installed by calling Rubygems from the command line, i.e.: '<tt>gem install trollop cast</tt>'.
97
+ 3. Bones/A-Darwin require the gems, *trollop*, *cast*, and *symbolic*. These gems can be installed by calling Rubygems from the command line, i.e.: '<tt>gem install trollop cast symbolic</tt>'.
83
98
 
84
99
  For example, all prerequisites can be installed as follows on a Fedora, Red-Hat or CentOS system:
85
100
  yum install ruby ruby-devel rubygems
86
- gem install trollop cast
101
+ gem install trollop cast symbolic
87
102
  For an Ubuntu, Debian or Mint system, the equivalent commands are:
88
103
  apt-get install ruby ruby-devel rubygems
89
- gem install trollop cast
104
+ gem install trollop cast symbolic
90
105
 
91
- == Installing Bones
92
- To install the compiler, simply extract the '<tt>bones\_x.x.tar.gz</tt>' package to a directory of your choice. The Bones executable is found in the '+bin+' subdirectory. Including the path to the '+bin+' directory to your environmental variable '+PATH+' will make Bones available from any directory on your machine. Starting at version 1.1, Bones is also available as a gem ('<tt>gem install bones-compiler</tt>').
106
+ == Installing Bones/A-Darwin manually
107
+ To install the tools manually, simply extract the '<tt>bones\_x.x.tar.gz</tt>' or '<tt>adarwin\_x.x.tar.gz</tt>' package into a directory of your choice. The Bones/A-Darwin executables are found in the '+bin+' subdirectory. Adding the path of the '+bin+' directory to your environment variable '+PATH+' will make Bones/A-Darwin available from any directory on your machine. Starting at version 1.1, Bones and A-Darwin are also available as a gem ('<tt>gem install bones-compiler</tt>').
93
108
 
94
109
 
95
110
  = Documentation
96
- There are two ways to go to obtain more information regarding Bones. To obtain more information about the compiler itself, the ideas behind it and the algorithm classification, it is a good idea to read scientific publications. To get more information about the code structure, HTML documentation can be generated automatically using RDoc.
111
+ There are two ways to obtain more information regarding Bones/A-Darwin. To obtain more information about the tools themselves, the ideas behind them and the algorithm classification, it is a good idea to read scientific publications. To get more information about the code structure, HTML documentation can be generated automatically using RDoc.
97
112
 
98
113
  == Code documentation
99
- Code documentation can be generated automatically using RDoc. Navigate to the installation root of Bones and use Rake to generate documentation: '<tt>rake rdoc</tt>'. More information on using Rake is provided later in this document. Next, open '<tt>rdoc/index.html</tt>' to navigate through the documentation. The same documentation is also available on the web at http://parse.ele.tue.nl/tools/bones/rdoc/.
114
+ Code documentation can be generated automatically using RDoc. Navigate to the installation root of Bones/A-Darwin and use Rake to generate documentation: '<tt>rake rdoc</tt>'. More information on using Rake is provided later in this document. Next, open '<tt>rdoc/index.html</tt>' to navigate through the documentation. The same documentation is also available on the web at http://parse.ele.tue.nl/tools/bones/rdoc/.
100
115
 
101
116
  == Scientific publications
102
- Scientific publications related to Bones can be obtained from http://parse.ele.tue.nl/publications. Two publications are relevant:
103
- 1. <b>A Modular and Parameterisable Classification of Algorithms</b>, which provides details on the used algorithm classification. When refering to the algorithm classification in scientific work, you are kindly asked to include the following citations:
104
-
105
- @TECHREPORT{Nugteren2011,
106
- author = {Cedric Nugteren and Henk Corporaal},
107
- title = {{A Modular and Parameterisable Classification of Algorithms}},
108
- institution = {Eindhoven University of Technology},
109
- year = {2011},
110
- number = {No. ESR-2011-02},
117
+ Scientific publications related to Bones/A-Darwin can be obtained from http://www.cedricnugteren.nl/publications. Several publications are relevant:
118
+
119
+ 1. <b>Algorithmic Species Revisited: A Program Code Classification Based on Array References</b>, which provides details on the algorithm classification (the species) and A-Darwin (the tool). When referring to the algorithm classification in scientific work, you are kindly asked to include the following citation:
120
+
121
+ @INPROCEEDINGS{Nugteren2013a,
122
+ author = {Cedric Nugteren and Rosilde Corvino and Henk Corporaal},
123
+ title = {Algorithmic Species Revisited: A Program Code Classification Based on Array References},
124
+ booktitle = {MuCoCoS '13: International Workshop on Multi-/Many-core Computing Systems},
125
+ year = {2013},
111
126
  }
112
- 2. <b>Introducing 'Bones': A Parallelizing Source-to-Source Compiler Based on Algorithmic Skeletons</b>, which introduces the Bones source-to-source compiler. When refering to Bones in scientific work, you are kindly asked to include the following citations:
113
-
114
- @INPROCEEDINGS{Nugteren2012,
115
- author = {Cedric Nugteren and Henk Corporaal},
116
- title = {{Introducing `Bones': A Parallelizing Source-to-Source Compiler
117
- Based on Algorithmic Skeletons}},
118
- booktitle = {{GPGPU-5: 5th Workshop on General Purpose Processing on
119
- Graphics Processing Units}},
120
- year = {2012},
127
+
128
+ 2. <b>Automatic Skeleton-Based Compilation through Integration with an Algorithm Classification</b>, which discusses the Bones source-to-source compiler. When referring to Bones in scientific work, you are kindly asked to include the following citation:
129
+
130
+ @INPROCEEDINGS{Nugteren2013b,
131
+ author = {Cedric Nugteren and Pieter Custers and Henk Corporaal},
132
+ title = {Automatic Skeleton-Based Compilation through Integration with an Algorithm Classification},
133
+ booktitle = {APPT '13: Advanced Parallel Processing Technology},
134
+ year = {2013},
121
135
  }
122
136
 
123
137
 
138
+ = Rake
139
+ Rake is Ruby's make and can be used to automate tasks. By invoking '<tt>rake -T</tt>', a list of commands will become available. For example, for A-Darwin, the following rake commands are available:
140
+ rake adarwin[file] # Extract species descriptions using A-Darwin
141
+ rake adarwin_test # Test A-Darwin`s output against golden samples
142
+ rake clean # Remove any temporary products.
143
+ rake clobber # Remove any generated file.
144
+ rake clobber_rdoc # Remove RDoc HTML files
145
+ rake rdoc # Build RDoc HTML files
146
+ rake rerdoc # Rebuild RDoc HTML files
147
+ With rake, A-Darwin can be tested on a set of examples by running '<tt>rake adarwin_test</tt>'. Pre-created golden samples are available in the '+test+' folder.
148
+
124
149
 
125
150
  = Questions
126
- Questions can be directed by email. You can find contact details on the personal page of the author at http://parse.ele.tue.nl/cnugteren or on the project page at github.
151
+ Questions can be directed by email. You can find contact details on the personal page of the author at http://www.cedricnugteren.nl/ or http://parse.ele.tue.nl/cnugteren/ or on the project page at github.
data/Rakefile CHANGED
@@ -6,7 +6,27 @@ require 'rake/clean'
6
6
  EXAMPLES = File.join('examples','benchmarks','*.c')
7
7
 
8
8
  # Set the clean/clobber tasks
9
- CLOBBER.include(Dir[File.join('examples','*','*_*-*')])
9
+ CLOBBER.include(Dir[
10
+ File.join('examples','*_species.c'),
11
+ File.join('examples','*','*_*-*'),
12
+ File.join('examples','*','*_species.c'),
13
+ File.join('examples','benchmarks','*','*_*-*'),
14
+ File.join('examples','benchmarks','*','*_species.c')
15
+ ])
16
+
17
+ # Set the location of the examples
18
+ ADARWIN_EXAMPLES_ALL = [
19
+ File.join('examples','element','*.c'),
20
+ File.join('examples','chunk','*.c'),
21
+ File.join('examples','neighbourhood','*.c'),
22
+ File.join('examples','shared','*.c'),
23
+ File.join('examples','dependences','*.c'),
24
+ File.join('examples','benchmarks','PolyBench','*.c')
25
+ ]
26
+ # Select PolyBench as the set of examples
27
+ ADARWIN_EXAMPLES = ADARWIN_EXAMPLES_ALL[5]
28
+ ADARWIN_MEMORY = false unless defined?(ADARWIN_MEMORY)
29
+ ADARWIN_OPTIONS = ADARWIN_MEMORY ? '-r -f -b -l' : '--no-memory-annotations' unless defined?(ADARWIN_OPTIONS)
10
30
 
11
31
  # Pick a target from a list of possible targets
12
32
  # 0 1 2 3 4 5
@@ -15,7 +35,9 @@ TARGET = TARGETS[0]
15
35
 
16
36
  # Settings for Bones
17
37
  MEASUREMENTS = true
18
- VERIFICATION = true
38
+ VERIFICATION = false
39
+ MEMORY_OPTIMISATIONS = true
40
+ ADARWIN_OPTIONS = MEMORY_OPTIMISATIONS ? '-r -f -b -l' : ''
19
41
 
20
42
  # Small helper function to display text on screen
21
43
  def display(text)
@@ -34,13 +56,28 @@ namespace :examples do
34
56
  task :verify, [:file] => [:generate, :compile, :execute] do |t, args|
35
57
  end
36
58
 
59
+ # Task to process and test everything through A-Darwin and Bones (generating code, compiling code, executing)
60
+ desc 'Run the examples through A-Darwin and Bones, then compile and execute them'
61
+ task :go, :file do |t, args|
62
+ bones_options = (MEASUREMENTS ? '-m ' : '') + (VERIFICATION ? '-c ' : '')
63
+ args.with_defaults(:file => EXAMPLES)
64
+ Dir[args.file].sort.each do |file|
65
+ sh "bin/adarwin -a #{file} #{ADARWIN_OPTIONS}"
66
+ split = file.split('.')
67
+ file = split[0]+'_species'+'.'+split[1]
68
+ sh "bin/bones -a #{file} -t #{TARGET} #{bones_options}"
69
+ compile(file,TARGET)
70
+ execute(file,TARGET)
71
+ end
72
+ end
73
+
37
74
  # Task to pass examples through Bones
38
75
  desc 'Generate target code using Bones'
39
76
  task :generate, :file do |t, args|
77
+ options = (MEASUREMENTS ? '-m ' : '') + (VERIFICATION ? '-c ' : '')
40
78
  args.with_defaults(:file => EXAMPLES)
41
79
  Dir[args.file].sort.each do |file|
42
80
  display('Generating')
43
- options = (MEASUREMENTS ? '-m ' : '') + (VERIFICATION ? '-c ' : '')
44
81
  sh "bin/bones -a #{file} -t #{TARGET} #{options}"
45
82
  end
46
83
  end
@@ -89,6 +126,44 @@ task :add_target, :name, :base do |t, args|
89
126
  end
90
127
  end
91
128
 
129
+ # Generate species descriptions using A-Darwin
130
+ desc 'Extract species descriptions using A-Darwin'
131
+ task :adarwin, :file do |t, args|
132
+ args.with_defaults(:file => ADARWIN_EXAMPLES)
133
+ Dir[args.file].sort.each do |file|
134
+ adarwin(file,ADARWIN_OPTIONS)
135
+ end
136
+ end
137
+
138
+ # Test the species descriptions generated by A-Darwin against golden samples
139
+ desc 'Test A-Darwin`s output against golden samples'
140
+ task :adarwin_test do |t|
141
+ pass = 0
142
+ fail = 0
143
+ ADARWIN_EXAMPLES_ALL.each do |examples|
144
+ Dir[examples].sort.each do |file|
145
+ if !(file =~ /_species\.c/)
146
+ adarwin(file,'--no-memory-annotations')
147
+ display('Testing correctness')
148
+ speciesfile = file.gsub('.c','_species.c')
149
+ sh "diff #{speciesfile} test/#{speciesfile}" do |ok,status|
150
+ ok ? pass += 1 : fail += 1
151
+ end
152
+ end
153
+ end
154
+ end
155
+ display('Test results')
156
+ puts "PASS: #{pass}, FAIL: #{fail}"
157
+ end
158
+
159
+ # Method to run A-Darwin on a single file (files that are already '_species.c' output are skipped)
160
+ def adarwin(file,options)
161
+ if !(file =~ /_species\.c/)
162
+ display('Extracting species')
163
+ sh "bin/adarwin --application #{file} --silent #{options}"
164
+ end
165
+ end
166
+
92
167
  # Test individual parts of the code
93
168
  Rake::TestTask.new do |test|
94
169
  test.test_files = FileList[File.join('test','*','test_*.rb')]
data/VERSION CHANGED
@@ -1 +1 @@
1
- 1.1.0
1
+ 1.3.1
data/bin/adarwin ADDED
@@ -0,0 +1,17 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # Set the path for the libraries and the installation directory
4
+ ADARWIN_DIR = File.dirname(__FILE__) + '/../'
5
+ lib_dir = File.join(ADARWIN_DIR,'lib')
6
+ $LOAD_PATH.unshift lib_dir unless $LOAD_PATH.include?(lib_dir)
7
+
8
+ # Load the libraries
9
+ require 'castaddon.rb'
10
+ require 'adarwin.rb'
11
+
12
+ # Run the main function
13
+ adarwin = Adarwin::Engine.new
14
+ adarwin.process
15
+ adarwin.write_output
16
+
17
+
@@ -0,0 +1,104 @@
1
+ //
2
+ // This file is part of the Bones source-to-source compiler examples. The C-code
3
+ // is largely identical in terms of functionality and variable naming to the code
4
+ // found in PolyBench/C version 3.2. For more information on PolyBench/C or Bones
5
+ // please use the contact information below.
6
+ //
7
+ // == More information on PolyBench/C
8
+ // Contact............Louis-Noel Pouchet <pouchet@cse.ohio-state.edu>
9
+ // Web address........http://polybench.sourceforge.net/
10
+ //
11
+ // == More information on Bones
12
+ // Contact............Cedric Nugteren <c.nugteren@tue.nl>
13
+ // Web address........http://parse.ele.tue.nl/bones/
14
+ //
15
+ // == File information
16
+ // Filename...........benchmark/2mm.c
17
+ // Author.............Cedric Nugteren
18
+ // Last modified on...05-July-2013
19
+ //
20
+
21
+ #include "common.h"
22
+
23
+ // This is '2mm', a 2 matrix multiply kernel
24
+ int main(void) {
25
+ int i,j,k;
26
+
27
+ // Declare arrays on the stack
28
+ float A[NI][NK];
29
+ float B[NK][NJ];
30
+ float C[NJ][NL];
31
+ float D[NI][NL];
32
+ float tmp[NI][NJ];
33
+
34
+ // Set the constants
35
+ int alpha = 32412;
36
+ int beta = 2123;
37
+
38
+ // Set the input data
39
+ for (i=0; i<NI; i++) { for (j=0; j<NK; j++) { A[i][j] = ((float) i*j) / NI; } }
40
+ for (i=0; i<NK; i++) { for (j=0; j<NJ; j++) { B[i][j] = ((float) i*(j+1)) / NJ; } }
41
+ for (i=0; i<NL; i++) { for (j=0; j<NJ; j++) { C[i][j] = ((float) i*(j+3)) / NL; } }
42
+ for (i=0; i<NI; i++) { for (j=0; j<NL; j++) { D[i][j] = ((float) i*(j+2)) / NK; } }
43
+
44
+ // Perform the computation (D := alpha*A*B*C + beta*D)
45
+ #pragma scop
46
+ #pragma species copyin A[0:NI-1,0:NK-1]|0 ^ B[0:NK-1,0:NJ-1]|0 ^ D[0:NI-1,0:NL-1]|1 ^ C[0:NJ-1,0:NL-1]|1
47
+ #pragma species sync 0
48
+ #pragma species kernel A[0:NI-1,0:NK-1]|chunk(0:0,0:NK-1) ^ B[0:NK-1,0:NJ-1]|chunk(0:NK-1,0:0) -> tmp[0:NI-1,0:NJ-1]|element
49
+ for (i=0; i<NI; i++) {
50
+ for (j=0; j<NJ; j++) {
51
+ tmp[i][j] = 0;
52
+ for (k=0; k<NK; k++) {
53
+ tmp[i][j] += alpha * A[i][k] * B[k][j];
54
+ }
55
+ }
56
+ }
57
+ #pragma species endkernel 2mm-part1
58
+ #pragma species copyout tmp[0:NI-1,0:NJ-1]|2
59
+ #pragma species sync 1
60
+ #pragma species kernel D[0:NI-1,0:NL-1]|element ^ tmp[0:NI-1,0:NJ-1]|chunk(0:0,0:NJ-1) ^ C[0:NJ-1,0:NL-1]|chunk(0:NJ-1,0:0) -> D[0:NI-1,0:NL-1]|element
61
+ for (i=0; i<NI; i++) {
62
+ for (j=0; j<NL; j++) {
63
+ D[i][j] *= beta;
64
+ for (k=0; k<NJ; k++) {
65
+ D[i][j] += tmp[i][k] * C[k][j];
66
+ }
67
+ }
68
+ }
69
+ #pragma species endkernel 2mm-part2
70
+ #pragma species copyout D[0:NI-1,0:NL-1]|2
71
+ #pragma species sync 2
72
+ #pragma endscop
73
+
74
+ /*
75
+ #pragma species copyin A[0:NI-1,0:NK-1]|0 ^ B[0:NK-1,0:NJ-1]|0 ^ D[0:NI-1,0:NL-1]|0 ^ C[0:NJ-1,0:NL-1]|0
76
+ #pragma species sync 0
77
+ #pragma species kernel A[0:NI-1,0:NK-1]|chunk(0:0,0:NK-1) ^ B[0:NK-1,0:NJ-1]|chunk(0:NK-1,0:0) -> tmp[0:NI-1,0:NJ-1]|element
78
+ #pragma species kernel D[0:NI-1,0:NL-1]|element ^ tmp[0:NI-1,0:NJ-1]|chunk(0:0,0:NJ-1) ^ C[0:NJ-1,0:NL-1]|chunk(0:NJ-1,0:0) -> D[0:NI-1,0:NL-1]|element
79
+ for (i=0; i<NI; i++) {
80
+ for (j=0; j<MAX(NJ,NL); j++) {
81
+ if (j < NJ) {
82
+ tmp[i][j] = 0;
83
+ for (k=0; k<NK; k++) {
84
+ tmp[i][j] += alpha * A[i][k] * B[k][j];
85
+ }
86
+ }
87
+ if (j < NL) {
88
+ D[i][j] *= beta;
89
+ for (k=0; k<NJ; k++) {
90
+ D[i][j] += tmp[i][k] * C[k][j];
91
+ }
92
+ }
93
+ }
94
+ }
95
+ #pragma species endkernel 2mm-fused
96
+ #pragma species copyout D[0:NI-1,0:NL-1]|2 tmp[0:NI-1,0:NJ-1]|2
97
+ #pragma species sync 2
98
+ */
99
+ // Clean-up and exit the function
100
+ fflush(stdout);
101
+ D[8][9] = D[8][9];
102
+ return 0;
103
+ }
104
+