bones-compiler 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (203) hide show
  1. data/CHANGELOG +117 -0
  2. data/LICENSE +9 -0
  3. data/README.rdoc +126 -0
  4. data/Rakefile +107 -0
  5. data/VERSION +1 -0
  6. data/bin/bones +20 -0
  7. data/examples/applications/ffos.c +552 -0
  8. data/examples/benchmarks/2mm.c +70 -0
  9. data/examples/benchmarks/3mm.c +81 -0
  10. data/examples/benchmarks/adi.c +81 -0
  11. data/examples/benchmarks/atax.c +65 -0
  12. data/examples/benchmarks/bicg.c +67 -0
  13. data/examples/benchmarks/cholesky.c +64 -0
  14. data/examples/benchmarks/common.h +168 -0
  15. data/examples/benchmarks/correlation.c +97 -0
  16. data/examples/benchmarks/covariance.c +77 -0
  17. data/examples/benchmarks/doitgen.c +63 -0
  18. data/examples/benchmarks/durbin.c +76 -0
  19. data/examples/benchmarks/dynprog.c +67 -0
  20. data/examples/benchmarks/fdtd-2d-apml.c +114 -0
  21. data/examples/benchmarks/fdtd-2d.c +74 -0
  22. data/examples/benchmarks/floyd-warshall.c +50 -0
  23. data/examples/benchmarks/gemm.c +69 -0
  24. data/examples/benchmarks/gemver.c +89 -0
  25. data/examples/benchmarks/gesummv.c +64 -0
  26. data/examples/benchmarks/gramschmidt.c +84 -0
  27. data/examples/benchmarks/jacobi-1d-imper.c +55 -0
  28. data/examples/benchmarks/jacobi-2d-imper.c +61 -0
  29. data/examples/benchmarks/lu.c +57 -0
  30. data/examples/benchmarks/ludcmp.c +91 -0
  31. data/examples/benchmarks/mvt.c +65 -0
  32. data/examples/benchmarks/overview.txt +38 -0
  33. data/examples/benchmarks/reg_detect.c +82 -0
  34. data/examples/benchmarks/saxpy.c +45 -0
  35. data/examples/benchmarks/seidel-2d.c +51 -0
  36. data/examples/benchmarks/symm.c +74 -0
  37. data/examples/benchmarks/syr2k.c +65 -0
  38. data/examples/benchmarks/syrk.c +62 -0
  39. data/examples/benchmarks/trisolv.c +57 -0
  40. data/examples/benchmarks/trmm.c +57 -0
  41. data/examples/chunk/example1.c +54 -0
  42. data/examples/chunk/example2.c +44 -0
  43. data/examples/chunk/example3.c +59 -0
  44. data/examples/chunk/example4.c +55 -0
  45. data/examples/chunk/example5.c +52 -0
  46. data/examples/element/example1.c +46 -0
  47. data/examples/element/example10.c +50 -0
  48. data/examples/element/example11.c +47 -0
  49. data/examples/element/example12.c +56 -0
  50. data/examples/element/example2.c +46 -0
  51. data/examples/element/example3.c +58 -0
  52. data/examples/element/example4.c +49 -0
  53. data/examples/element/example5.c +56 -0
  54. data/examples/element/example6.c +46 -0
  55. data/examples/element/example7.c +54 -0
  56. data/examples/element/example8.c +45 -0
  57. data/examples/element/example9.c +48 -0
  58. data/examples/neighbourhood/example1.c +54 -0
  59. data/examples/neighbourhood/example2.c +55 -0
  60. data/examples/neighbourhood/example3.c +82 -0
  61. data/examples/neighbourhood/example4.c +52 -0
  62. data/examples/shared/example1.c +45 -0
  63. data/examples/shared/example2.c +51 -0
  64. data/examples/shared/example3.c +55 -0
  65. data/examples/shared/example4.c +52 -0
  66. data/examples/shared/example5.c +48 -0
  67. data/lib/bones.rb +266 -0
  68. data/lib/bones/algorithm.rb +541 -0
  69. data/lib/bones/engine.rb +386 -0
  70. data/lib/bones/preprocessor.rb +161 -0
  71. data/lib/bones/species.rb +196 -0
  72. data/lib/bones/structure.rb +94 -0
  73. data/lib/bones/variable.rb +169 -0
  74. data/lib/bones/variablelist.rb +72 -0
  75. data/lib/castaddon.rb +27 -0
  76. data/lib/castaddon/index.rb +40 -0
  77. data/lib/castaddon/node.rb +753 -0
  78. data/lib/castaddon/type.rb +37 -0
  79. data/skeletons/CPU-C/common/epilogue.c +0 -0
  80. data/skeletons/CPU-C/common/globals.c +17 -0
  81. data/skeletons/CPU-C/common/globals_kernel.c +1 -0
  82. data/skeletons/CPU-C/common/header.c +0 -0
  83. data/skeletons/CPU-C/common/mem_copy_D2H.c +0 -0
  84. data/skeletons/CPU-C/common/mem_copy_H2D.c +0 -0
  85. data/skeletons/CPU-C/common/mem_epilogue.c +0 -0
  86. data/skeletons/CPU-C/common/mem_prologue.c +3 -0
  87. data/skeletons/CPU-C/common/prologue.c +0 -0
  88. data/skeletons/CPU-C/common/timer_1_start.c +0 -0
  89. data/skeletons/CPU-C/common/timer_1_stop.c +0 -0
  90. data/skeletons/CPU-C/common/timer_2_start.c +20 -0
  91. data/skeletons/CPU-C/common/timer_2_stop.c +8 -0
  92. data/skeletons/CPU-C/kernel/default.host.c +3 -0
  93. data/skeletons/CPU-C/kernel/default.kernel.c +15 -0
  94. data/skeletons/CPU-C/skeletons.txt +24 -0
  95. data/skeletons/CPU-OPENCL-AMD/common/epilogue.c +6 -0
  96. data/skeletons/CPU-OPENCL-AMD/common/globals.c +155 -0
  97. data/skeletons/CPU-OPENCL-AMD/common/globals_kernel.c +4 -0
  98. data/skeletons/CPU-OPENCL-AMD/common/header.c +0 -0
  99. data/skeletons/CPU-OPENCL-AMD/common/mem_copy_D2H.c +8 -0
  100. data/skeletons/CPU-OPENCL-AMD/common/mem_copy_H2D.c +4 -0
  101. data/skeletons/CPU-OPENCL-AMD/common/mem_epilogue.c +3 -0
  102. data/skeletons/CPU-OPENCL-AMD/common/mem_prologue.c +6 -0
  103. data/skeletons/CPU-OPENCL-AMD/common/prologue.c +24 -0
  104. data/skeletons/CPU-OPENCL-AMD/common/timer_1_start.c +5 -0
  105. data/skeletons/CPU-OPENCL-AMD/common/timer_1_stop.c +9 -0
  106. data/skeletons/CPU-OPENCL-AMD/common/timer_2_start.c +16 -0
  107. data/skeletons/CPU-OPENCL-AMD/common/timer_2_stop.c +11 -0
  108. data/skeletons/CPU-OPENCL-AMD/kernel/D-element-to-1-shared.host.c +67 -0
  109. data/skeletons/CPU-OPENCL-AMD/kernel/D-element-to-1-shared.kernel.cl +72 -0
  110. data/skeletons/CPU-OPENCL-AMD/kernel/default.host.c +14 -0
  111. data/skeletons/CPU-OPENCL-AMD/kernel/default.kernel.cl +13 -0
  112. data/skeletons/CPU-OPENCL-AMD/skeletons.txt +26 -0
  113. data/skeletons/CPU-OPENCL-INTEL/common/epilogue.c +3 -0
  114. data/skeletons/CPU-OPENCL-INTEL/common/globals.c +154 -0
  115. data/skeletons/CPU-OPENCL-INTEL/common/globals_kernel.c +4 -0
  116. data/skeletons/CPU-OPENCL-INTEL/common/header.c +31 -0
  117. data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_D2H.c +5 -0
  118. data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_H2D.c +3 -0
  119. data/skeletons/CPU-OPENCL-INTEL/common/mem_epilogue.c +3 -0
  120. data/skeletons/CPU-OPENCL-INTEL/common/mem_prologue.c +4 -0
  121. data/skeletons/CPU-OPENCL-INTEL/common/prologue.c +24 -0
  122. data/skeletons/CPU-OPENCL-INTEL/common/timer_1_start.c +5 -0
  123. data/skeletons/CPU-OPENCL-INTEL/common/timer_1_stop.c +9 -0
  124. data/skeletons/CPU-OPENCL-INTEL/common/timer_2_start.c +16 -0
  125. data/skeletons/CPU-OPENCL-INTEL/common/timer_2_stop.c +11 -0
  126. data/skeletons/CPU-OPENCL-INTEL/kernel/D-element-to-1-shared.host.c +67 -0
  127. data/skeletons/CPU-OPENCL-INTEL/kernel/D-element-to-1-shared.kernel.cl +72 -0
  128. data/skeletons/CPU-OPENCL-INTEL/kernel/default.host.c +14 -0
  129. data/skeletons/CPU-OPENCL-INTEL/kernel/default.kernel.cl +13 -0
  130. data/skeletons/CPU-OPENCL-INTEL/skeletons.txt +26 -0
  131. data/skeletons/CPU-OPENMP/common/epilogue.c +0 -0
  132. data/skeletons/CPU-OPENMP/common/globals.c +37 -0
  133. data/skeletons/CPU-OPENMP/common/globals_kernel.c +6 -0
  134. data/skeletons/CPU-OPENMP/common/header.c +0 -0
  135. data/skeletons/CPU-OPENMP/common/mem_copy_D2H.c +0 -0
  136. data/skeletons/CPU-OPENMP/common/mem_copy_H2D.c +0 -0
  137. data/skeletons/CPU-OPENMP/common/mem_epilogue.c +0 -0
  138. data/skeletons/CPU-OPENMP/common/mem_prologue.c +3 -0
  139. data/skeletons/CPU-OPENMP/common/prologue.c +0 -0
  140. data/skeletons/CPU-OPENMP/common/timer_1_start.c +12 -0
  141. data/skeletons/CPU-OPENMP/common/timer_1_stop.c +0 -0
  142. data/skeletons/CPU-OPENMP/common/timer_2_start.c +18 -0
  143. data/skeletons/CPU-OPENMP/common/timer_2_stop.c +8 -0
  144. data/skeletons/CPU-OPENMP/kernel/D-element-to-1-shared.host.c +27 -0
  145. data/skeletons/CPU-OPENMP/kernel/D-element-to-1-shared.kernel.c +46 -0
  146. data/skeletons/CPU-OPENMP/kernel/default.host.c +11 -0
  147. data/skeletons/CPU-OPENMP/kernel/default.kernel.c +18 -0
  148. data/skeletons/CPU-OPENMP/skeletons.txt +26 -0
  149. data/skeletons/GPU-CUDA/common/epilogue.c +0 -0
  150. data/skeletons/GPU-CUDA/common/globals.c +31 -0
  151. data/skeletons/GPU-CUDA/common/globals_kernel.c +4 -0
  152. data/skeletons/GPU-CUDA/common/header.c +0 -0
  153. data/skeletons/GPU-CUDA/common/mem_copy_D2H.c +3 -0
  154. data/skeletons/GPU-CUDA/common/mem_copy_H2D.c +3 -0
  155. data/skeletons/GPU-CUDA/common/mem_epilogue.c +3 -0
  156. data/skeletons/GPU-CUDA/common/mem_prologue.c +5 -0
  157. data/skeletons/GPU-CUDA/common/prologue.c +6 -0
  158. data/skeletons/GPU-CUDA/common/timer_1_start.c +6 -0
  159. data/skeletons/GPU-CUDA/common/timer_1_stop.c +10 -0
  160. data/skeletons/GPU-CUDA/common/timer_2_start.c +6 -0
  161. data/skeletons/GPU-CUDA/common/timer_2_stop.c +10 -0
  162. data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.host.c +3 -0
  163. data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.kernel.cu +105 -0
  164. data/skeletons/GPU-CUDA/kernel/D-element-to-1-shared.host.c +3 -0
  165. data/skeletons/GPU-CUDA/kernel/D-element-to-1-shared.kernel.cu +119 -0
  166. data/skeletons/GPU-CUDA/kernel/D-element-to-N-shared.host.c +3 -0
  167. data/skeletons/GPU-CUDA/kernel/D-element-to-N-shared.kernel.cu +166 -0
  168. data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.host.c +3 -0
  169. data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.kernel.cu +69 -0
  170. data/skeletons/GPU-CUDA/kernel/N-neighbourhood-N-to-N-element.host.c +3 -0
  171. data/skeletons/GPU-CUDA/kernel/N-neighbourhood-N-to-N-element.kernel.cu +42 -0
  172. data/skeletons/GPU-CUDA/kernel/default.host.c +3 -0
  173. data/skeletons/GPU-CUDA/kernel/default.kernel.cu +28 -0
  174. data/skeletons/GPU-CUDA/skeletons.txt +30 -0
  175. data/skeletons/GPU-OPENCL-AMD/common/epilogue.c +3 -0
  176. data/skeletons/GPU-OPENCL-AMD/common/globals.c +155 -0
  177. data/skeletons/GPU-OPENCL-AMD/common/globals_kernel.c +4 -0
  178. data/skeletons/GPU-OPENCL-AMD/common/header.c +0 -0
  179. data/skeletons/GPU-OPENCL-AMD/common/mem_copy_D2H.c +4 -0
  180. data/skeletons/GPU-OPENCL-AMD/common/mem_copy_H2D.c +4 -0
  181. data/skeletons/GPU-OPENCL-AMD/common/mem_epilogue.c +3 -0
  182. data/skeletons/GPU-OPENCL-AMD/common/mem_prologue.c +3 -0
  183. data/skeletons/GPU-OPENCL-AMD/common/prologue.c +24 -0
  184. data/skeletons/GPU-OPENCL-AMD/common/timer_1_start.c +5 -0
  185. data/skeletons/GPU-OPENCL-AMD/common/timer_1_stop.c +9 -0
  186. data/skeletons/GPU-OPENCL-AMD/common/timer_2_start.c +4 -0
  187. data/skeletons/GPU-OPENCL-AMD/common/timer_2_stop.c +11 -0
  188. data/skeletons/GPU-OPENCL-AMD/kernel/D-element-to-1-shared.host.c +67 -0
  189. data/skeletons/GPU-OPENCL-AMD/kernel/D-element-to-1-shared.kernel.cl +72 -0
  190. data/skeletons/GPU-OPENCL-AMD/kernel/default.host.c +14 -0
  191. data/skeletons/GPU-OPENCL-AMD/kernel/default.kernel.cl +13 -0
  192. data/skeletons/GPU-OPENCL-AMD/skeletons.txt +26 -0
  193. data/skeletons/verification/header.c +2 -0
  194. data/skeletons/verification/timer_start.c +4 -0
  195. data/skeletons/verification/timer_stop.c +6 -0
  196. data/skeletons/verification/verify_results.c +23 -0
  197. data/test/bones/test_algorithm.rb +40 -0
  198. data/test/bones/test_common.rb +54 -0
  199. data/test/bones/test_preprocessor.rb +46 -0
  200. data/test/bones/test_species.rb +21 -0
  201. data/test/bones/test_variable.rb +84 -0
  202. data/test/test_helper.rb +106 -0
  203. metadata +303 -0
data/CHANGELOG ADDED
@@ -0,0 +1,117 @@
1
+ ###################
2
+ ### v1.1 ###
3
+ ###################
4
+
5
+ General:
6
+ - Added support for a compiler optimisation pass: array substitution with a local register copy in the case of chunk to element species.
7
+ - Added support for a compiler optimisation pass: thread-merging, potentially improving re-use through locality at the cost of parallelism.
8
+ - Updated and added examples.
9
+ - Tuned the skeletons.
10
+
11
+ Skeletons performance/readability/usability tuning:
12
+ - Added a special GPU-CUDA 'chunk(1,N)' skeleton with pre-shuffling (e.g. atax, bicg, mvt, syrk). Added support for a corresponding transformation in Bones.
13
+ - Added a special GPU-CUDA 2x'chunk(1,N)' skeleton with pre-shuffling if there are two chunk inputs (e.g. gesummv,syr2k).
14
+ - Updated the various OpenCL skeletons to create only once the 'context', 'command queue' and 'program'. This saves significant time for programs executing a single kernel multiple times or multiple kernels subsequently.
15
+ - Tune GPU-CUDA skeletons to prefer L1 cache above scratchpad memory.
16
+
17
+ Examples:
18
+ - Updated several benchmarks with additional species.
19
+ - Added the 'saxpy' benchmark.
20
+ - Added 'example12' to demonstrate the use of species in a separate function.
21
+
22
+ Bug fixes:
23
+ - Removed the generation of global code in case no species are detected.
24
+
25
+ Miscellaneous:
26
+ - Renamed the original main function and created a new 'main' in 'common/globals.c' with initialisation and clean-up. The use of '#pragma species initialize' is now deprecated, but a main function is required in the original code.
27
+ - Added the identification of a species' function, making variable definition detection local.
28
+ - Improved the compiler run-time for algorithms for which a skeleton is unavailable.
29
+ - Added a command-line option to be able to generate code for a single species only (-only_alg_number).
30
+ - Added a command-line option to set a fixed thread-merge factor (-merge_factor).
31
+ - Added a gemspec file.
32
+ - Updated the documentation and readme.
33
+
34
+ ###################
35
+ ### v1.0 ###
36
+ ###################
37
+
38
+ General:
39
+ - Added a benchmark set to the examples directory based on the PolyBench/C benchmark set.
40
+ - Performed major code refactoring, improving maintainability and performance of Bones.
41
+ - Updated all examples to work with the upcoming 'automatic species extraction tool' (ASET).
42
+
43
+ Skeletons performance/readability/usability tuning:
44
+ - Updated the 'GPU-CUDA' reduction skeleton to support initial values, loops not starting at zero or not a power of 2, and more (see the new 'shared/example5.c').
45
+ - Improved the way skeletons are matched with species. Introduced a 'default' skeleton and modified others to accommodate these changes.
46
+
47
+ Improved code/species support:
48
+ - Added atomic support for OpenCL based targets
49
+ - Added support for loop iterator variables inside the 'algorithmic species' with selective memory copy for the 'GPU-CUDA' target, see 'element/example11.c'.
50
+
51
+ Bug fixes:
52
+ - Fixed a compatibility problem with CAST version 0.2.0.
53
+
54
+ Miscellaneous:
55
+ - Added proper error messaging to catch various exceptions
56
+ - Reversed the order of loop-flattening to obtain coalesced memory accesses (in particular for the 'GPU-CUDA' target).
57
+ - Renamed 'tile' into 'chunk', since 'tile' might imply using a 2D chunk of data.
58
+ - Changed '#pragma bones' into '#pragma species' to match the algorithmic species naming.
59
+ - Added a warning message for negative or zero range dimensions.
60
+ - Organized the search-and-replace parameters found in skeletons, renamed most of them and added a few new.
61
+ - Added headers to the code examples and cleaned them up.
62
+ - Added a warning message if an 'endkernel' pragma is missing.
63
+ - Improved the way error messages are thrown.
64
+
65
+ ###################
66
+ ### v0.9 (Beta) ###
67
+ ###################
68
+
69
+ General:
70
+ - Implemented the new 'algorithmic species' using ranges, thus creating support for for-loops with affine functions as loop-bounds.
71
+ - Updated the examples
72
+
73
+ Supported targets:
74
+ - Changed the names of the targets 'GPU-OPENCL' and 'CPU-OPENCL' into 'GPU-OPENCL-AMD' and 'CPU-OPENCL-INTEL' respectively.
75
+ - Added a new target 'CPU-C' which implements a C-to-C pass-through.
76
+ - Added a new target 'CPU-OPENMP', which uses OpenMP to create 4 CPU threads.
77
+ - Added a new target 'CPU-OPENCL-AMD'. This target is similar to 'CPU-OPENCL-INTEL', but targets the AMD APP.
78
+
79
+ Skeletons performance/readability/usability tuning:
80
+ - Added a basic prefetching technique in the local memory for the neighbourhood skeleton for the 'GPU-CUDA' target.
81
+ - Removed the first entry in the transformation settings, which was used in previous versions to set the dimensions (now automatically detected).
82
+ - Changed the 'GPU-CUDA' skeletons such that host files can be compiled with a C99 compiler.
83
+ - Tuned 'CPU-OPENCL-INTEL' performance for Intel's OpenCL SDK.
84
+ - Created aligned memory allocation functions to enable zero-copy possibilities for the 'CPU-OPENCL-INTEL' target.
85
+ - Completed the addition of 'bones_' for every variable in the skeletons.
86
+
87
+ Verification code:
88
+ - Create a verification function specific to each output.
89
+ - Moved the verification code (including the original code) to a separate file, which is only generated if '-c' is provided as a flag to Bones.
90
+
91
+ Bug fixes:
92
+ - Fixed a bug which would create function names starting with a digit.
93
+ - Adjusted the use of directory structures in the code for Windows-compatibility.
94
+ - Fixed a bug where variables for verification would have duplicate names.
95
+ - Fixed a bug where verification code would not compile for 'unsigned int' types.
96
+ - Fixed a memory leak which would occur when verification is enabled.
97
+ - Fixed a bug where statements of the form 'a[i]++' would not be recognized as input nor as output. They are now rewritten as 'a[i]=a[i]+1'.
98
+
99
+ Miscellaneous:
100
+ - Added support for selective copying-out (based on array access ranges)
101
+ - Added support for defines found in header functions (pre-processor now also pre-processes the header files)
102
+ - Added the possibility to specify the order of inputs/outputs in the classification by giving their names (if not given, the default ordering is assumed).
103
+ - Writing to a specific location in an array followed by a read no longer considers the array as input and output, it is now output only.
104
+ - Added a check to see if for-loops start and end as expected (as provided by the ranges given through the 'algorithmic species').
105
+ - Create a 'simplify' function, which simplifies math expressions to a certain extent. A test is included to give a few examples of what it can do.
106
+ - Clean-up of the Rakefile, addition of stub tasks to compile and execute example code, and the addition of an 'add new target' task.
107
+ - Changed the code such that the core components of Bones (the 'lib' folder) do not have to be adjusted to add a new target.
108
+ - Added performance measurement for original code in case verification is enabled.
109
+ - Renamed 'Tribe' into 'Species' and 'Primitive' into 'Algorithm'.
110
+
111
+ ###################
112
+ ### v0.8 (Beta) ###
113
+ ###################
114
+
115
+ Initial release.
116
+
117
+ ###################
data/LICENSE ADDED
@@ -0,0 +1,9 @@
1
+ Copyright (c) 2012 Cedric Nugteren, Eindhoven University of Technology, The Netherlands
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
+
5
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
+
7
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
8
+
9
+
data/README.rdoc ADDED
@@ -0,0 +1,126 @@
1
+ = Bones
2
+ Recent advances in multi-core and many-core processors require programmers to exploit an increasing amount of parallelism from their applications. Data parallel languages such as CUDA and OpenCL make it possible to take advantage of such processors, but still require a large amount of effort from programmers. To address the challenge of parallel programming, we introduce Bones.
3
+
4
+ Bones is a source-to-source compiler based on algorithmic skeletons and a new algorithm classification (named 'algorithmic species'). The compiler takes C-code annotated with class information as input and generates parallelized target code. Targets include NVIDIA GPUs (through CUDA), AMD GPUs (through OpenCL) and CPUs (through OpenCL and OpenMP). Bones is open-source, written in the Ruby programming language, and is available through our website. The compiler is based on the C-parser CAST, which is used to parse the input code into an abstract syntax tree (AST) and to generate the target code from a transformed AST.
5
+
6
+ == Usage
7
+ The usage is as follows:
8
+ bones --application <input> --target <target> [OPTIONS]
9
+ With the following flags:
10
+ --application, -a <s>: Input application file
11
+ --target, -t <s>: Target processor (choose from: GPU-CUDA, GPU-OPENCL-AMD,
12
+ CPU-OPENCL-INTEL, CPU-OPENCL-AMD, CPU-OPENMP, CPU-C)
13
+ --measurements, -m: Enable/disable timers
14
+ --verify, -c: Verify correctness of the generated code
15
+ --version, -v: Print version and exit
16
+ --help, -h: Show this message
17
+
18
+ Bones can be invoked from the command-line. Two arguments (-a and -t) are mandatory, others are optional. This is an example of the usage of Bones assuming the file '+example.c+' to be present:
19
+ bones -a example.c -t GPU-CUDA -c
20
+
21
+ == Examples
22
+ The best place to start experimenting with Bones is the '+examples+' directory. A large number of examples are available in this folder, grouped by algorithmic species (either element, neighbourhood, shared or chunk). The examples illustrate different kinds of coding styles and give a large number of different classes to work with. The folder '+benchmarks+' gives more examples, taken from the PolyBench/C benchmark set. Additionally, a folder '+applications+' is included, containing example complete applications. Currently, the following examples are available:
23
+ |-- element |-- applications
24
+ | |-- example1.c | \-- ffos.c
25
+ | |-- example2.c \-- benchmarks
26
+ | |-- example3.c |-- 2mm.c
27
+ | |-- example4.c |-- 3mm.c
28
+ | |-- example5.c |-- adi.c
29
+ | |-- example6.c |-- atax.c
30
+ | |-- example7.c |-- bicg.c
31
+ | |-- example8.c |-- cholesky.c
32
+ | |-- example9.c |-- correlation.c
33
+ | |-- example10.c |-- covariance.c
34
+ | |-- example11.c |-- doitgen.c
35
+ | \-- example12.c |-- durbin.c
36
+ |-- neighbourhood |-- dynprog.c
37
+ | |-- example1.c |-- fdtd-2d-apml.c
38
+ | |-- example2.c |-- fdtd-2d.c
39
+ | |-- example3.c |-- floyd-warshall.c
40
+ | \-- example4.c |-- gemm.c
41
+ |-- shared |-- gemver.c
42
+ | |-- example1.c |-- gesummv.c
43
+ | |-- example2.c |-- jacobi-1d-imper.c
44
+ | |-- example3.c |-- jacobi-2d-imper.c
45
+ | |-- example4.c |-- lu.c
46
+ | \-- example5.c |-- ludcmp.c
47
+ |-- chunk |-- mvt.c
48
+ | |-- example1.c |-- reg_detect.c
49
+ | |-- example2.c |-- saxpy.c
50
+ | |-- example3.c |-- seidel-2d.c
51
+ | |-- example4.c |-- syr2k.c
52
+ | \-- example5.c |-- syrk.c
53
+ |-- trisolv.c
54
+ \-- trmm.c
55
+
56
+ All examples can be run through Bones for a specific target using an automated Rake task. Executing '<tt>rake examples:generate</tt>' or simply '+rake+' will execute Bones for all examples for a given target. The target can be changed in the '+Rakefile+' found in the root directory of Bones.
57
+
58
+ == Limitations
59
+ Bones takes C99 source code as input. However, several coding styles are unsupported as of now or might yield worse performance compared to others. The numerous examples provided should give the user an idea of the possibilities and limitations of the tool. A complete list of coding guidelines and limitations will follow in the future. Currently, an initial list of major limitations and guidelines is given below. In this list, we use '+algorithm+' to denote an algorithm captured by an algorithmic species.
60
+ * If the algorithm works on a N-dimensional data structure, use N-dimensional arrays (don't flatten it yourself, e.g. use '<tt>example[i][j]</tt>' instead of '<tt>example[i+j*A]</tt>') and specify an N-dimensional algorithmic species.
61
+ * Write your while-loops as for-loops if possible. For-loops should have a unit increment, other loops (e.g. decrementing loops) must be re-written.
62
+ * Loops can have affine bounds containing constants, defines and variables. Variables should not include loop variables of loops that are part of the '+algorithm+'.
63
+ * Function calls are not allowed within the '+algorithm+'. Some mathematical functions are allowed.
64
+ * Variables are allowed in the definition of an algorithmic species. If they are used, they should also be used somewhere in the body of the '+algorithm+'.
65
+ * Bones is designed to work on a single input file with at least a function called 'main'. If your (to-be-accelerated) code spans multiple C-files, Bones could either be applied multiple times, or the code could be merged into a single file.
66
+
67
+
68
+
69
+ = Installation procedure
70
+ Installation of Bones is a simple matter of extracting the Bones package to a directory of your choice or installing the gem ('<tt>gem install bones-compiler</tt>'). However, there are a number of prerequisites.
71
+
72
+ == Prerequisites
73
+ Bones requires the installation of Ruby, the Rubygems gem package manager and two gems:
74
+ 1. Any version of *Ruby* *1.8* or *1.9*. Information on Ruby is found at http://www.ruby-lang.org
75
+ * [OS X]: Ruby is pre-installed on any OS X system since Tiger (10.4).
76
+ * [Linux]: Ruby is pre-installed on some Linux based systems. Most Linux package managers (yum, apt-get) will be able to provide a Ruby installation. Make sure that the ruby development package ('+ruby-devel+') is also installed, as it is required by one of the gems.
77
+ * [Windows]: Ruby for Windows can be obtained from http://rubyinstaller.org/
78
+ 2. The *Rubygems* gem package manager. Information on Rubygems can be found at http://rubygems.org
79
+ * [OS X]: Rubygems is pre-installed on any OS X system since Tiger (10.4).
80
+ * [Linux]: Most Linux package managers will be able to provide a Rubygems installation by installing the package '+rubygems+'.
81
+ * [Windows]: Rubygems for Windows is obtained automatically when installing from http://rubyinstaller.org/
82
+ 3. Bones requires two gems, *trollop* and *cast*. Both gems can be installed by calling Rubygems from the command line, i.e.: '<tt>gem install trollop cast</tt>'.
83
+
84
+ For example, all prerequisites can be installed as follows on a Fedora, Red-Hat or CentOS system:
85
+ yum install ruby ruby-devel rubygems
86
+ gem install trollop cast
87
+ For an Ubuntu, Debian or Mint system, the equivalent commands are:
88
+ apt-get install ruby ruby-devel rubygems
89
+ gem install trollop cast
90
+
91
+ == Installing Bones
92
+ To install the compiler, simply extract the '<tt>bones\_x.x.tar.gz</tt>' package to a directory of your choice. The Bones executable is found in the '+bin+' subdirectory. Adding the path to the '+bin+' directory to your '+PATH+' environment variable will make Bones available from any directory on your machine. Starting at version 1.1, Bones is also available as a gem ('<tt>gem install bones-compiler</tt>').
93
+
94
+
95
+ = Documentation
96
+ There are two ways to obtain more information regarding Bones. To obtain more information about the compiler itself, the ideas behind it and the algorithm classification, it is a good idea to read the scientific publications. To get more information about the code structure, HTML documentation can be generated automatically using RDoc.
97
+
98
+ == Code documentation
99
+ Code documentation can be generated automatically using RDoc. Navigate to the installation root of Bones and use Rake to generate documentation: '<tt>rake rdoc</tt>'. More information on using Rake is provided later in this document. Next, open '<tt>rdoc/index.html</tt>' to navigate through the documentation. The same documentation is also available on the web at http://parse.ele.tue.nl/tools/bones/rdoc/.
100
+
101
+ == Scientific publications
102
+ Scientific publications related to Bones can be obtained from http://parse.ele.tue.nl/publications. Two publications are relevant:
103
+ 1. <b>A Modular and Parameterisable Classification of Algorithms</b>, which provides details on the used algorithm classification. When referring to the algorithm classification in scientific work, you are kindly asked to include the following citation:
104
+
105
+ @TECHREPORT{Nugteren2011,
106
+ author = {Cedric Nugteren and Henk Corporaal},
107
+ title = {{A Modular and Parameterisable Classification of Algorithms}},
108
+ institution = {Eindhoven University of Technology},
109
+ year = {2011},
110
+ number = {No. ESR-2011-02},
111
+ }
112
+ 2. <b>Introducing 'Bones': A Parallelizing Source-to-Source Compiler Based on Algorithmic Skeletons</b>, which introduces the Bones source-to-source compiler. When referring to Bones in scientific work, you are kindly asked to include the following citation:
113
+
114
+ @INPROCEEDINGS{Nugteren2012,
115
+ author = {Cedric Nugteren and Henk Corporaal},
116
+ title = {{Introducing `Bones': A Parallelizing Source-to-Source Compiler
117
+ Based on Algorithmic Skeletons}},
118
+ booktitle = {{GPGPU-5: 5th Workshop on General Purpose Processing on
119
+ Graphics Processing Units}},
120
+ year = {2012},
121
+ }
122
+
123
+
124
+
125
+ = Questions
126
+ Questions can be directed by email. You can find contact details on the personal page of the author at http://parse.ele.tue.nl/cnugteren or on the project page at github.
data/Rakefile ADDED
@@ -0,0 +1,107 @@
1
+ require 'rake/testtask'
2
+ require 'rdoc/task'
3
+ require 'rake/clean'
4
+
5
+ # Set the location of the examples
6
+ EXAMPLES = File.join('examples','benchmarks','*.c')
7
+
8
+ # Set the clean/clobber tasks
9
+ CLOBBER.include(Dir[File.join('examples','*','*_*-*')])
10
+
11
+ # Pick a target from a list of possible targets
12
+ # 0 1 2 3 4 5
13
+ TARGETS = ['GPU-CUDA','GPU-OPENCL-AMD','CPU-OPENCL-INTEL','CPU-OPENCL-AMD','CPU-OPENMP','CPU-C']
14
+ TARGET = TARGETS[0]
15
+
16
+ # Settings for Bones
17
+ MEASUREMENTS = true
18
+ VERIFICATION = true
19
+
20
+ # Small helper function to display text on screen
21
+ def display(text)
22
+ print '[Rake] ### '+text+': '
23
+ p
24
+ end
25
+
26
+ # Set the default task
27
+ task :default => [:examples]
28
+
29
+ # Rake tasks related to the examples
30
+ namespace :examples do
31
+
32
+ # Task to process and test everything (generating code, compiling code, executing)
33
+ desc 'Run all examples through Bones, compile them, and execute them'
34
+ task :verify, [:file] => [:generate, :compile, :execute] do |t, args|
35
+ end
36
+
37
+ # Task to pass examples through Bones
38
+ desc 'Generate target code using Bones'
39
+ task :generate, :file do |t, args|
40
+ args.with_defaults(:file => EXAMPLES)
41
+ Dir[args.file].sort.each do |file|
42
+ display('Generating')
43
+ options = (MEASUREMENTS ? '-m ' : '') + (VERIFICATION ? '-c ' : '')
44
+ sh "bin/bones -a #{file} -t #{TARGET} #{options}"
45
+ end
46
+ end
47
+
48
+ # Task to compile the generated code for the examples (NOTE: this task is a stub)
49
+ desc 'Compile all examples (using gcc/nvcc)'
50
+ task :compile, :file do |t, args|
51
+ args.with_defaults(:file => EXAMPLES)
52
+ Dir[args.file].sort.each do |file|
53
+ compile(file,TARGET)
54
+ end
55
+ end
56
+
57
+ # Task to execute the compiled code for the examples (NOTE: this task is a stub)
58
+ desc 'Execute all examples'
59
+ task :execute, :file do |t, args|
60
+ args.with_defaults(:file => EXAMPLES)
61
+ Dir[args.file].sort.each do |file|
62
+ execute(file,TARGET)
63
+ end
64
+ end
65
+
66
+ # Helper function to compile code
67
+ #def compile(file,target)
68
+ # (system-specific, to be filled in by the user)
69
+ #end
70
+
71
+ # Helper function to execute code
72
+ #def execute(file,target)
73
+ # (system-specific, to be filled in by the user)
74
+ #end
75
+
76
+ end
77
+ task :examples => ['examples:generate']
78
+
79
+ # Task which adds a new target to the skeleton library based on an existing target
80
desc 'Adds a new target to the skeleton library'
task :add_target, :name, :base do |t, args|
  # :name is the directory to create, :base the existing target to copy from
  args.with_defaults(:name => 'NEW-TARGET', :base => 'CPU-OPENMP')
  base = File.join('skeletons', args.base)
  name = File.join('skeletons', args.name)
  # File.exist? is used because the File.exists? alias no longer exists in
  # Ruby 3.2 and later.
  if File.exist?(base) && !File.exist?(name)
    # cp_r (FileUtils, available in every Rakefile) avoids passing the
    # directory names through a shell.
    cp_r base, name
  else
    puts '[Rake] ### Error adding new target'
  end
end
91
+
92
+ # Test individual parts of the code
93
+ Rake::TestTask.new do |test|
94
+ test.test_files = FileList[File.join('test','*','test_*.rb')]
95
+ test.verbose = false
96
+ end
97
+
98
+ # Generate HTML documentation using RDoc
99
+ RDoc::Task.new do |rdoc|
100
+ rdoc.title = 'Bones'
101
+ rdoc.options << '--line-numbers'
102
+ rdoc.rdoc_files.include(File.join('lib','**','*.rb'))
103
+ rdoc.rdoc_files.include('README.rdoc')
104
+ rdoc.rdoc_dir = 'rdoc'
105
+ rdoc.main = 'README.rdoc'
106
+ end
107
+
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 1.1.0
data/bin/bones ADDED
@@ -0,0 +1,20 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # Set the path for the libraries and the installation directory
4
+ BONES_DIR = File.dirname(__FILE__) + '/../'
5
+ lib_dir = File.join(BONES_DIR,'lib')
6
+ $LOAD_PATH.unshift lib_dir unless $LOAD_PATH.include?(lib_dir)
7
+
8
+ # Enable verbose output
9
+ VERBOSE = true
10
+
11
+ # Load the libraries
12
+ require 'castaddon.rb'
13
+ require 'bones.rb'
14
+
15
+ # Run the main function
16
+ bones = Bones::Engine.new
17
+ bones.process
18
+ bones.write_output
19
+
20
+
@@ -0,0 +1,552 @@
1
+ //
2
+ // This file is part of the Bones source-to-source compiler examples. This C-code
3
+ // demonstrates the use of Bones for an example (real) application: 'Fast Focus
4
+ // on Structures' (FFOS). For more information on the application or on Bones
5
+ // please use the contact information below.
6
+ //
7
+ // == More information on the FFOS application
8
+ // Contact............Yifan He / Zhenyu Ye
9
+ // Web address........http://zhenyu-ye.net/publications/acivs2011/yifan2011acivs.pdf
10
+ //
11
+ // == More information on Bones
12
+ // Contact............Cedric Nugteren <c.nugteren@tue.nl>
13
+ // Web address........http://parse.ele.tue.nl/bones/
14
+ //
15
+ // == File information
16
+ // Filename...........applications/ffos.c
17
+ // Author.............Cedric Nugteren
18
+ // Last modified on...22-May-2012
19
+ //
20
+
21
+ //########################################################################
22
+ //### Includes
23
+ //########################################################################
24
+
25
+ #include <stdio.h>
26
+ #include <stdlib.h>
27
+ #include <math.h>
28
+ #include <time.h>
29
+
30
+ //########################################################################
31
+ //### Defines
32
+ //########################################################################
33
+
34
+ #define XVECTORS 10
35
+ #define YVECTORS 10
36
+
37
+ //########################################################################
38
+ //### Forward declarations
39
+ //########################################################################
40
+
41
+ void SaveBMPFile(unsigned char ** image, const char * outputdestination, int width, int height);
42
+ unsigned char ** LoadBMPFile(int *width, int *height);
43
+ void CPU_FindCenters(int* vector, int *coordinates, int size);
44
+ void CPU_Visualize(unsigned char** image0, int* Xcoordinates, int* Ycoordinates, unsigned char **image3, int width, int height);
45
+ void CPU_BCV(int *histogram, float *BCVtable, int size);
46
+
47
+ //########################################################################
48
+ //### Global variables
49
+ //########################################################################
50
+
51
+ int messages = 2;
52
+
53
+ //########################################################################
54
+ //### Start of the main function
55
+ //########################################################################
56
+
57
+ int main(void) {
58
+
59
+ // Declare loop variables
60
+ int i,h,w,a;
61
+
62
+ // Set other variables
63
+ int threshold = 0;
64
+ int hist[256];
65
+ for (i=0;i<256;i++) { hist[i] = 0; }
66
+ float * BCVtable = (float *)malloc(256*sizeof(float));
67
+
68
+ // Loading image0 from disk
69
+ if (messages == 2) { printf("### Loading image0 from disk.\n"); }
70
+ int width = 0;
71
+ int height = 0;
72
+ unsigned char ** image0 = LoadBMPFile(&width, &height);
73
+
74
+ // Create space for image1
75
+ if (messages == 2) { printf("### Allocating space for image1.\n"); }
76
+ unsigned char ** image1 = (unsigned char **)malloc(width*sizeof(*image1));
77
+ unsigned char * image1_1D = (unsigned char *)malloc(width*height*sizeof(unsigned char));
78
+ for(i=0;i<width;i++) { image1[i] = &image1_1D[i*height]; }
79
+
80
+ // Create space for image2
81
+ if (messages == 2) { printf("### Allocating space for image2.\n"); }
82
+ unsigned char ** image2 = (unsigned char **)malloc(width*sizeof(*image2));
83
+ unsigned char * image2_1D = (unsigned char *)malloc(width*height*sizeof(unsigned char));
84
+ for(i=0;i<width;i++) { image2[i] = &image2_1D[i*height]; }
85
+
86
+ // Create space for image3
87
+ if (messages == 2) { printf("### Allocating space for image3.\n"); }
88
+ unsigned char ** image3 = (unsigned char **)malloc(width*sizeof(*image3));
89
+ unsigned char * image3_1D = (unsigned char *)malloc(width*height*sizeof(unsigned char));
90
+ for(i=0;i<width;i++) { image3[i] = &image3_1D[i*height]; }
91
+
92
+ // Create space for projection vectors
93
+ if (messages == 2) { printf("### Allocating space for projection vectors.\n"); fflush(stdout); }
94
+ int * Xvector = (int *)malloc(width*sizeof(int));
95
+ int * Yvector = (int *)malloc(height*sizeof(int));
96
+
97
+ // Create coordinate arrays
98
+ if (messages == 2) { printf("### Allocating space for coordinate arrays.\n"); fflush(stdout); }
99
+ int Xcoordinates[XVECTORS]; for(i=0;i<XVECTORS;i++) { Xcoordinates[i] = 0; }
100
+ int Ycoordinates[YVECTORS]; for(i=0;i<YVECTORS;i++) { Ycoordinates[i] = 0; }
101
+
102
+ //########################################################################
103
+ //### PART1: Histogramming (accelerated)
104
+ //########################################################################
105
+ if (messages >= 1) { printf("### PART1: Histogramming.\n"); fflush(stdout); }
106
+
107
+ #pragma species kernel 0:height-1,0:width-1|element -> 0:255|shared
108
+ for (h=0;h<height;h++) {
109
+ for (w=0;w<width;w++) {
110
+ hist[image0[h][w]] = hist[image0[h][w]] + 1;
111
+ }
112
+ }
113
+ #pragma species endkernel histogram
114
+
115
+ //########################################################################
116
+ //### Between class variance (CPU)
117
+ //########################################################################
118
+ if (messages == 2) { printf("### Create a between class variance table.\n"); fflush(stdout); }
119
+ CPU_BCV(hist, BCVtable, width*height);
120
+
121
+ //########################################################################
122
+ //### PART2: Search for the maximum (accelerated)
123
+ //########################################################################
124
+ if (messages >= 1) { printf("### PART2: Search for the maximum value.\n"); fflush(stdout); }
125
+ float maximum[1];
126
+ maximum[0] = 10;
127
+ int length = 256;
128
+
129
+ //#pragma species kernel 0:255|element -> 0:0|shared
130
+ for (i=0;i<length;i++) {
131
+ maximum[0] = (BCVtable[i] > maximum[0]) ? BCVtable[i] : maximum[0];
132
+ }
133
+ //#pragma species endkernel maximum_1
134
+
135
+ if (messages == 2) { printf("### Maximum is %.3lf.\n",maximum[0]); fflush(stdout); }
136
+
137
+ //########################################################################
138
+ //### PART3: Search for the maximum - larger synthetic example (accelerated)
139
+ //########################################################################
140
+ if (messages >= 1) { printf("### PART3: Search for the maximum value (synthetic example).\n"); fflush(stdout); }
141
+ int vector_size = 2097152; // 2048x1024
142
+ float* synthetic_vector = (float*)malloc(sizeof(float)*vector_size);
143
+ srand(time(NULL));for (i=0;i<vector_size;i++) { synthetic_vector[i] = (rand() % 7777777) / 1000.0; }
144
+ float result[1];
145
+ result[0] = 0;
146
+
147
+ //#pragma species kernel 0:2097151|element -> 0:0|shared
148
+ for (i=0;i<vector_size;i++) {
149
+ result[0] = (synthetic_vector[i] > result[0]) ? synthetic_vector[i] : result[0];
150
+ }
151
+ //#pragma species endkernel maximum_2
152
+
153
+ if (messages == 2) { printf("### Maximum is %.3lf.\n",result[0]); fflush(stdout); }
154
+
155
+ //########################################################################
156
+ //### Search for the index of the maximum (CPU)
157
+ //########################################################################
158
+ if (messages == 2) { printf("### Search for the index of the maximum value.\n"); fflush(stdout); }
159
+ for (i=0;i<256;i++) {
160
+ if (BCVtable[i] == maximum[0]) {
161
+ threshold = i;
162
+ break;
163
+ }
164
+ }
165
+
166
+ //########################################################################
167
+ //### PART4: Binarization (accelerated)
168
+ //########################################################################
169
+ if (messages >= 1) { printf("### PART4: Binarization with treshold at %d.\n",threshold); fflush(stdout); }
170
+
171
+ #pragma species kernel 0:height-1,0:width-1|element -> 0:height-1,0:width-1|element
172
+ for (h=0;h<height;h++) {
173
+ for (w=0;w<width;w++) {
174
+ if (image0[h][w] > threshold) { image1[h][w] = 1; }
175
+ else { image1[h][w] = 0; }
176
+ }
177
+ }
178
+ #pragma species endkernel threshold
179
+
180
+ //########################################################################
181
+ //### PART5: Erosion 7x7 (accelerated)
182
+ //########################################################################
183
+ if (messages >= 1) { printf("### PART5: Perform the erode kernel.\n"); fflush(stdout); }
184
+
185
+ int condition;
186
+ #pragma species kernel 7:height-8,7:width-8|neighbourhood(-3:3,-3:3) -> 0:height-1,0:width-1|element
187
+ for (h=0;h<height;h++) {
188
+ for (w=0;w<width;w++) {
189
+ if (w >= 7 && h >= 7 && w <= width-7 && h <= height-7) {
190
+ condition = 1;
191
+ for(a=-3;a<=3;a++) {
192
+ condition = condition
193
+ * image1[(h-3)][(w+a)]
194
+ * image1[(h-2)][(w+a)]
195
+ * image1[(h-1)][(w+a)]
196
+ * image1[(h+0)][(w+a)]
197
+ * image1[(h+1)][(w+a)]
198
+ * image1[(h+2)][(w+a)]
199
+ * image1[(h+3)][(w+a)]
200
+ ;
201
+ }
202
+ if (condition == 1) { image2[h][w] = 255; }
203
+ else { image2[h][w] = 0; }
204
+ }
205
+ else {
206
+ image2[h][w] = 0;
207
+ }
208
+ }
209
+ }
210
+ #pragma species endkernel erosion
211
+
212
+ //########################################################################
213
+ //### PART6: 1D erosion(7) synthetic example (accelerated)
214
+ //########################################################################
215
+ if (messages >= 1) { printf("### PART6: Perform the erode kernel (1D - synthetic).\n"); fflush(stdout); }
216
+ int vector_size2 = 2097152; // 2048x1024
217
+ int* vector2a = (int*)malloc(sizeof(int)*vector_size2);
218
+ int* vector2b = (int*)malloc(sizeof(int)*vector_size2);
219
+ srand(time(NULL));
220
+ for (i=0;i<vector_size2;i++) {
221
+ if (rand()%15 > 1) { vector2a[i] = 1; }
222
+ else { vector2a[i] = 0; }
223
+ }
224
+
225
+ //#pragma species kernel 0:2097151|neighbourhood(-3:3) -> 0:2097151|element
226
+ for (i=0;i<vector_size2;i++) {
227
+ if (i >= 7 && i <= vector_size2-7) {
228
+ condition = 1;
229
+ for(a=-3;a<=3;a++) {
230
+ condition = condition * vector2a[i+a];
231
+ }
232
+ if (condition == 1) { vector2b[i] = 255; }
233
+ else { vector2b[i] = 0; }
234
+ }
235
+ else {
236
+ vector2b[i] = 0;
237
+ }
238
+ }
239
+ //#pragma species endkernel erosion1d
240
+
241
+ // Compute a gold reference
242
+ int gold = 0;
243
+ int gold_condition = 1;
244
+ for(a=-3;a<=3;a++) { gold_condition = gold_condition * vector2a[10+a]; }
245
+ if (gold_condition == 1) { gold = 255; }
246
+ if (messages == 2) { printf("### Result at index 10 is %d and should be %d.\n",vector2b[10],gold); fflush(stdout); }
247
+
248
+ //########################################################################
249
+ //### PART7: Y-projection (accelerated)
250
+ //########################################################################
251
+ if (messages >= 1) { printf("### PART7: Starting the Y-projection algorithm.\n"); fflush(stdout); }
252
+
253
+ int result_yp;
254
+ #pragma species kernel 0:height-1,0:width-1|chunk(0:height-1,0:0) -> 0:width-1|element
255
+ for (w=0;w<width;w++) {
256
+ result_yp = 0;
257
+ for (h=0;h<height;h++) {
258
+ if (image2[h][w] == 255) {
259
+ result_yp = 255;
260
+ }
261
+ }
262
+ Yvector[w] = result_yp;
263
+ }
264
+ #pragma species endkernel y_projection
265
+
266
+ //########################################################################
267
+ //### PART8: X-projection (accelerated)
268
+ //########################################################################
269
+ if (messages >= 1) { printf("### PART8: Starting the X-projection algorithm.\n"); fflush(stdout); }
270
+
271
+ int result_xp;
272
+ #pragma species kernel 0:height-1,0:width-1|chunk(0:0,0:width-1) -> 0:height-1|element
273
+ for (h=0;h<height;h++) {
274
+ result_xp = 0;
275
+ for (w=0;w<width;w++) {
276
+ if (image2[h][w] == 255) {
277
+ result_xp = 255;
278
+ }
279
+ }
280
+ Xvector[h] = result_xp;
281
+ }
282
+ #pragma species endkernel x_projection
283
+
284
+ //########################################################################
285
+ //### Search for the centers of the projection vectors (CPU)
286
+ //########################################################################
287
+ if (messages == 2) { printf("### Search for X- and Y-projection vectors.\n"); fflush(stdout); }
288
+ CPU_FindCenters(Xvector, Xcoordinates, width);
289
+ CPU_FindCenters(Yvector, Ycoordinates, height);
290
+
291
+ //########################################################################
292
+ //### Visualize, save to disk and finalize the program
293
+ //########################################################################
294
+ CPU_Visualize(image0, Xcoordinates, Ycoordinates, image3, width, height);
295
+ SaveBMPFile(image1, "output1.bmp", width, height);
296
+ SaveBMPFile(image2, "output2.bmp", width, height);
297
+ SaveBMPFile(image3, "output3.bmp", width, height);
298
+ free(image0);
299
+ free(image1);
300
+ free(image1_1D);
301
+ free(image2);
302
+ free(image2_1D);
303
+ free(image3);
304
+ free(image3_1D);
305
+ free(Xvector);
306
+ free(Yvector);
307
+ free(BCVtable);
308
+ if (messages == 2) { printf("### End of program\n"); fflush(stdout); }
309
+ return 0;
310
+ }
311
+
312
+ //########################################################################
313
+ //### Structures used in the BMP functions
314
+ //########################################################################
315
+
316
+ typedef struct {
317
+ int size;
318
+ int reserved;
319
+ int offset;
320
+ } BMPHeader;
321
+ typedef struct {
322
+ int size;
323
+ int width;
324
+ int height;
325
+ int planesBitsPerPixel;
326
+ int compression;
327
+ int imageSize;
328
+ int xPelsPerMeter;
329
+ int yPelsPerMeter;
330
+ int clrUsed;
331
+ int clrImportant;
332
+ } BMPInfoHeader;
333
+
334
+ //########################################################################
335
+ //### Function to save BMP data to a file
336
+ //########################################################################
337
+
338
// Stores the low 32 bits of 'value' at 'dst' in little-endian byte order
static void bmp_put_le32(unsigned char *dst, unsigned long value)
{
  dst[0] = (unsigned char)( value        & 0xFF);
  dst[1] = (unsigned char)((value >> 8)  & 0xFF);
  dst[2] = (unsigned char)((value >> 16) & 0xFF);
  dst[3] = (unsigned char)((value >> 24) & 0xFF);
}

// Writes a single-channel image to 'outputdestination' as an uncompressed
// 24-bit BMP file. Each pixel value is replicated over the three colour
// channels.
//   image               array of 'width' pointers, each to 'height' pixels,
//                       addressed as image[x][y]
//   outputdestination   path of the file to create (overwritten if present)
//   width, height       image dimensions in pixels
// If the file cannot be opened, an error is printed and nothing is written.
void SaveBMPFile(unsigned char ** image, const char * outputdestination, int width, int height)
{
  // Variable declarations
  int x,y,j;
  FILE *fd_out;
  unsigned char header[54] = {0};

  // Every pixel row is padded to a multiple of 4 bytes; the file size stored
  // in the header has to include that padding.
  int padding = (4 - (width*3) % 4) % 4;
  unsigned long row_bytes = (unsigned long)width*3 + (unsigned long)padding;
  unsigned long file_size = row_bytes*(unsigned long)height + sizeof(header);

  // Open the output file
  fd_out = fopen(outputdestination, "wb");
  if (!fd_out) { printf("***BMP save error: cannot open output file***\n"); return; }

  // Fill in the file header (14 bytes) and info header (40 bytes); all fields
  // not set explicitly stay zero.
  header[0] = 66;                                   // 'B'
  header[1] = 77;                                   // 'M'
  bmp_put_le32(&header[2],  file_size);             // total file size
  bmp_put_le32(&header[10], sizeof(header));        // offset of the pixel data
  bmp_put_le32(&header[14], 40);                    // size of the info header
  bmp_put_le32(&header[18], (unsigned long)width);  // full 32-bit width
  bmp_put_le32(&header[22], (unsigned long)height); // full 32-bit height
  header[26] = 1;                                   // number of planes
  header[28] = 24;                                  // bits per pixel
  if (fwrite(header, 1, sizeof(header), fd_out) != sizeof(header)) {
    printf("***BMP save error: cannot write header***\n");
    fclose(fd_out);
    return;
  }

  // Save RGB data to output file
  for(y=0;y<height;y++) {
    for(x=0;x<width;x++) {
      fputc(image[x][y],fd_out);
      fputc(image[x][y],fd_out);
      fputc(image[x][y],fd_out);
    }
    for(j=0;j<padding;j++) {
      fputc(0,fd_out);
    }
  }

  // Clean up; fclose flushes, so a failure here means data was lost
  if (fclose(fd_out) != 0) { printf("***BMP save error: cannot write file***\n"); }
}
379
+
380
+ //########################################################################
381
+ //### Function to load BMP data from disk
382
+ //########################################################################
383
+
384
+ unsigned char ** LoadBMPFile(int *width, int *height)
385
+ {
386
+ // Variable declarations
387
+ short type;
388
+ int temp;
389
+ BMPHeader hdr;
390
+ BMPInfoHeader infoHdr;
391
+ FILE *fd;
392
+ int i, y, x;
393
+
394
+ // Open the file stream
395
+ fd = fopen("../../../input.bmp","rb");
396
+
397
+ // Open the file and scan the contents
398
+ if(!(fd)) { printf("***BMP load error: file access denied***\n"); exit(0); }
399
+ temp = fread(&type, sizeof(short), 1, fd);
400
+ temp = fread(&hdr, sizeof(hdr), 1, fd);
401
+ if(type != 0x4D42) { printf("***BMP load error: bad file format***\n"); exit(0); }
402
+ temp = fread(&infoHdr, sizeof(infoHdr), 1, fd);
403
+ if((infoHdr.planesBitsPerPixel>>16) != 24) { printf("***BMP load error: invalid color depth (%d)*** \n",(infoHdr.planesBitsPerPixel>>16)); exit(0); }
404
+ if(infoHdr.compression) { printf("***BMP load error: compressed image***\n"); exit(0); }
405
+ (*width) = infoHdr.width;
406
+ (*height) = infoHdr.height;
407
+
408
+ // Allocate memory to store the BMP's contents
409
+ unsigned char ** image = (unsigned char **)malloc((*width) * sizeof(*image));
410
+ unsigned char * image_1D = (unsigned char *)malloc((*width) * (*height) * sizeof(unsigned char));
411
+ for(i=0; i<(*width); i++) {
412
+ image[i] = &image_1D[i*(*height)];
413
+ }
414
+
415
+ // Read the BMP file and store the contents
416
+ fseek(fd, hdr.offset - sizeof(hdr) - sizeof(infoHdr), SEEK_CUR);
417
+ for(y = 0; y < (*height); y++) {
418
+ for(x = 0; x < (*width); x++) {
419
+ image[x][y] = ((int)fgetc(fd));
420
+ fgetc(fd);
421
+ fgetc(fd);
422
+ }
423
+ int over = (4 - ((*width)*3) % 4) % 4;
424
+ if (over != 0) {
425
+ for(x = 0; x < over; x++) {
426
+ fgetc(fd);
427
+ }
428
+ }
429
+ }
430
+
431
+ // Exit the function and clean-up
432
+ if(ferror(fd)) {
433
+ printf("***Unknown BMP load error.***\n");
434
+ free(image[0]);
435
+ free(image);
436
+ exit(0);
437
+ }
438
+ fclose(fd);
439
+ return image;
440
+ }
441
+
442
+ //########################################################################
443
+ //### Find the center of a projection vector (using a state machine)
444
+ //########################################################################
445
// Capacity of the coordinate arrays handed to CPU_FindCenters. When the
// XVECTORS/YVECTORS defines of this file are visible, the smaller of the two
// is used; the fallback equals their value in this file.
#ifndef FFOS_MAX_CENTERS
  #if defined(XVECTORS) && defined(YVECTORS)
    #define FFOS_MAX_CENTERS (((XVECTORS) < (YVECTORS)) ? (XVECTORS) : (YVECTORS))
  #else
    #define FFOS_MAX_CENTERS 10
  #endif
#endif

// Finds the centers of the runs of 255-values in a projection vector, using a
// two-state machine (state 0: last value was zero, state 1: inside a run).
//   vector        input of 'size' elements, expected to hold only 0 or 255
//   coordinates   output: receives the center index of every run longer than
//                 4 elements that is terminated by a zero
//   size          number of elements in 'vector'
// At most FFOS_MAX_CENTERS coordinates are stored; later runs are ignored so
// that the caller's array cannot be overrun. A run that reaches the end of
// the vector without a terminating zero is not reported.
void CPU_FindCenters(int* vector, int *coordinates, int size) {
  int s;
  int state = 0;
  int count = 0;
  int coordinate = 0;
  for (s=0;s<size;s++) {
    if (state == 0) {                      // Last thing I found was a zero
      if (vector[s] == 255) {              // I found a 255 now
        state = 1;
        count = 0;
      }
    }
    if (state == 1) {                      // Last thing I found was 255
      if (vector[s] == 0) {                // I found a zero now
        state = 0;
        if (count > 4) {                   // To filter out noise
          if (coordinate >= FFOS_MAX_CENTERS) { break; } // Output array is full
          coordinates[coordinate] = s-(count/2);
          coordinate++;
        }
      }
      else {                               // I found a 255 again
        count++;
      }
    }
  }
}
471
+
472
+ //########################################################################
473
+ //### CPU kernel to visualize the results
474
+ //########################################################################
475
+ void CPU_Visualize(unsigned char** image0, int* Xcoordinates, int* Ycoordinates, unsigned char **image3, int width, int height) {
476
+
477
+ // Loop variables
478
+ int h, w, x, y;
479
+
480
+ // Copy the whole image
481
+ for (h=0;h<height;h++) {
482
+ for (w=0;w<width;w++) {
483
+ unsigned char value = image0[h][w];
484
+ image3[h][w] = value;
485
+ }
486
+ }
487
+
488
+ // Replace the centers with white pixels
489
+ for (x=0;x<XVECTORS;x++) {
490
+ for (y=0;y<YVECTORS;y++) {
491
+ image3[Xcoordinates[x]][Ycoordinates[y]] = 255;
492
+ }
493
+ }
494
+ }
495
+
496
+ //########################################################################
497
+ //### CPU kernel function for between class variance (BCV), part of Otsu thresholding
498
+ //########################################################################
499
+ void CPU_BCV(int *histogram, float *BCVtable, int size) {
500
+ int i;
501
+
502
+ // Initialize the BCV table to zero
503
+ for (i=0;i<256;i++) {
504
+ BCVtable[i] = 0;
505
+ }
506
+
507
+ // Pre-calculated the total of the weigthed sums
508
+ int wsumtotal = 0;
509
+ for (i=0;i<256;i++) {
510
+ wsumtotal = wsumtotal + i*histogram[i];
511
+ }
512
+
513
+ // Set the initial values
514
+ int sumb = 0;
515
+ int sumf = size;
516
+ int wsumb = 0;
517
+ int wsumf = wsumtotal;
518
+
519
+ float wb;
520
+ float wf;
521
+ float meanb;
522
+ float meanf;
523
+
524
+ // Iterate over all possible threshold values
525
+ for (i=0;i<256;i++) {
526
+
527
+ // Update the weighted sums
528
+ wsumb = wsumb + i*histogram[i];
529
+ wsumf = wsumtotal - wsumb;
530
+
531
+ // Calculate the necessary components
532
+ wb = sumb / (float)size;
533
+ wf = sumf / (float)size;
534
+ meanb = wsumb / (float)sumb;
535
+ meanf = wsumf / (float)sumf;
536
+
537
+ // Stop if the sum of foreground is equal to zero
538
+ if (sumf == 0) { break; }
539
+
540
+ // Output the BCV value
541
+ BCVtable[i] = wb*wf*(meanb-meanf)*(meanb-meanf);
542
+
543
+ // If the sum of the background was equal to zero, BCV table will be NaN and must be set to zero
544
+ if (sumb == 0) { BCVtable[i] = 0; }
545
+
546
+ // Update the sum of the background (all darker pixels compared to the current pixel) and foreground pixels (the rest)
547
+ sumb = sumb + histogram[i];
548
+ sumf = size - sumb;
549
+ }
550
+ }
551
+
552
+ //########################################################################