bones-compiler 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +117 -0
- data/LICENSE +9 -0
- data/README.rdoc +126 -0
- data/Rakefile +107 -0
- data/VERSION +1 -0
- data/bin/bones +20 -0
- data/examples/applications/ffos.c +552 -0
- data/examples/benchmarks/2mm.c +70 -0
- data/examples/benchmarks/3mm.c +81 -0
- data/examples/benchmarks/adi.c +81 -0
- data/examples/benchmarks/atax.c +65 -0
- data/examples/benchmarks/bicg.c +67 -0
- data/examples/benchmarks/cholesky.c +64 -0
- data/examples/benchmarks/common.h +168 -0
- data/examples/benchmarks/correlation.c +97 -0
- data/examples/benchmarks/covariance.c +77 -0
- data/examples/benchmarks/doitgen.c +63 -0
- data/examples/benchmarks/durbin.c +76 -0
- data/examples/benchmarks/dynprog.c +67 -0
- data/examples/benchmarks/fdtd-2d-apml.c +114 -0
- data/examples/benchmarks/fdtd-2d.c +74 -0
- data/examples/benchmarks/floyd-warshall.c +50 -0
- data/examples/benchmarks/gemm.c +69 -0
- data/examples/benchmarks/gemver.c +89 -0
- data/examples/benchmarks/gesummv.c +64 -0
- data/examples/benchmarks/gramschmidt.c +84 -0
- data/examples/benchmarks/jacobi-1d-imper.c +55 -0
- data/examples/benchmarks/jacobi-2d-imper.c +61 -0
- data/examples/benchmarks/lu.c +57 -0
- data/examples/benchmarks/ludcmp.c +91 -0
- data/examples/benchmarks/mvt.c +65 -0
- data/examples/benchmarks/overview.txt +38 -0
- data/examples/benchmarks/reg_detect.c +82 -0
- data/examples/benchmarks/saxpy.c +45 -0
- data/examples/benchmarks/seidel-2d.c +51 -0
- data/examples/benchmarks/symm.c +74 -0
- data/examples/benchmarks/syr2k.c +65 -0
- data/examples/benchmarks/syrk.c +62 -0
- data/examples/benchmarks/trisolv.c +57 -0
- data/examples/benchmarks/trmm.c +57 -0
- data/examples/chunk/example1.c +54 -0
- data/examples/chunk/example2.c +44 -0
- data/examples/chunk/example3.c +59 -0
- data/examples/chunk/example4.c +55 -0
- data/examples/chunk/example5.c +52 -0
- data/examples/element/example1.c +46 -0
- data/examples/element/example10.c +50 -0
- data/examples/element/example11.c +47 -0
- data/examples/element/example12.c +56 -0
- data/examples/element/example2.c +46 -0
- data/examples/element/example3.c +58 -0
- data/examples/element/example4.c +49 -0
- data/examples/element/example5.c +56 -0
- data/examples/element/example6.c +46 -0
- data/examples/element/example7.c +54 -0
- data/examples/element/example8.c +45 -0
- data/examples/element/example9.c +48 -0
- data/examples/neighbourhood/example1.c +54 -0
- data/examples/neighbourhood/example2.c +55 -0
- data/examples/neighbourhood/example3.c +82 -0
- data/examples/neighbourhood/example4.c +52 -0
- data/examples/shared/example1.c +45 -0
- data/examples/shared/example2.c +51 -0
- data/examples/shared/example3.c +55 -0
- data/examples/shared/example4.c +52 -0
- data/examples/shared/example5.c +48 -0
- data/lib/bones.rb +266 -0
- data/lib/bones/algorithm.rb +541 -0
- data/lib/bones/engine.rb +386 -0
- data/lib/bones/preprocessor.rb +161 -0
- data/lib/bones/species.rb +196 -0
- data/lib/bones/structure.rb +94 -0
- data/lib/bones/variable.rb +169 -0
- data/lib/bones/variablelist.rb +72 -0
- data/lib/castaddon.rb +27 -0
- data/lib/castaddon/index.rb +40 -0
- data/lib/castaddon/node.rb +753 -0
- data/lib/castaddon/type.rb +37 -0
- data/skeletons/CPU-C/common/epilogue.c +0 -0
- data/skeletons/CPU-C/common/globals.c +17 -0
- data/skeletons/CPU-C/common/globals_kernel.c +1 -0
- data/skeletons/CPU-C/common/header.c +0 -0
- data/skeletons/CPU-C/common/mem_copy_D2H.c +0 -0
- data/skeletons/CPU-C/common/mem_copy_H2D.c +0 -0
- data/skeletons/CPU-C/common/mem_epilogue.c +0 -0
- data/skeletons/CPU-C/common/mem_prologue.c +3 -0
- data/skeletons/CPU-C/common/prologue.c +0 -0
- data/skeletons/CPU-C/common/timer_1_start.c +0 -0
- data/skeletons/CPU-C/common/timer_1_stop.c +0 -0
- data/skeletons/CPU-C/common/timer_2_start.c +20 -0
- data/skeletons/CPU-C/common/timer_2_stop.c +8 -0
- data/skeletons/CPU-C/kernel/default.host.c +3 -0
- data/skeletons/CPU-C/kernel/default.kernel.c +15 -0
- data/skeletons/CPU-C/skeletons.txt +24 -0
- data/skeletons/CPU-OPENCL-AMD/common/epilogue.c +6 -0
- data/skeletons/CPU-OPENCL-AMD/common/globals.c +155 -0
- data/skeletons/CPU-OPENCL-AMD/common/globals_kernel.c +4 -0
- data/skeletons/CPU-OPENCL-AMD/common/header.c +0 -0
- data/skeletons/CPU-OPENCL-AMD/common/mem_copy_D2H.c +8 -0
- data/skeletons/CPU-OPENCL-AMD/common/mem_copy_H2D.c +4 -0
- data/skeletons/CPU-OPENCL-AMD/common/mem_epilogue.c +3 -0
- data/skeletons/CPU-OPENCL-AMD/common/mem_prologue.c +6 -0
- data/skeletons/CPU-OPENCL-AMD/common/prologue.c +24 -0
- data/skeletons/CPU-OPENCL-AMD/common/timer_1_start.c +5 -0
- data/skeletons/CPU-OPENCL-AMD/common/timer_1_stop.c +9 -0
- data/skeletons/CPU-OPENCL-AMD/common/timer_2_start.c +16 -0
- data/skeletons/CPU-OPENCL-AMD/common/timer_2_stop.c +11 -0
- data/skeletons/CPU-OPENCL-AMD/kernel/D-element-to-1-shared.host.c +67 -0
- data/skeletons/CPU-OPENCL-AMD/kernel/D-element-to-1-shared.kernel.cl +72 -0
- data/skeletons/CPU-OPENCL-AMD/kernel/default.host.c +14 -0
- data/skeletons/CPU-OPENCL-AMD/kernel/default.kernel.cl +13 -0
- data/skeletons/CPU-OPENCL-AMD/skeletons.txt +26 -0
- data/skeletons/CPU-OPENCL-INTEL/common/epilogue.c +3 -0
- data/skeletons/CPU-OPENCL-INTEL/common/globals.c +154 -0
- data/skeletons/CPU-OPENCL-INTEL/common/globals_kernel.c +4 -0
- data/skeletons/CPU-OPENCL-INTEL/common/header.c +31 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_D2H.c +5 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_H2D.c +3 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_epilogue.c +3 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_prologue.c +4 -0
- data/skeletons/CPU-OPENCL-INTEL/common/prologue.c +24 -0
- data/skeletons/CPU-OPENCL-INTEL/common/timer_1_start.c +5 -0
- data/skeletons/CPU-OPENCL-INTEL/common/timer_1_stop.c +9 -0
- data/skeletons/CPU-OPENCL-INTEL/common/timer_2_start.c +16 -0
- data/skeletons/CPU-OPENCL-INTEL/common/timer_2_stop.c +11 -0
- data/skeletons/CPU-OPENCL-INTEL/kernel/D-element-to-1-shared.host.c +67 -0
- data/skeletons/CPU-OPENCL-INTEL/kernel/D-element-to-1-shared.kernel.cl +72 -0
- data/skeletons/CPU-OPENCL-INTEL/kernel/default.host.c +14 -0
- data/skeletons/CPU-OPENCL-INTEL/kernel/default.kernel.cl +13 -0
- data/skeletons/CPU-OPENCL-INTEL/skeletons.txt +26 -0
- data/skeletons/CPU-OPENMP/common/epilogue.c +0 -0
- data/skeletons/CPU-OPENMP/common/globals.c +37 -0
- data/skeletons/CPU-OPENMP/common/globals_kernel.c +6 -0
- data/skeletons/CPU-OPENMP/common/header.c +0 -0
- data/skeletons/CPU-OPENMP/common/mem_copy_D2H.c +0 -0
- data/skeletons/CPU-OPENMP/common/mem_copy_H2D.c +0 -0
- data/skeletons/CPU-OPENMP/common/mem_epilogue.c +0 -0
- data/skeletons/CPU-OPENMP/common/mem_prologue.c +3 -0
- data/skeletons/CPU-OPENMP/common/prologue.c +0 -0
- data/skeletons/CPU-OPENMP/common/timer_1_start.c +12 -0
- data/skeletons/CPU-OPENMP/common/timer_1_stop.c +0 -0
- data/skeletons/CPU-OPENMP/common/timer_2_start.c +18 -0
- data/skeletons/CPU-OPENMP/common/timer_2_stop.c +8 -0
- data/skeletons/CPU-OPENMP/kernel/D-element-to-1-shared.host.c +27 -0
- data/skeletons/CPU-OPENMP/kernel/D-element-to-1-shared.kernel.c +46 -0
- data/skeletons/CPU-OPENMP/kernel/default.host.c +11 -0
- data/skeletons/CPU-OPENMP/kernel/default.kernel.c +18 -0
- data/skeletons/CPU-OPENMP/skeletons.txt +26 -0
- data/skeletons/GPU-CUDA/common/epilogue.c +0 -0
- data/skeletons/GPU-CUDA/common/globals.c +31 -0
- data/skeletons/GPU-CUDA/common/globals_kernel.c +4 -0
- data/skeletons/GPU-CUDA/common/header.c +0 -0
- data/skeletons/GPU-CUDA/common/mem_copy_D2H.c +3 -0
- data/skeletons/GPU-CUDA/common/mem_copy_H2D.c +3 -0
- data/skeletons/GPU-CUDA/common/mem_epilogue.c +3 -0
- data/skeletons/GPU-CUDA/common/mem_prologue.c +5 -0
- data/skeletons/GPU-CUDA/common/prologue.c +6 -0
- data/skeletons/GPU-CUDA/common/timer_1_start.c +6 -0
- data/skeletons/GPU-CUDA/common/timer_1_stop.c +10 -0
- data/skeletons/GPU-CUDA/common/timer_2_start.c +6 -0
- data/skeletons/GPU-CUDA/common/timer_2_stop.c +10 -0
- data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.kernel.cu +105 -0
- data/skeletons/GPU-CUDA/kernel/D-element-to-1-shared.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/D-element-to-1-shared.kernel.cu +119 -0
- data/skeletons/GPU-CUDA/kernel/D-element-to-N-shared.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/D-element-to-N-shared.kernel.cu +166 -0
- data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.kernel.cu +69 -0
- data/skeletons/GPU-CUDA/kernel/N-neighbourhood-N-to-N-element.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/N-neighbourhood-N-to-N-element.kernel.cu +42 -0
- data/skeletons/GPU-CUDA/kernel/default.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/default.kernel.cu +28 -0
- data/skeletons/GPU-CUDA/skeletons.txt +30 -0
- data/skeletons/GPU-OPENCL-AMD/common/epilogue.c +3 -0
- data/skeletons/GPU-OPENCL-AMD/common/globals.c +155 -0
- data/skeletons/GPU-OPENCL-AMD/common/globals_kernel.c +4 -0
- data/skeletons/GPU-OPENCL-AMD/common/header.c +0 -0
- data/skeletons/GPU-OPENCL-AMD/common/mem_copy_D2H.c +4 -0
- data/skeletons/GPU-OPENCL-AMD/common/mem_copy_H2D.c +4 -0
- data/skeletons/GPU-OPENCL-AMD/common/mem_epilogue.c +3 -0
- data/skeletons/GPU-OPENCL-AMD/common/mem_prologue.c +3 -0
- data/skeletons/GPU-OPENCL-AMD/common/prologue.c +24 -0
- data/skeletons/GPU-OPENCL-AMD/common/timer_1_start.c +5 -0
- data/skeletons/GPU-OPENCL-AMD/common/timer_1_stop.c +9 -0
- data/skeletons/GPU-OPENCL-AMD/common/timer_2_start.c +4 -0
- data/skeletons/GPU-OPENCL-AMD/common/timer_2_stop.c +11 -0
- data/skeletons/GPU-OPENCL-AMD/kernel/D-element-to-1-shared.host.c +67 -0
- data/skeletons/GPU-OPENCL-AMD/kernel/D-element-to-1-shared.kernel.cl +72 -0
- data/skeletons/GPU-OPENCL-AMD/kernel/default.host.c +14 -0
- data/skeletons/GPU-OPENCL-AMD/kernel/default.kernel.cl +13 -0
- data/skeletons/GPU-OPENCL-AMD/skeletons.txt +26 -0
- data/skeletons/verification/header.c +2 -0
- data/skeletons/verification/timer_start.c +4 -0
- data/skeletons/verification/timer_stop.c +6 -0
- data/skeletons/verification/verify_results.c +23 -0
- data/test/bones/test_algorithm.rb +40 -0
- data/test/bones/test_common.rb +54 -0
- data/test/bones/test_preprocessor.rb +46 -0
- data/test/bones/test_species.rb +21 -0
- data/test/bones/test_variable.rb +84 -0
- data/test/test_helper.rb +106 -0
- metadata +303 -0
data/CHANGELOG
ADDED
@@ -0,0 +1,117 @@
|
|
1
|
+
###################
|
2
|
+
### v1.1 ###
|
3
|
+
###################
|
4
|
+
|
5
|
+
General:
|
6
|
+
- Added support for a compiler optimisation pass: array substitution with a local register copy in the case of chunk to element species.
|
7
|
+
- Added support for a compiler optimisation pass: thread-merging, potentially improving re-use through locality at the cost of parallelism.
|
8
|
+
- Updated and added examples.
|
9
|
+
- Tuned the skeletons.
|
10
|
+
|
11
|
+
Skeletons performance/readability/usability tuning:
|
12
|
+
- Added a special GPU-CUDA 'chunk(1,N)' skeleton with pre-shuffling (e.g. atax, bicg, mvt, syrk). Added support for a corresponding transformation in Bones.
|
13
|
+
- Added a special GPU-CUDA 2x'chunk(1,N)' skeleton with pre-shuffling if there are two chunk inputs (e.g. gesummv,syr2k).
|
14
|
+
- Updated the various OpenCL skeletons to create only once the 'context', 'command queue' and 'program'. This saves significant time for programs executing a single kernel multiple times or multiple kernels subsequently.
|
15
|
+
- Tuned GPU-CUDA skeletons to prefer L1 cache over scratchpad memory.
|
16
|
+
|
17
|
+
Examples:
|
18
|
+
- Updated several benchmarks with additional species.
|
19
|
+
- Added the 'saxpy' benchmark.
|
20
|
+
- Added 'example12' to demonstrate the use of species in a separate function.
|
21
|
+
|
22
|
+
Bug fixes:
|
23
|
+
- Removed the generation of global code in case no species are detected.
|
24
|
+
|
25
|
+
Miscellaneous:
|
26
|
+
- Renamed the original main function and created a new 'main' in 'common/globals.c' with initialisation and clean-up. The use of '#pragma species initialize' is now deprecated, but a main function is required in the original code.
|
27
|
+
- Added the identification of a species' function, making variable definition detection local.
|
28
|
+
- Improved the compiler run-time for algorithms for which a skeleton is unavailable.
|
29
|
+
- Added a command-line option to be able to generate code for a single species only (-only_alg_number).
|
30
|
+
- Added a command-line option to set a fixed thread-merge factor (-merge_factor).
|
31
|
+
- Added a gemspec file.
|
32
|
+
- Updated the documentation and readme.
|
33
|
+
|
34
|
+
###################
|
35
|
+
### v1.0 ###
|
36
|
+
###################
|
37
|
+
|
38
|
+
General:
|
39
|
+
- Added a benchmark set to the examples directory based on the PolyBench/C benchmark set.
|
40
|
+
- Performed major code refactoring, improving maintainability and performance of Bones.
|
41
|
+
- Updated all examples to work with the upcoming 'automatic species extraction tool' (ASET).
|
42
|
+
|
43
|
+
Skeletons performance/readability/usability tuning:
|
44
|
+
- Updated the 'GPU-CUDA' reduction skeleton to support initial values, loops not starting at zero or not a power of 2, and more (see the new 'shared/example5.c').
|
45
|
+
- Improved the way skeletons are matched with species. Introduced a 'default' skeleton and modified others to accommodate these changes.
|
46
|
+
|
47
|
+
Improved code/species support:
|
48
|
+
- Added atomic support for OpenCL based targets
|
49
|
+
- Added support for loop iterator variables inside the 'algorithmic species' with selective memory copy for the 'GPU-CUDA' target, see 'element/example11.c'.
|
50
|
+
|
51
|
+
Bug fixes:
|
52
|
+
- Fixed a compatibility problem with CAST version 0.2.0.
|
53
|
+
|
54
|
+
Miscellaneous:
|
55
|
+
- Added proper error messaging to catch various exceptions
|
56
|
+
- Reversed the order of loop-flattening to obtain coalesced memory accesses (in particular for the 'GPU-CUDA' target).
|
57
|
+
- Renamed 'tile' into 'chunk', since 'tile' might imply using a 2D chunk of data.
|
58
|
+
- Changed '#pragma bones' into '#pragma species' to match the algorithmic species naming.
|
59
|
+
- Added a warning message for negative or zero range dimensions.
|
60
|
+
- Organized the search-and-replace parameters found in skeletons, renamed most of them and added a few new ones.
|
61
|
+
- Added headers to the code examples and cleaned them up.
|
62
|
+
- Added a warning message if an 'endkernel' pragma is missing.
|
63
|
+
- Improved the way error messages are thrown.
|
64
|
+
|
65
|
+
###################
|
66
|
+
### v0.9 (Beta) ###
|
67
|
+
###################
|
68
|
+
|
69
|
+
General:
|
70
|
+
- Implemented the new 'algorithmic species' using ranges, thus creating support for for-loops with affine functions as loop-bounds.
|
71
|
+
- Updated the examples
|
72
|
+
|
73
|
+
Supported targets:
|
74
|
+
- Changed the names of the targets 'GPU-OPENCL' and 'CPU-OPENCL' into 'GPU-OPENCL-AMD' and 'CPU-OPENCL-INTEL' respectively.
|
75
|
+
- Added a new target 'CPU-C' which implements a C-to-C pass-through.
|
76
|
+
- Added a new target 'CPU-OPENMP', which uses OpenMP to create 4 CPU threads.
|
77
|
+
- Added a new target 'CPU-OPENCL-AMD'. This target is similar to 'CPU-OPENCL-INTEL', but targets the AMD APP.
|
78
|
+
|
79
|
+
Skeletons performance/readability/usability tuning:
|
80
|
+
- Added a basic prefetching technique in the local memory for the neighbourhood skeleton for the 'GPU-CUDA' target.
|
81
|
+
- Removed the first entry in the transformation settings, which was used in previous versions to set the dimensions (now automatically detected).
|
82
|
+
- Changed the 'GPU-CUDA' skeletons such that host files can be compiled with a C99 compiler.
|
83
|
+
- Tuned 'CPU-OPENCL-INTEL' performance for Intel's OpenCL SDK.
|
84
|
+
- Created aligned memory allocation functions to enable zero-copy possibilities for the 'CPU-OPENCL-INTEL' target.
|
85
|
+
- Completed the addition of 'bones_' for every variable in the skeletons.
|
86
|
+
|
87
|
+
Verification code:
|
88
|
+
- Created a verification function specific to each output.
|
89
|
+
- Moved the verification code (including the original code) to a separate file, which is only generated if '-c' is provided as a flag to Bones.
|
90
|
+
|
91
|
+
Bug fixes:
|
92
|
+
- Fixed a bug which would create function names starting with a digit.
|
93
|
+
- Adjusted the use of directory structures in the code for Windows-compatibility.
|
94
|
+
- Fixed a bug where variables for verification would have duplicate names.
|
95
|
+
- Fixed a bug where verification code would not compile for 'unsigned int' types.
|
96
|
+
- Fixed a memory leak which would occur when verification is enabled.
|
97
|
+
- Fixed a bug where statements of the form 'a[i]++' would not be recognized as input nor as output. They are now rewritten as 'a[i]=a[i]+1'.
|
98
|
+
|
99
|
+
Miscellaneous:
|
100
|
+
- Added support for selective copying-out (based on array access ranges)
|
101
|
+
- Added support for defines found in header functions (pre-processor now also pre-processes the header files)
|
102
|
+
- Added the possibility to specify the order of inputs/outputs in the classification by giving their names (if not given, the default ordering is assumed).
|
103
|
+
- Writing to a specific location in an array followed by a read no longer causes the array to be considered as both input and output; it is now output only.
|
104
|
+
- Added a check to see if for-loops start and end as expected (as provided by the ranges given through the 'algorithmic species').
|
105
|
+
- Created a 'simplify' function, which simplifies math expressions to a certain extent. A test is included to give a few examples of what it can do.
|
106
|
+
- Clean-up of the Rakefile, addition of stub tasks to compile and execute example code, and the addition of an 'add new target' task.
|
107
|
+
- Changed the code such that the core components of Bones (the 'lib' folder) do not have to be adjusted to add a new target.
|
108
|
+
- Added performance measurement for original code in case verification is enabled.
|
109
|
+
- Renamed 'Tribe' into 'Species' and 'Primitive' into 'Algorithm'.
|
110
|
+
|
111
|
+
###################
|
112
|
+
### v0.8 (Beta) ###
|
113
|
+
###################
|
114
|
+
|
115
|
+
Initial release.
|
116
|
+
|
117
|
+
###################
|
data/LICENSE
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
Copyright (c) 2012 Cedric Nugteren, Eindhoven University of Technology, The Netherlands
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
4
|
+
|
5
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
6
|
+
|
7
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
8
|
+
|
9
|
+
|
data/README.rdoc
ADDED
@@ -0,0 +1,126 @@
|
|
1
|
+
= Bones
|
2
|
+
Recent advances in multi-core and many-core processors require programmers to exploit an increasing amount of parallelism from their applications. Data parallel languages such as CUDA and OpenCL make it possible to take advantage of such processors, but still require a large amount of effort from programmers. To address the challenge of parallel programming, we introduce Bones.
|
3
|
+
|
4
|
+
Bones is a source-to-source compiler based on algorithmic skeletons and a new algorithm classification (named 'algorithmic species'). The compiler takes C-code annotated with class information as input and generates parallelized target code. Targets include NVIDIA GPUs (through CUDA), AMD GPUs (through OpenCL) and CPUs (through OpenCL and OpenMP). Bones is open-source, written in the Ruby programming language, and is available through our website. The compiler is based on the C-parser CAST, which is used to parse the input code into an abstract syntax tree (AST) and to generate the target code from a transformed AST.
|
5
|
+
|
6
|
+
== Usage
|
7
|
+
The usage is as follows:
|
8
|
+
bones --application <input> --target <target> [OPTIONS]
|
9
|
+
With the following flags:
|
10
|
+
--application, -a <s>: Input application file
|
11
|
+
--target, -t <s>: Target processor (choose from: GPU-CUDA, GPU-OPENCL-AMD,
|
12
|
+
CPU-OPENCL-INTEL, CPU-OPENCL-AMD, CPU-OPENMP, CPU-C)
|
13
|
+
--measurements, -m: Enable/disable timers
|
14
|
+
--verify, -c: Verify correctness of the generated code
|
15
|
+
--version, -v: Print version and exit
|
16
|
+
--help, -h: Show this message
|
17
|
+
|
18
|
+
Bones can be invoked from the command-line. Two arguments (-a and -t) are mandatory, others are optional. This is an example of the usage of Bones assuming the file '+example.c+' to be present:
|
19
|
+
bones -a example.c -t GPU-CUDA -c
|
20
|
+
|
21
|
+
== Examples
|
22
|
+
The best place to start experimenting with Bones is the '+examples+' directory. A large number of examples are available in this folder, grouped by algorithmic species (either element, neighbourhood, shared or chunk). The examples illustrate different kinds of coding styles and give a large number of different classes to work with. The folder '+benchmarks+' gives more examples, taken from the PolyBench/C benchmark set. Additionally, a folder '+applications+' is included, containing complete example applications. Currently, the following examples are available:
|
23
|
+
|-- element |-- applications
|
24
|
+
| |-- example1.c | \-- ffos.c
|
25
|
+
| |-- example2.c \-- benchmarks
|
26
|
+
| |-- example3.c |-- 2mm.c
|
27
|
+
| |-- example4.c |-- 3mm.c
|
28
|
+
| |-- example5.c |-- adi.c
|
29
|
+
| |-- example6.c |-- atax.c
|
30
|
+
| |-- example7.c |-- bicg.c
|
31
|
+
| |-- example8.c |-- cholesky.c
|
32
|
+
| |-- example9.c |-- correlation.c
|
33
|
+
| |-- example10.c |-- covariance.c
|
34
|
+
| |-- example11.c |-- doitgen.c
|
35
|
+
| \-- example12.c |-- durbin.c
|
36
|
+
|-- neighbourhood |-- dynprog.c
|
37
|
+
| |-- example1.c |-- fdtd-2d-apml.c
|
38
|
+
| |-- example2.c |-- fdtd-2d.c
|
39
|
+
| |-- example3.c |-- floyd-warshall.c
|
40
|
+
| \-- example4.c |-- gemm.c
|
41
|
+
|-- shared |-- gemver.c
|
42
|
+
| |-- example1.c |-- gesummv.c
|
43
|
+
| |-- example2.c |-- jacobi-1d-imper.c
|
44
|
+
| |-- example3.c |-- jacobi-2d-imper.c
|
45
|
+
| |-- example4.c |-- lu.c
|
46
|
+
| \-- example5.c |-- ludcmp.c
|
47
|
+
|-- chunk |-- mvt.c
|
48
|
+
| |-- example1.c |-- reg_detect.c
|
49
|
+
| |-- example2.c |-- saxpy.c
|
50
|
+
| |-- example3.c |-- seidel-2d.c
|
51
|
+
| |-- example4.c |-- syr2k.c
|
52
|
+
| \-- example5.c |-- syrk.c
|
53
|
+
|-- trisolv.c
|
54
|
+
\-- trmm.c
|
55
|
+
|
56
|
+
All examples can be run through Bones for a specific target using an automated Rake task. Executing '<tt>rake examples:generate</tt>' or simply '+rake+' will execute Bones for all examples for a given target. The target can be changed in the '+Rakefile+' found in the root directory of Bones.
|
57
|
+
|
58
|
+
== Limitations
|
59
|
+
Bones takes C99 source code as input. However, several coding styles are unsupported as of now or might yield worse performance compared to others. The numerous examples provided should give the user an idea of the possibilities and limitations of the tool. A complete list of coding guidelines and limitations will follow in the future. Currently, an initial list of major limitations and guidelines is given below. In this list, we use '+algorithm+' to denote an algorithm captured by an algorithmic species.
|
60
|
+
* If the algorithm works on a N-dimensional data structure, use N-dimensional arrays (don't flatten it yourself, e.g. use '<tt>example[i][j]</tt>' instead of '<tt>example[i+j*A]</tt>') and specify an N-dimensional algorithmic species.
|
61
|
+
* Write your while-loops as for-loops if possible. For-loops should have a unit increment, other loops (e.g. decrementing loops) must be re-written.
|
62
|
+
* Loops can have affine bounds containing constants, defines and variables. Variables should not include loop variables of loops that are part of the '+algorithm+'.
|
63
|
+
* Function calls are not allowed within the '+algorithm+'. Some mathematical functions are allowed.
|
64
|
+
* Variables are allowed in the definition of an algorithmic species. If they are used, they should also be used somewhere in the body of the '+algorithm+'.
|
65
|
+
* Bones is designed to work on a single input file with at least a function called 'main'. If your (to-be-accelerated) code spans multiple C-files, Bones could either be applied multiple times, or the code could be merged into a single file.
|
66
|
+
|
67
|
+
|
68
|
+
|
69
|
+
= Installation procedure
|
70
|
+
Installation of Bones is a simple matter of extracting the Bones package to a directory of your choice or installing the gem ('<tt>gem install bones-compiler</tt>'). However, there are a number of prerequisites.
|
71
|
+
|
72
|
+
== Prerequisites
|
73
|
+
Bones requires the installation of Ruby, the Rubygems gem package manager and two gems:
|
74
|
+
1. Any version of *Ruby* *1.8* or *1.9*. Information on Ruby is found at http://www.ruby-lang.org
|
75
|
+
* [OS X]: Ruby is pre-installed on any OS X system since Tiger (10.4).
|
76
|
+
* [Linux]: Ruby is pre-installed on some Linux based systems. Most Linux package managers (yum, apt-get) will be able to provide a Ruby installation. Make sure that the ruby development package ('+ruby-devel+') is also installed, as it is required by one of the gems.
|
77
|
+
* [Windows]: Ruby for Windows can be obtained from http://rubyinstaller.org/
|
78
|
+
2. The *Rubygems* gem package manager. Information on Rubygems can be found at http://rubygems.org
|
79
|
+
* [OS X]: Rubygems is pre-installed on any OS X system since Tiger (10.4).
|
80
|
+
* [Linux]: Most Linux package managers will be able to provide a Rubygems installation by installing the package '+rubygems+'.
|
81
|
+
* [Windows]: Rubygems for Windows is obtained automatically when installing from http://rubyinstaller.org/
|
82
|
+
3. Bones requires two gems, *trollop* and *cast*. Both gems can be installed by calling Rubygems from the command line, i.e.: '<tt>gem install trollop cast</tt>'.
|
83
|
+
|
84
|
+
For example, all prerequisites can be installed as follows on a Fedora, Red-Hat or CentOS system:
|
85
|
+
yum install ruby ruby-devel rubygems
|
86
|
+
gem install trollop cast
|
87
|
+
For an Ubuntu, Debian or Mint system, the equivalent commands are:
|
88
|
+
apt-get install ruby ruby-dev rubygems
|
89
|
+
gem install trollop cast
|
90
|
+
|
91
|
+
== Installing Bones
|
92
|
+
To install the compiler, simply extract the '<tt>bones\_x.x.tar.gz</tt>' package to a directory of your choice. The Bones executable is found in the '+bin+' subdirectory. Adding the path to the '+bin+' directory to your '+PATH+' environment variable will make Bones available from any directory on your machine. Starting at version 1.1, Bones is also available as a gem ('<tt>gem install bones-compiler</tt>').
|
93
|
+
|
94
|
+
|
95
|
+
= Documentation
|
96
|
+
There are two ways to obtain more information regarding Bones. To obtain more information about the compiler itself, the ideas behind it and the algorithm classification, it is a good idea to read the scientific publications. To get more information about the code structure, HTML documentation can be generated automatically using RDoc.
|
97
|
+
|
98
|
+
== Code documentation
|
99
|
+
Code documentation can be generated automatically using RDoc. Navigate to the installation root of Bones and use Rake to generate documentation: '<tt>rake rdoc</tt>'. More information on using Rake is provided later in this document. Next, open '<tt>rdoc/index.html</tt>' to navigate through the documentation. The same documentation is also available on the web at http://parse.ele.tue.nl/tools/bones/rdoc/.
|
100
|
+
|
101
|
+
== Scientific publications
|
102
|
+
Scientific publications related to Bones can be obtained from http://parse.ele.tue.nl/publications. Two publications are relevant:
|
103
|
+
1. <b>A Modular and Parameterisable Classification of Algorithms</b>, which provides details on the used algorithm classification. When referring to the algorithm classification in scientific work, you are kindly asked to include the following citation:
|
104
|
+
|
105
|
+
@TECHREPORT{Nugteren2011,
|
106
|
+
author = {Cedric Nugteren and Henk Corporaal},
|
107
|
+
title = {{A Modular and Parameterisable Classification of Algorithms}},
|
108
|
+
institution = {Eindhoven University of Technology},
|
109
|
+
year = {2011},
|
110
|
+
number = {No. ESR-2011-02},
|
111
|
+
}
|
112
|
+
2. <b>Introducing 'Bones': A Parallelizing Source-to-Source Compiler Based on Algorithmic Skeletons</b>, which introduces the Bones source-to-source compiler. When referring to Bones in scientific work, you are kindly asked to include the following citation:
|
113
|
+
|
114
|
+
@INPROCEEDINGS{Nugteren2012,
|
115
|
+
author = {Cedric Nugteren and Henk Corporaal},
|
116
|
+
title = {{Introducing `Bones': A Parallelizing Source-to-Source Compiler
|
117
|
+
Based on Algorithmic Skeletons}},
|
118
|
+
booktitle = {{GPGPU-5: 5th Workshop on General Purpose Processing on
|
119
|
+
Graphics Processing Units}},
|
120
|
+
year = {2012},
|
121
|
+
}
|
122
|
+
|
123
|
+
|
124
|
+
|
125
|
+
= Questions
|
126
|
+
Questions can be directed by email. You can find contact details on the personal page of the author at http://parse.ele.tue.nl/cnugteren or on the project page at github.
|
data/Rakefile
ADDED
@@ -0,0 +1,107 @@
|
|
1
|
+
require 'rake/testtask'
|
2
|
+
require 'rdoc/task'
|
3
|
+
require 'rake/clean'
|
4
|
+
|
5
|
+
# Set the location of the examples
|
6
|
+
EXAMPLES = File.join('examples','benchmarks','*.c')
|
7
|
+
|
8
|
+
# Set the clean/clobber tasks
|
9
|
+
CLOBBER.include(Dir[File.join('examples','*','*_*-*')])
|
10
|
+
|
11
|
+
# Pick a target from a list of possible targets
|
12
|
+
# 0 1 2 3 4 5
|
13
|
+
TARGETS = ['GPU-CUDA','GPU-OPENCL-AMD','CPU-OPENCL-INTEL','CPU-OPENCL-AMD','CPU-OPENMP','CPU-C']
|
14
|
+
TARGET = TARGETS[0]
|
15
|
+
|
16
|
+
# Settings for Bones
|
17
|
+
MEASUREMENTS = true
|
18
|
+
VERIFICATION = true
|
19
|
+
|
20
|
+
# Small helper function to display text on screen
|
21
|
+
def display(text)
|
22
|
+
print '[Rake] ### '+text+': '
|
23
|
+
p
|
24
|
+
end
|
25
|
+
|
26
|
+
# Set the default task
|
27
|
+
task :default => [:examples]
|
28
|
+
|
29
|
+
# Rake tasks related to the examples
|
30
|
+
namespace :examples do
|
31
|
+
|
32
|
+
# Task to process and test everything (generating code, compiling code, executing)
|
33
|
+
desc 'Run all examples through Bones, compile them, and execute them'
|
34
|
+
task :verify, [:file] => [:generate, :compile, :execute] do |t, args|
|
35
|
+
end
|
36
|
+
|
37
|
+
# Task to pass examples through Bones
|
38
|
+
desc 'Generate target code using Bones'
|
39
|
+
task :generate, :file do |t, args|
|
40
|
+
args.with_defaults(:file => EXAMPLES)
|
41
|
+
Dir[args.file].sort.each do |file|
|
42
|
+
display('Generating')
|
43
|
+
options = (MEASUREMENTS ? '-m ' : '') + (VERIFICATION ? '-c ' : '')
|
44
|
+
sh "bin/bones -a #{file} -t #{TARGET} #{options}"
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
# Task to compile the generated code for the examples (NOTE: this task is a stub)
|
49
|
+
desc 'Compile all examples (using gcc/nvcc)'
|
50
|
+
task :compile, :file do |t, args|
|
51
|
+
args.with_defaults(:file => EXAMPLES)
|
52
|
+
Dir[args.file].sort.each do |file|
|
53
|
+
compile(file,TARGET)
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
# Task to execute the compiled code for the examples (NOTE: this task is a stub)
|
58
|
+
desc 'Execute all examples'
|
59
|
+
task :execute, :file do |t, args|
|
60
|
+
args.with_defaults(:file => EXAMPLES)
|
61
|
+
Dir[args.file].sort.each do |file|
|
62
|
+
execute(file,TARGET)
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
# Helper function to compile code
|
67
|
+
#def compile(file,target)
|
68
|
+
# (system-specific, to be filled in by the user)
|
69
|
+
#end
|
70
|
+
|
71
|
+
# Helper function to execute code
|
72
|
+
#def execute(file,target)
|
73
|
+
# (system-specific, to be filled in by the user)
|
74
|
+
#end
|
75
|
+
|
76
|
+
end
|
77
|
+
task :examples => ['examples:generate']
|
78
|
+
|
79
|
+
# Task which adds a new target to the skeleton library based on an existing target
|
80
|
+
desc 'Adds a new target to the skeleton library'
# Copies the skeleton directory of an existing target (:base, default
# 'CPU-OPENMP') to a new directory named after :name (default 'NEW-TARGET').
# Nothing is copied when the base is missing or the new target already exists.
task :add_target, :name, :base do |t, args|
  args.with_defaults(:name => 'NEW-TARGET', :base => 'CPU-OPENMP')
  base = File.join('skeletons', args.base)
  name = File.join('skeletons', args.name)
  # File.exists? was removed in Ruby 3.2; File.exist? is the supported spelling
  if File.exist?(base) && !File.exist?(name)
    # Argument-list form of sh: the paths are passed verbatim instead of being
    # re-parsed by a shell, so spaces or metacharacters in a name are harmless
    sh 'cp', '-r', base, name
  else
    puts '[Rake] ### Error adding new target'
  end
end
|
91
|
+
|
92
|
+
# Test individual parts of the code
|
93
|
+
Rake::TestTask.new do |test|
|
94
|
+
test.test_files = FileList[File.join('test','*','test_*.rb')]
|
95
|
+
test.verbose = false
|
96
|
+
end
|
97
|
+
|
98
|
+
# Generate HTML documentation using RDoc
|
99
|
+
RDoc::Task.new do |rdoc|
|
100
|
+
rdoc.title = 'Bones'
|
101
|
+
rdoc.options << '--line-numbers'
|
102
|
+
rdoc.rdoc_files.include(File.join('lib','**','*.rb'))
|
103
|
+
rdoc.rdoc_files.include('README.rdoc')
|
104
|
+
rdoc.rdoc_dir = 'rdoc'
|
105
|
+
rdoc.main = 'README.rdoc'
|
106
|
+
end
|
107
|
+
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
1.1.0
|
data/bin/bones
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# Set the path for the libraries and the installation directory
|
4
|
+
BONES_DIR = File.dirname(__FILE__) + '/../'
|
5
|
+
lib_dir = File.join(BONES_DIR,'lib')
|
6
|
+
$LOAD_PATH.unshift lib_dir unless $LOAD_PATH.include?(lib_dir)
|
7
|
+
|
8
|
+
# Enable verbose output
|
9
|
+
VERBOSE = true
|
10
|
+
|
11
|
+
# Load the libraries
|
12
|
+
require 'castaddon.rb'
|
13
|
+
require 'bones.rb'
|
14
|
+
|
15
|
+
# Run the main function
|
16
|
+
bones = Bones::Engine.new
|
17
|
+
bones.process
|
18
|
+
bones.write_output
|
19
|
+
|
20
|
+
|
@@ -0,0 +1,552 @@
|
|
1
|
+
//
|
2
|
+
// This file is part of the Bones source-to-source compiler examples. This C-code
|
3
|
+
// demonstrates the use of Bones for an example (real) application: 'Fast Focus
|
4
|
+
// on Structures' (FFOS). For more information on the application or on Bones
|
5
|
+
// please use the contact information below.
|
6
|
+
//
|
7
|
+
// == More information on the FFOS application
|
8
|
+
// Contact............Yifan He / Zhenyu Ye
|
9
|
+
// Web address........http://zhenyu-ye.net/publications/acivs2011/yifan2011acivs.pdf
|
10
|
+
//
|
11
|
+
// == More information on Bones
|
12
|
+
// Contact............Cedric Nugteren <c.nugteren@tue.nl>
|
13
|
+
// Web address........http://parse.ele.tue.nl/bones/
|
14
|
+
//
|
15
|
+
// == File information
|
16
|
+
// Filename...........applications/ffos.c
|
17
|
+
// Author.............Cedric Nugteren
|
18
|
+
// Last modified on...22-May-2012
|
19
|
+
//
|
20
|
+
|
21
|
+
//########################################################################
|
22
|
+
//### Includes
|
23
|
+
//########################################################################
|
24
|
+
|
25
|
+
#include <stdio.h>
|
26
|
+
#include <stdlib.h>
|
27
|
+
#include <math.h>
|
28
|
+
#include <time.h>
|
29
|
+
|
30
|
+
//########################################################################
|
31
|
+
//### Defines
|
32
|
+
//########################################################################
|
33
|
+
|
34
|
+
#define XVECTORS 10
|
35
|
+
#define YVECTORS 10
|
36
|
+
|
37
|
+
//########################################################################
|
38
|
+
//### Forward declarations
|
39
|
+
//########################################################################
|
40
|
+
|
41
|
+
void SaveBMPFile(unsigned char ** image, const char * outputdestination, int width, int height);
|
42
|
+
unsigned char ** LoadBMPFile(int *width, int *height);
|
43
|
+
void CPU_FindCenters(int* vector, int *coordinates, int size);
|
44
|
+
void CPU_Visualize(unsigned char** image0, int* Xcoordinates, int* Ycoordinates, unsigned char **image3, int width, int height);
|
45
|
+
void CPU_BCV(int *histogram, float *BCVtable, int size);
|
46
|
+
|
47
|
+
//########################################################################
|
48
|
+
//### Global variables
|
49
|
+
//########################################################################
|
50
|
+
|
51
|
+
int messages = 2;
|
52
|
+
|
53
|
+
//########################################################################
|
54
|
+
//### Start of the main function
|
55
|
+
//########################################################################
|
56
|
+
|
57
|
+
int main(void) {
|
58
|
+
|
59
|
+
// Declare loop variables
|
60
|
+
int i,h,w,a;
|
61
|
+
|
62
|
+
// Set other variables
|
63
|
+
int threshold = 0;
|
64
|
+
int hist[256];
|
65
|
+
for (i=0;i<256;i++) { hist[i] = 0; }
|
66
|
+
float * BCVtable = (float *)malloc(256*sizeof(float));
|
67
|
+
|
68
|
+
// Loading image0 from disk
|
69
|
+
if (messages == 2) { printf("### Loading image0 from disk.\n"); }
|
70
|
+
int width = 0;
|
71
|
+
int height = 0;
|
72
|
+
unsigned char ** image0 = LoadBMPFile(&width, &height);
|
73
|
+
|
74
|
+
// Create space for image1
|
75
|
+
if (messages == 2) { printf("### Allocating space for image1.\n"); }
|
76
|
+
unsigned char ** image1 = (unsigned char **)malloc(width*sizeof(*image1));
|
77
|
+
unsigned char * image1_1D = (unsigned char *)malloc(width*height*sizeof(unsigned char));
|
78
|
+
for(i=0;i<width;i++) { image1[i] = &image1_1D[i*height]; }
|
79
|
+
|
80
|
+
// Create space for image2
|
81
|
+
if (messages == 2) { printf("### Allocating space for image2.\n"); }
|
82
|
+
unsigned char ** image2 = (unsigned char **)malloc(width*sizeof(*image2));
|
83
|
+
unsigned char * image2_1D = (unsigned char *)malloc(width*height*sizeof(unsigned char));
|
84
|
+
for(i=0;i<width;i++) { image2[i] = &image2_1D[i*height]; }
|
85
|
+
|
86
|
+
// Create space for image3
|
87
|
+
if (messages == 2) { printf("### Allocating space for image3.\n"); }
|
88
|
+
unsigned char ** image3 = (unsigned char **)malloc(width*sizeof(*image3));
|
89
|
+
unsigned char * image3_1D = (unsigned char *)malloc(width*height*sizeof(unsigned char));
|
90
|
+
for(i=0;i<width;i++) { image3[i] = &image3_1D[i*height]; }
|
91
|
+
|
92
|
+
// Create space for projection vectors
|
93
|
+
if (messages == 2) { printf("### Allocating space for projection vectors.\n"); fflush(stdout); }
|
94
|
+
int * Xvector = (int *)malloc(width*sizeof(int));
|
95
|
+
int * Yvector = (int *)malloc(height*sizeof(int));
|
96
|
+
|
97
|
+
// Create coordinate arrays
|
98
|
+
if (messages == 2) { printf("### Allocating space for coordinate arrays.\n"); fflush(stdout); }
|
99
|
+
int Xcoordinates[XVECTORS]; for(i=0;i<XVECTORS;i++) { Xcoordinates[i] = 0; }
|
100
|
+
int Ycoordinates[YVECTORS]; for(i=0;i<YVECTORS;i++) { Ycoordinates[i] = 0; }
|
101
|
+
|
102
|
+
//########################################################################
|
103
|
+
//### PART1: Histogramming (accelerated)
|
104
|
+
//########################################################################
|
105
|
+
if (messages >= 1) { printf("### PART1: Histogramming.\n"); fflush(stdout); }
|
106
|
+
|
107
|
+
#pragma species kernel 0:height-1,0:width-1|element -> 0:255|shared
|
108
|
+
for (h=0;h<height;h++) {
|
109
|
+
for (w=0;w<width;w++) {
|
110
|
+
hist[image0[h][w]] = hist[image0[h][w]] + 1;
|
111
|
+
}
|
112
|
+
}
|
113
|
+
#pragma species endkernel histogram
|
114
|
+
|
115
|
+
//########################################################################
|
116
|
+
//### Between class variance (CPU)
|
117
|
+
//########################################################################
|
118
|
+
if (messages == 2) { printf("### Create a between class variance table.\n"); fflush(stdout); }
|
119
|
+
CPU_BCV(hist, BCVtable, width*height);
|
120
|
+
|
121
|
+
//########################################################################
|
122
|
+
//### PART2: Search for the maximum (accelerated)
|
123
|
+
//########################################################################
|
124
|
+
if (messages >= 1) { printf("### PART2: Search for the maximum value.\n"); fflush(stdout); }
|
125
|
+
float maximum[1];
|
126
|
+
maximum[0] = 10;
|
127
|
+
int length = 256;
|
128
|
+
|
129
|
+
//#pragma species kernel 0:255|element -> 0:0|shared
|
130
|
+
for (i=0;i<length;i++) {
|
131
|
+
maximum[0] = (BCVtable[i] > maximum[0]) ? BCVtable[i] : maximum[0];
|
132
|
+
}
|
133
|
+
//#pragma species endkernel maximum_1
|
134
|
+
|
135
|
+
if (messages == 2) { printf("### Maximum is %.3lf.\n",maximum[0]); fflush(stdout); }
|
136
|
+
|
137
|
+
//########################################################################
|
138
|
+
//### PART3: Search for the maximum - larger synthetic example (accelerated)
|
139
|
+
//########################################################################
|
140
|
+
if (messages >= 1) { printf("### PART3: Search for the maximum value (synthetic example).\n"); fflush(stdout); }
|
141
|
+
int vector_size = 2097152; // 2048x1024
|
142
|
+
float* synthetic_vector = (float*)malloc(sizeof(float)*vector_size);
|
143
|
+
srand(time(NULL));for (i=0;i<vector_size;i++) { synthetic_vector[i] = (rand() % 7777777) / 1000.0; }
|
144
|
+
float result[1];
|
145
|
+
result[0] = 0;
|
146
|
+
|
147
|
+
//#pragma species kernel 0:2097151|element -> 0:0|shared
|
148
|
+
for (i=0;i<vector_size;i++) {
|
149
|
+
result[0] = (synthetic_vector[i] > result[0]) ? synthetic_vector[i] : result[0];
|
150
|
+
}
|
151
|
+
//#pragma species endkernel maximum_2
|
152
|
+
|
153
|
+
if (messages == 2) { printf("### Maximum is %.3lf.\n",result[0]); fflush(stdout); }
|
154
|
+
|
155
|
+
//########################################################################
|
156
|
+
//### Search for the index of the maximum (CPU)
|
157
|
+
//########################################################################
|
158
|
+
if (messages == 2) { printf("### Search for the index of the maximum value.\n"); fflush(stdout); }
|
159
|
+
for (i=0;i<256;i++) {
|
160
|
+
if (BCVtable[i] == maximum[0]) {
|
161
|
+
threshold = i;
|
162
|
+
break;
|
163
|
+
}
|
164
|
+
}
|
165
|
+
|
166
|
+
//########################################################################
|
167
|
+
//### PART4: Binarization (accelerated)
|
168
|
+
//########################################################################
|
169
|
+
if (messages >= 1) { printf("### PART4: Binarization with treshold at %d.\n",threshold); fflush(stdout); }
|
170
|
+
|
171
|
+
#pragma species kernel 0:height-1,0:width-1|element -> 0:height-1,0:width-1|element
|
172
|
+
for (h=0;h<height;h++) {
|
173
|
+
for (w=0;w<width;w++) {
|
174
|
+
if (image0[h][w] > threshold) { image1[h][w] = 1; }
|
175
|
+
else { image1[h][w] = 0; }
|
176
|
+
}
|
177
|
+
}
|
178
|
+
#pragma species endkernel threshold
|
179
|
+
|
180
|
+
//########################################################################
|
181
|
+
//### PART5: Erosion 7x7 (accelerated)
|
182
|
+
//########################################################################
|
183
|
+
if (messages >= 1) { printf("### PART5: Perform the erode kernel.\n"); fflush(stdout); }
|
184
|
+
|
185
|
+
int condition;
|
186
|
+
#pragma species kernel 7:height-8,7:width-8|neighbourhood(-3:3,-3:3) -> 0:height-1,0:width-1|element
|
187
|
+
for (h=0;h<height;h++) {
|
188
|
+
for (w=0;w<width;w++) {
|
189
|
+
if (w >= 7 && h >= 7 && w <= width-7 && h <= height-7) {
|
190
|
+
condition = 1;
|
191
|
+
for(a=-3;a<=3;a++) {
|
192
|
+
condition = condition
|
193
|
+
* image1[(h-3)][(w+a)]
|
194
|
+
* image1[(h-2)][(w+a)]
|
195
|
+
* image1[(h-1)][(w+a)]
|
196
|
+
* image1[(h+0)][(w+a)]
|
197
|
+
* image1[(h+1)][(w+a)]
|
198
|
+
* image1[(h+2)][(w+a)]
|
199
|
+
* image1[(h+3)][(w+a)]
|
200
|
+
;
|
201
|
+
}
|
202
|
+
if (condition == 1) { image2[h][w] = 255; }
|
203
|
+
else { image2[h][w] = 0; }
|
204
|
+
}
|
205
|
+
else {
|
206
|
+
image2[h][w] = 0;
|
207
|
+
}
|
208
|
+
}
|
209
|
+
}
|
210
|
+
#pragma species endkernel erosion
|
211
|
+
|
212
|
+
//########################################################################
|
213
|
+
//### PART6: 1D erosion(7) synthetic example (accelerated)
|
214
|
+
//########################################################################
|
215
|
+
if (messages >= 1) { printf("### PART6: Perform the erode kernel (1D - synthetic).\n"); fflush(stdout); }
|
216
|
+
int vector_size2 = 2097152; // 2048x1024
|
217
|
+
int* vector2a = (int*)malloc(sizeof(int)*vector_size2);
|
218
|
+
int* vector2b = (int*)malloc(sizeof(int)*vector_size2);
|
219
|
+
srand(time(NULL));
|
220
|
+
for (i=0;i<vector_size2;i++) {
|
221
|
+
if (rand()%15 > 1) { vector2a[i] = 1; }
|
222
|
+
else { vector2a[i] = 0; }
|
223
|
+
}
|
224
|
+
|
225
|
+
//#pragma species kernel 0:2097151|neighbourhood(-3:3) -> 0:2097151|element
|
226
|
+
for (i=0;i<vector_size2;i++) {
|
227
|
+
if (i >= 7 && i <= vector_size2-7) {
|
228
|
+
condition = 1;
|
229
|
+
for(a=-3;a<=3;a++) {
|
230
|
+
condition = condition * vector2a[i+a];
|
231
|
+
}
|
232
|
+
if (condition == 1) { vector2b[i] = 255; }
|
233
|
+
else { vector2b[i] = 0; }
|
234
|
+
}
|
235
|
+
else {
|
236
|
+
vector2b[i] = 0;
|
237
|
+
}
|
238
|
+
}
|
239
|
+
//#pragma species endkernel erosion1d
|
240
|
+
|
241
|
+
// Compute a gold reference
|
242
|
+
int gold = 0;
|
243
|
+
int gold_condition = 1;
|
244
|
+
for(a=-3;a<=3;a++) { gold_condition = gold_condition * vector2a[10+a]; }
|
245
|
+
if (gold_condition == 1) { gold = 255; }
|
246
|
+
if (messages == 2) { printf("### Result at index 10 is %d and should be %d.\n",vector2b[10],gold); fflush(stdout); }
|
247
|
+
|
248
|
+
//########################################################################
|
249
|
+
//### PART7: Y-projection (accelerated)
|
250
|
+
//########################################################################
|
251
|
+
if (messages >= 1) { printf("### PART7: Starting the Y-projection algorithm.\n"); fflush(stdout); }
|
252
|
+
|
253
|
+
int result_yp;
|
254
|
+
#pragma species kernel 0:height-1,0:width-1|chunk(0:height-1,0:0) -> 0:width-1|element
|
255
|
+
for (w=0;w<width;w++) {
|
256
|
+
result_yp = 0;
|
257
|
+
for (h=0;h<height;h++) {
|
258
|
+
if (image2[h][w] == 255) {
|
259
|
+
result_yp = 255;
|
260
|
+
}
|
261
|
+
}
|
262
|
+
Yvector[w] = result_yp;
|
263
|
+
}
|
264
|
+
#pragma species endkernel y_projection
|
265
|
+
|
266
|
+
//########################################################################
|
267
|
+
//### PART8: X-projection (accelerated)
|
268
|
+
//########################################################################
|
269
|
+
if (messages >= 1) { printf("### PART8: Starting the X-projection algorithm.\n"); fflush(stdout); }
|
270
|
+
|
271
|
+
int result_xp;
|
272
|
+
#pragma species kernel 0:height-1,0:width-1|chunk(0:0,0:width-1) -> 0:height-1|element
|
273
|
+
for (h=0;h<height;h++) {
|
274
|
+
result_xp = 0;
|
275
|
+
for (w=0;w<width;w++) {
|
276
|
+
if (image2[h][w] == 255) {
|
277
|
+
result_xp = 255;
|
278
|
+
}
|
279
|
+
}
|
280
|
+
Xvector[h] = result_xp;
|
281
|
+
}
|
282
|
+
#pragma species endkernel x_projection
|
283
|
+
|
284
|
+
//########################################################################
|
285
|
+
//### Search for the centers of the projection vectors (CPU)
|
286
|
+
//########################################################################
|
287
|
+
if (messages == 2) { printf("### Search for X- and Y-projection vectors.\n"); fflush(stdout); }
|
288
|
+
CPU_FindCenters(Xvector, Xcoordinates, width);
|
289
|
+
CPU_FindCenters(Yvector, Ycoordinates, height);
|
290
|
+
|
291
|
+
//########################################################################
|
292
|
+
//### Visualize, save to disk and finalize the program
|
293
|
+
//########################################################################
|
294
|
+
CPU_Visualize(image0, Xcoordinates, Ycoordinates, image3, width, height);
|
295
|
+
SaveBMPFile(image1, "output1.bmp", width, height);
|
296
|
+
SaveBMPFile(image2, "output2.bmp", width, height);
|
297
|
+
SaveBMPFile(image3, "output3.bmp", width, height);
|
298
|
+
free(image0);
|
299
|
+
free(image1);
|
300
|
+
free(image1_1D);
|
301
|
+
free(image2);
|
302
|
+
free(image2_1D);
|
303
|
+
free(image3);
|
304
|
+
free(image3_1D);
|
305
|
+
free(Xvector);
|
306
|
+
free(Yvector);
|
307
|
+
free(BCVtable);
|
308
|
+
if (messages == 2) { printf("### End of program\n"); fflush(stdout); }
|
309
|
+
return 0;
|
310
|
+
}
|
311
|
+
|
312
|
+
//########################################################################
|
313
|
+
//### Structures used in the BMP functions
|
314
|
+
//########################################################################
|
315
|
+
|
316
|
+
typedef struct {
|
317
|
+
int size;
|
318
|
+
int reserved;
|
319
|
+
int offset;
|
320
|
+
} BMPHeader;
|
321
|
+
typedef struct {
|
322
|
+
int size;
|
323
|
+
int width;
|
324
|
+
int height;
|
325
|
+
int planesBitsPerPixel;
|
326
|
+
int compression;
|
327
|
+
int imageSize;
|
328
|
+
int xPelsPerMeter;
|
329
|
+
int yPelsPerMeter;
|
330
|
+
int clrUsed;
|
331
|
+
int clrImportant;
|
332
|
+
} BMPInfoHeader;
|
333
|
+
|
334
|
+
//########################################################################
|
335
|
+
//### Function to save BMP data to a file
|
336
|
+
//########################################################################
|
337
|
+
|
338
|
+
// Stores the low 32 bits of 'value' at 'dst' in little-endian byte order
static void bmp_store_le32(unsigned char *dst, unsigned long value) {
  dst[0] = (unsigned char)(value & 0xFF);
  dst[1] = (unsigned char)((value >> 8) & 0xFF);
  dst[2] = (unsigned char)((value >> 16) & 0xFF);
  dst[3] = (unsigned char)((value >> 24) & 0xFF);
}

// Writes 'image' to 'outputdestination' as an uncompressed 24-bit BMP file.
//   image.............. 'width' pointers, each to 'height' bytes (indexed image[x][y])
//   outputdestination.. path of the file to create or overwrite
//   width, height...... image dimensions in pixels
// Every source byte is written three times (one per colour channel), so the
// result is a grey-scale picture. On failure a message is printed and the
// function returns without terminating the program.
void SaveBMPFile(unsigned char ** image, const char * outputdestination, int width, int height)
{
  // Variable declarations
  int x,y,j;
  FILE *fd_out;
  unsigned char header[54] = {0};

  // Reject arguments that cannot describe an image
  if (image == NULL || outputdestination == NULL || width < 0 || height < 0) {
    printf("***BMP save error: invalid arguments***\n");
    return;
  }

  // Each row holds 3 bytes per pixel and is padded to a multiple of 4 bytes
  int padding = (4 - (width*3) % 4) % 4;
  unsigned long row_bytes = (unsigned long)width*3 + (unsigned long)padding;
  unsigned long file_size = 54 + row_bytes*(unsigned long)height;

  // Open the output file
  fd_out = fopen(outputdestination, "wb");
  if (fd_out == NULL) {
    printf("***BMP save error: cannot open output file***\n");
    return;
  }

  // Build the 14-byte file header followed by the 40-byte info header
  header[0] = 'B';
  header[1] = 'M';
  bmp_store_le32(&header[2], file_size);              // total file size, padding included
  bmp_store_le32(&header[10], 54);                    // offset of the pixel data
  bmp_store_le32(&header[14], 40);                    // size of the info header
  bmp_store_le32(&header[18], (unsigned long)width);  // full 32-bit width
  bmp_store_le32(&header[22], (unsigned long)height); // full 32-bit height
  header[26] = 1;                                     // number of planes
  header[28] = 24;                                    // bits per pixel
  int failed = (fwrite(header, 1, sizeof(header), fd_out) != sizeof(header));

  // Save RGB data to output file
  for(y=0;y<height && !failed;y++) {
    for(x=0;x<width;x++) {
      int value = image[x][y];
      fputc(value,fd_out);
      fputc(value,fd_out);
      fputc(value,fd_out);
    }
    for(j=0;j<padding;j++) {
      fputc(0,fd_out);
    }
    failed = ferror(fd_out);
  }

  // Clean up (fclose flushes, so its result matters for a written file)
  if (fclose(fd_out) != 0 || failed) {
    printf("***BMP save error: write failed***\n");
  }
}
|
379
|
+
|
380
|
+
//########################################################################
|
381
|
+
//### Function to load BMP data from disk
|
382
|
+
//########################################################################
|
383
|
+
|
384
|
+
unsigned char ** LoadBMPFile(int *width, int *height)
|
385
|
+
{
|
386
|
+
// Variable declarations
|
387
|
+
short type;
|
388
|
+
int temp;
|
389
|
+
BMPHeader hdr;
|
390
|
+
BMPInfoHeader infoHdr;
|
391
|
+
FILE *fd;
|
392
|
+
int i, y, x;
|
393
|
+
|
394
|
+
// Open the file stream
|
395
|
+
fd = fopen("../../../input.bmp","rb");
|
396
|
+
|
397
|
+
// Open the file and scan the contents
|
398
|
+
if(!(fd)) { printf("***BMP load error: file access denied***\n"); exit(0); }
|
399
|
+
temp = fread(&type, sizeof(short), 1, fd);
|
400
|
+
temp = fread(&hdr, sizeof(hdr), 1, fd);
|
401
|
+
if(type != 0x4D42) { printf("***BMP load error: bad file format***\n"); exit(0); }
|
402
|
+
temp = fread(&infoHdr, sizeof(infoHdr), 1, fd);
|
403
|
+
if((infoHdr.planesBitsPerPixel>>16) != 24) { printf("***BMP load error: invalid color depth (%d)*** \n",(infoHdr.planesBitsPerPixel>>16)); exit(0); }
|
404
|
+
if(infoHdr.compression) { printf("***BMP load error: compressed image***\n"); exit(0); }
|
405
|
+
(*width) = infoHdr.width;
|
406
|
+
(*height) = infoHdr.height;
|
407
|
+
|
408
|
+
// Allocate memory to store the BMP's contents
|
409
|
+
unsigned char ** image = (unsigned char **)malloc((*width) * sizeof(*image));
|
410
|
+
unsigned char * image_1D = (unsigned char *)malloc((*width) * (*height) * sizeof(unsigned char));
|
411
|
+
for(i=0; i<(*width); i++) {
|
412
|
+
image[i] = &image_1D[i*(*height)];
|
413
|
+
}
|
414
|
+
|
415
|
+
// Read the BMP file and store the contents
|
416
|
+
fseek(fd, hdr.offset - sizeof(hdr) - sizeof(infoHdr), SEEK_CUR);
|
417
|
+
for(y = 0; y < (*height); y++) {
|
418
|
+
for(x = 0; x < (*width); x++) {
|
419
|
+
image[x][y] = ((int)fgetc(fd));
|
420
|
+
fgetc(fd);
|
421
|
+
fgetc(fd);
|
422
|
+
}
|
423
|
+
int over = (4 - ((*width)*3) % 4) % 4;
|
424
|
+
if (over != 0) {
|
425
|
+
for(x = 0; x < over; x++) {
|
426
|
+
fgetc(fd);
|
427
|
+
}
|
428
|
+
}
|
429
|
+
}
|
430
|
+
|
431
|
+
// Exit the function and clean-up
|
432
|
+
if(ferror(fd)) {
|
433
|
+
printf("***Unknown BMP load error.***\n");
|
434
|
+
free(image[0]);
|
435
|
+
free(image);
|
436
|
+
exit(0);
|
437
|
+
}
|
438
|
+
fclose(fd);
|
439
|
+
return image;
|
440
|
+
}
|
441
|
+
|
442
|
+
//########################################################################
|
443
|
+
//### Find the center of a projection vector (using a state machine)
|
444
|
+
//########################################################################
|
445
|
+
// Scans 'vector' (length 'size') for runs that start with the value 255 and
// end at the next 0. For every run longer than 4 elements the index of its
// midpoint is appended to 'coordinates'. A run still open when the vector
// ends is not reported. The number of writes to 'coordinates' is not bounded.
void CPU_FindCenters(int* vector, int *coordinates, int size) {
  int inside_run = 0; // 1 while a run is open
  int run_length = 0; // elements seen in the current run
  int found = 0;      // centres stored so far
  int pos = 0;

  while (pos < size) {
    const int value = vector[pos];

    // A 255 seen outside a run opens a new one
    if (!inside_run && value == 255) {
      inside_run = 1;
      run_length = 0;
    }

    if (inside_run) {
      if (value != 0) {
        // Any non-zero value extends the run (including the opening 255)
        run_length++;
      }
      else {
        // A zero closes the run; short runs are treated as noise
        inside_run = 0;
        if (run_length > 4) {
          coordinates[found] = pos - (run_length / 2);
          found++;
        }
      }
    }
    pos++;
  }
}
|
471
|
+
|
472
|
+
//########################################################################
|
473
|
+
//### CPU kernel to visualize the results
|
474
|
+
//########################################################################
|
475
|
+
void CPU_Visualize(unsigned char** image0, int* Xcoordinates, int* Ycoordinates, unsigned char **image3, int width, int height) {
|
476
|
+
|
477
|
+
// Loop variables
|
478
|
+
int h, w, x, y;
|
479
|
+
|
480
|
+
// Copy the whole image
|
481
|
+
for (h=0;h<height;h++) {
|
482
|
+
for (w=0;w<width;w++) {
|
483
|
+
unsigned char value = image0[h][w];
|
484
|
+
image3[h][w] = value;
|
485
|
+
}
|
486
|
+
}
|
487
|
+
|
488
|
+
// Replace the centers with white pixels
|
489
|
+
for (x=0;x<XVECTORS;x++) {
|
490
|
+
for (y=0;y<YVECTORS;y++) {
|
491
|
+
image3[Xcoordinates[x]][Ycoordinates[y]] = 255;
|
492
|
+
}
|
493
|
+
}
|
494
|
+
}
|
495
|
+
|
496
|
+
//########################################################################
|
497
|
+
//### CPU kernel function for between class variance (BCV), part of Otsu thresholding
|
498
|
+
//########################################################################
|
499
|
+
// Fills BCVtable[0..255] with the between-class variance for every candidate
// threshold, as used by Otsu thresholding.
//   histogram.. 256 bin counts
//   BCVtable... output, 256 entries; entries that are not computed stay 0
//   size....... total number of pixels (expected to equal the sum of all bins)
void CPU_BCV(int *histogram, float *BCVtable, int size) {
  int i;

  // Initialize the BCV table to zero
  for (i=0;i<256;i++) {
    BCVtable[i] = 0;
  }

  // Pre-calculate the total of the weighted sums
  int wsumtotal = 0;
  for (i=0;i<256;i++) {
    wsumtotal = wsumtotal + i*histogram[i];
  }

  // Set the initial values: everything starts out as foreground
  int sumb = 0;
  int sumf = size;
  int wsumb = 0;
  int wsumf;

  // Iterate over all possible threshold values
  for (i=0;i<256;i++) {

    // Update the weighted sums
    // NOTE(review): wsumb already includes bin i here while sumb does not yet;
    // this ordering is kept exactly as in the original code.
    wsumb = wsumb + i*histogram[i];
    wsumf = wsumtotal - wsumb;

    // Stop if the sum of foreground is equal to zero (checked before dividing by it)
    if (sumf == 0) { break; }

    // With an empty background the entry keeps its initial zero, so the
    // division by sumb is only performed when it is well defined
    if (sumb != 0) {
      float wb = sumb / (float)size;
      float wf = sumf / (float)size;
      float meanb = wsumb / (float)sumb;
      float meanf = wsumf / (float)sumf;

      // Output the BCV value
      BCVtable[i] = wb*wf*(meanb-meanf)*(meanb-meanf);
    }

    // Update the sum of the background (all darker pixels compared to the current pixel) and foreground pixels (the rest)
    sumb = sumb + histogram[i];
    sumf = size - sumb;
  }
}
|
551
|
+
|
552
|
+
//########################################################################
|