bones-compiler 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +117 -0
- data/LICENSE +9 -0
- data/README.rdoc +126 -0
- data/Rakefile +107 -0
- data/VERSION +1 -0
- data/bin/bones +20 -0
- data/examples/applications/ffos.c +552 -0
- data/examples/benchmarks/2mm.c +70 -0
- data/examples/benchmarks/3mm.c +81 -0
- data/examples/benchmarks/adi.c +81 -0
- data/examples/benchmarks/atax.c +65 -0
- data/examples/benchmarks/bicg.c +67 -0
- data/examples/benchmarks/cholesky.c +64 -0
- data/examples/benchmarks/common.h +168 -0
- data/examples/benchmarks/correlation.c +97 -0
- data/examples/benchmarks/covariance.c +77 -0
- data/examples/benchmarks/doitgen.c +63 -0
- data/examples/benchmarks/durbin.c +76 -0
- data/examples/benchmarks/dynprog.c +67 -0
- data/examples/benchmarks/fdtd-2d-apml.c +114 -0
- data/examples/benchmarks/fdtd-2d.c +74 -0
- data/examples/benchmarks/floyd-warshall.c +50 -0
- data/examples/benchmarks/gemm.c +69 -0
- data/examples/benchmarks/gemver.c +89 -0
- data/examples/benchmarks/gesummv.c +64 -0
- data/examples/benchmarks/gramschmidt.c +84 -0
- data/examples/benchmarks/jacobi-1d-imper.c +55 -0
- data/examples/benchmarks/jacobi-2d-imper.c +61 -0
- data/examples/benchmarks/lu.c +57 -0
- data/examples/benchmarks/ludcmp.c +91 -0
- data/examples/benchmarks/mvt.c +65 -0
- data/examples/benchmarks/overview.txt +38 -0
- data/examples/benchmarks/reg_detect.c +82 -0
- data/examples/benchmarks/saxpy.c +45 -0
- data/examples/benchmarks/seidel-2d.c +51 -0
- data/examples/benchmarks/symm.c +74 -0
- data/examples/benchmarks/syr2k.c +65 -0
- data/examples/benchmarks/syrk.c +62 -0
- data/examples/benchmarks/trisolv.c +57 -0
- data/examples/benchmarks/trmm.c +57 -0
- data/examples/chunk/example1.c +54 -0
- data/examples/chunk/example2.c +44 -0
- data/examples/chunk/example3.c +59 -0
- data/examples/chunk/example4.c +55 -0
- data/examples/chunk/example5.c +52 -0
- data/examples/element/example1.c +46 -0
- data/examples/element/example10.c +50 -0
- data/examples/element/example11.c +47 -0
- data/examples/element/example12.c +56 -0
- data/examples/element/example2.c +46 -0
- data/examples/element/example3.c +58 -0
- data/examples/element/example4.c +49 -0
- data/examples/element/example5.c +56 -0
- data/examples/element/example6.c +46 -0
- data/examples/element/example7.c +54 -0
- data/examples/element/example8.c +45 -0
- data/examples/element/example9.c +48 -0
- data/examples/neighbourhood/example1.c +54 -0
- data/examples/neighbourhood/example2.c +55 -0
- data/examples/neighbourhood/example3.c +82 -0
- data/examples/neighbourhood/example4.c +52 -0
- data/examples/shared/example1.c +45 -0
- data/examples/shared/example2.c +51 -0
- data/examples/shared/example3.c +55 -0
- data/examples/shared/example4.c +52 -0
- data/examples/shared/example5.c +48 -0
- data/lib/bones.rb +266 -0
- data/lib/bones/algorithm.rb +541 -0
- data/lib/bones/engine.rb +386 -0
- data/lib/bones/preprocessor.rb +161 -0
- data/lib/bones/species.rb +196 -0
- data/lib/bones/structure.rb +94 -0
- data/lib/bones/variable.rb +169 -0
- data/lib/bones/variablelist.rb +72 -0
- data/lib/castaddon.rb +27 -0
- data/lib/castaddon/index.rb +40 -0
- data/lib/castaddon/node.rb +753 -0
- data/lib/castaddon/type.rb +37 -0
- data/skeletons/CPU-C/common/epilogue.c +0 -0
- data/skeletons/CPU-C/common/globals.c +17 -0
- data/skeletons/CPU-C/common/globals_kernel.c +1 -0
- data/skeletons/CPU-C/common/header.c +0 -0
- data/skeletons/CPU-C/common/mem_copy_D2H.c +0 -0
- data/skeletons/CPU-C/common/mem_copy_H2D.c +0 -0
- data/skeletons/CPU-C/common/mem_epilogue.c +0 -0
- data/skeletons/CPU-C/common/mem_prologue.c +3 -0
- data/skeletons/CPU-C/common/prologue.c +0 -0
- data/skeletons/CPU-C/common/timer_1_start.c +0 -0
- data/skeletons/CPU-C/common/timer_1_stop.c +0 -0
- data/skeletons/CPU-C/common/timer_2_start.c +20 -0
- data/skeletons/CPU-C/common/timer_2_stop.c +8 -0
- data/skeletons/CPU-C/kernel/default.host.c +3 -0
- data/skeletons/CPU-C/kernel/default.kernel.c +15 -0
- data/skeletons/CPU-C/skeletons.txt +24 -0
- data/skeletons/CPU-OPENCL-AMD/common/epilogue.c +6 -0
- data/skeletons/CPU-OPENCL-AMD/common/globals.c +155 -0
- data/skeletons/CPU-OPENCL-AMD/common/globals_kernel.c +4 -0
- data/skeletons/CPU-OPENCL-AMD/common/header.c +0 -0
- data/skeletons/CPU-OPENCL-AMD/common/mem_copy_D2H.c +8 -0
- data/skeletons/CPU-OPENCL-AMD/common/mem_copy_H2D.c +4 -0
- data/skeletons/CPU-OPENCL-AMD/common/mem_epilogue.c +3 -0
- data/skeletons/CPU-OPENCL-AMD/common/mem_prologue.c +6 -0
- data/skeletons/CPU-OPENCL-AMD/common/prologue.c +24 -0
- data/skeletons/CPU-OPENCL-AMD/common/timer_1_start.c +5 -0
- data/skeletons/CPU-OPENCL-AMD/common/timer_1_stop.c +9 -0
- data/skeletons/CPU-OPENCL-AMD/common/timer_2_start.c +16 -0
- data/skeletons/CPU-OPENCL-AMD/common/timer_2_stop.c +11 -0
- data/skeletons/CPU-OPENCL-AMD/kernel/D-element-to-1-shared.host.c +67 -0
- data/skeletons/CPU-OPENCL-AMD/kernel/D-element-to-1-shared.kernel.cl +72 -0
- data/skeletons/CPU-OPENCL-AMD/kernel/default.host.c +14 -0
- data/skeletons/CPU-OPENCL-AMD/kernel/default.kernel.cl +13 -0
- data/skeletons/CPU-OPENCL-AMD/skeletons.txt +26 -0
- data/skeletons/CPU-OPENCL-INTEL/common/epilogue.c +3 -0
- data/skeletons/CPU-OPENCL-INTEL/common/globals.c +154 -0
- data/skeletons/CPU-OPENCL-INTEL/common/globals_kernel.c +4 -0
- data/skeletons/CPU-OPENCL-INTEL/common/header.c +31 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_D2H.c +5 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_H2D.c +3 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_epilogue.c +3 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_prologue.c +4 -0
- data/skeletons/CPU-OPENCL-INTEL/common/prologue.c +24 -0
- data/skeletons/CPU-OPENCL-INTEL/common/timer_1_start.c +5 -0
- data/skeletons/CPU-OPENCL-INTEL/common/timer_1_stop.c +9 -0
- data/skeletons/CPU-OPENCL-INTEL/common/timer_2_start.c +16 -0
- data/skeletons/CPU-OPENCL-INTEL/common/timer_2_stop.c +11 -0
- data/skeletons/CPU-OPENCL-INTEL/kernel/D-element-to-1-shared.host.c +67 -0
- data/skeletons/CPU-OPENCL-INTEL/kernel/D-element-to-1-shared.kernel.cl +72 -0
- data/skeletons/CPU-OPENCL-INTEL/kernel/default.host.c +14 -0
- data/skeletons/CPU-OPENCL-INTEL/kernel/default.kernel.cl +13 -0
- data/skeletons/CPU-OPENCL-INTEL/skeletons.txt +26 -0
- data/skeletons/CPU-OPENMP/common/epilogue.c +0 -0
- data/skeletons/CPU-OPENMP/common/globals.c +37 -0
- data/skeletons/CPU-OPENMP/common/globals_kernel.c +6 -0
- data/skeletons/CPU-OPENMP/common/header.c +0 -0
- data/skeletons/CPU-OPENMP/common/mem_copy_D2H.c +0 -0
- data/skeletons/CPU-OPENMP/common/mem_copy_H2D.c +0 -0
- data/skeletons/CPU-OPENMP/common/mem_epilogue.c +0 -0
- data/skeletons/CPU-OPENMP/common/mem_prologue.c +3 -0
- data/skeletons/CPU-OPENMP/common/prologue.c +0 -0
- data/skeletons/CPU-OPENMP/common/timer_1_start.c +12 -0
- data/skeletons/CPU-OPENMP/common/timer_1_stop.c +0 -0
- data/skeletons/CPU-OPENMP/common/timer_2_start.c +18 -0
- data/skeletons/CPU-OPENMP/common/timer_2_stop.c +8 -0
- data/skeletons/CPU-OPENMP/kernel/D-element-to-1-shared.host.c +27 -0
- data/skeletons/CPU-OPENMP/kernel/D-element-to-1-shared.kernel.c +46 -0
- data/skeletons/CPU-OPENMP/kernel/default.host.c +11 -0
- data/skeletons/CPU-OPENMP/kernel/default.kernel.c +18 -0
- data/skeletons/CPU-OPENMP/skeletons.txt +26 -0
- data/skeletons/GPU-CUDA/common/epilogue.c +0 -0
- data/skeletons/GPU-CUDA/common/globals.c +31 -0
- data/skeletons/GPU-CUDA/common/globals_kernel.c +4 -0
- data/skeletons/GPU-CUDA/common/header.c +0 -0
- data/skeletons/GPU-CUDA/common/mem_copy_D2H.c +3 -0
- data/skeletons/GPU-CUDA/common/mem_copy_H2D.c +3 -0
- data/skeletons/GPU-CUDA/common/mem_epilogue.c +3 -0
- data/skeletons/GPU-CUDA/common/mem_prologue.c +5 -0
- data/skeletons/GPU-CUDA/common/prologue.c +6 -0
- data/skeletons/GPU-CUDA/common/timer_1_start.c +6 -0
- data/skeletons/GPU-CUDA/common/timer_1_stop.c +10 -0
- data/skeletons/GPU-CUDA/common/timer_2_start.c +6 -0
- data/skeletons/GPU-CUDA/common/timer_2_stop.c +10 -0
- data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.kernel.cu +105 -0
- data/skeletons/GPU-CUDA/kernel/D-element-to-1-shared.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/D-element-to-1-shared.kernel.cu +119 -0
- data/skeletons/GPU-CUDA/kernel/D-element-to-N-shared.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/D-element-to-N-shared.kernel.cu +166 -0
- data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.kernel.cu +69 -0
- data/skeletons/GPU-CUDA/kernel/N-neighbourhood-N-to-N-element.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/N-neighbourhood-N-to-N-element.kernel.cu +42 -0
- data/skeletons/GPU-CUDA/kernel/default.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/default.kernel.cu +28 -0
- data/skeletons/GPU-CUDA/skeletons.txt +30 -0
- data/skeletons/GPU-OPENCL-AMD/common/epilogue.c +3 -0
- data/skeletons/GPU-OPENCL-AMD/common/globals.c +155 -0
- data/skeletons/GPU-OPENCL-AMD/common/globals_kernel.c +4 -0
- data/skeletons/GPU-OPENCL-AMD/common/header.c +0 -0
- data/skeletons/GPU-OPENCL-AMD/common/mem_copy_D2H.c +4 -0
- data/skeletons/GPU-OPENCL-AMD/common/mem_copy_H2D.c +4 -0
- data/skeletons/GPU-OPENCL-AMD/common/mem_epilogue.c +3 -0
- data/skeletons/GPU-OPENCL-AMD/common/mem_prologue.c +3 -0
- data/skeletons/GPU-OPENCL-AMD/common/prologue.c +24 -0
- data/skeletons/GPU-OPENCL-AMD/common/timer_1_start.c +5 -0
- data/skeletons/GPU-OPENCL-AMD/common/timer_1_stop.c +9 -0
- data/skeletons/GPU-OPENCL-AMD/common/timer_2_start.c +4 -0
- data/skeletons/GPU-OPENCL-AMD/common/timer_2_stop.c +11 -0
- data/skeletons/GPU-OPENCL-AMD/kernel/D-element-to-1-shared.host.c +67 -0
- data/skeletons/GPU-OPENCL-AMD/kernel/D-element-to-1-shared.kernel.cl +72 -0
- data/skeletons/GPU-OPENCL-AMD/kernel/default.host.c +14 -0
- data/skeletons/GPU-OPENCL-AMD/kernel/default.kernel.cl +13 -0
- data/skeletons/GPU-OPENCL-AMD/skeletons.txt +26 -0
- data/skeletons/verification/header.c +2 -0
- data/skeletons/verification/timer_start.c +4 -0
- data/skeletons/verification/timer_stop.c +6 -0
- data/skeletons/verification/verify_results.c +23 -0
- data/test/bones/test_algorithm.rb +40 -0
- data/test/bones/test_common.rb +54 -0
- data/test/bones/test_preprocessor.rb +46 -0
- data/test/bones/test_species.rb +21 -0
- data/test/bones/test_variable.rb +84 -0
- data/test/test_helper.rb +106 -0
- metadata +303 -0
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
module C
  # Extension of the CAST type class. It bundles a few helper methods
  # that apply to types such as pointers, arrays, structures, floats
  # and integers.
  #
  # The helpers only extend the CAST functionality and keep the Bones
  # classes free of type-inspection details.
  class Type

    # Tells whether this type is an array and/or a pointer, i.e. whether
    # its class is exactly C::Array or C::Pointer. Returns true or false.
    def array_or_pointer?
      klass = self.class
      return true if klass == C::Array
      klass == C::Pointer
    end

    # Recursively looks up the underlying type of a variable. Arrays
    # and pointers delegate to the type they wrap; any other type is
    # returned as-is. The result is therefore one of the non-array,
    # non-pointer CAST types (e.g. void, int, float, char, bool,
    # complex or imaginary).
    def type_name
      return self unless self.array_or_pointer?
      self.type.type_name
    end

    # Returns the dimension of the variable as an integer. Every array
    # ('[]') or pointer ('*') level adds one to +count+ and the search
    # continues in the wrapped type; a type that is neither an array
    # nor a pointer ends the recursion, so plain types have a dimension
    # of zero.
    def dimensions(count=0)
      return count unless self.array_or_pointer?
      self.type.dimensions(count+1)
    end

  end

end
|
|
37
|
+
|
|
File without changes
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
|
|
2
|
+
// Multiple iterations for measurements
|
|
3
|
+
#define ITERS 1
|
|
4
|
+
|
|
5
|
+
// Declaration of the original function
|
|
6
|
+
int bones_main(void);
|
|
7
|
+
|
|
8
|
+
// Replacement entry point: forwards to bones_main() (the original main
// function) and returns its result as the program's exit status. No
// initialisation or clean-up is performed around the call.
int main(void) {
  return bones_main();
}
|
|
17
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
#include <math.h>
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
// Initialize the timer
|
|
2
|
+
float bones_timer2 = 0;
|
|
3
|
+
struct timeval bones_start_time2;
|
|
4
|
+
struct timeval bones_end_time2;
|
|
5
|
+
for (int bones_iter=0; bones_iter<ITERS; bones_iter++) {
|
|
6
|
+
|
|
7
|
+
// Flush the CPU cache (for measurement purposes only)
|
|
8
|
+
const int bones_flush_size = 4*1024*1024; // (4MB)
|
|
9
|
+
int bones_flush_i;
|
|
10
|
+
int bones_flush_j;
|
|
11
|
+
char *bones_flush_c = (char *)malloc(bones_flush_size);
|
|
12
|
+
for (bones_flush_i=0; bones_flush_i<10; bones_flush_i++) {
|
|
13
|
+
for (bones_flush_j=0; bones_flush_j<bones_flush_size; bones_flush_j++) {
|
|
14
|
+
bones_flush_c[bones_flush_j] = bones_flush_i*bones_flush_j;
|
|
15
|
+
}
|
|
16
|
+
}
|
|
17
|
+
free(bones_flush_c);
|
|
18
|
+
|
|
19
|
+
// Start the timer for the measurement of the kernel execution time
|
|
20
|
+
gettimeofday(&bones_start_time2, NULL);
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
|
|
2
|
+
// Stop the timer for the measurement of the kernel execution time
|
|
3
|
+
gettimeofday(&bones_end_time2, NULL);
|
|
4
|
+
bones_timer2 += 0.001 * (1000000*(bones_end_time2.tv_sec-bones_start_time2.tv_sec)+bones_end_time2.tv_usec-bones_start_time2.tv_usec);
|
|
5
|
+
}
|
|
6
|
+
|
|
7
|
+
// Print the measurement data
|
|
8
|
+
printf(">>>\t\t (<algorithm_basename>): Execution time [kernel ]: %.3lf ms \n", bones_timer2/((float)ITERS));
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/* STARTDEF
|
|
2
|
+
void bones_kernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>);
|
|
3
|
+
ENDDEF */
|
|
4
|
+
// Start of the <algorithm_name> kernel
|
|
5
|
+
void bones_kernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
|
|
6
|
+
for(int bones_global_id=0; bones_global_id<<parallelism>; bones_global_id++) {
|
|
7
|
+
|
|
8
|
+
// Calculate the global ID(s) based on the thread id
|
|
9
|
+
<ids>
|
|
10
|
+
|
|
11
|
+
// Perform the main computation
|
|
12
|
+
<algorithm_code1>
|
|
13
|
+
}
|
|
14
|
+
}
|
|
15
|
+
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
###################################################################
|
|
2
|
+
# Each line holds one mapping from species to skeleton
|
|
3
|
+
# The ordering is always ['chunk','neighbourhood','element','shared','void']
|
|
4
|
+
# The pattern 'full' is omitted from matching (will thus always match)
|
|
5
|
+
# 'D' denotes any number of ranges (e.g. D|element can be of any dimension)
|
|
6
|
+
# 'N' denotes exactly one range (e.g. N,N|element must be 2D)
|
|
7
|
+
# '+' denotes one or more of these patterns
|
|
8
|
+
###################################################################
|
|
9
|
+
D|chunk(D)+ -> D|chunk(D)+ :default :00
|
|
10
|
+
D|chunk(D)+ -> D|chunk(D)+ ^ D|element+ :default :00
|
|
11
|
+
D|chunk(D)+ ^ D|element+ -> D|chunk(D)+ :default :00
|
|
12
|
+
D|chunk(D)+ ^ D|element+ -> D|chunk(D)+ ^ D|element+ :default :00
|
|
13
|
+
D|chunk(D)+ -> D|element+ :default :00
|
|
14
|
+
D|chunk(D)+ ^ D|neighbourhood(D)+ ^ D|element+ -> D|element+ :default :00
|
|
15
|
+
D|chunk(D)+ ^ D|element+ -> D|element+ :default :00
|
|
16
|
+
N|neighbourhood(N)+ -> N|element+ :default :00
|
|
17
|
+
D|neighbourhood(D)+ -> D|element+ :default :00
|
|
18
|
+
D|neighbourhood(D)+ ^ D|element+ -> D|element+ :default :00
|
|
19
|
+
D|element+ -> D|chunk(D)+ :default :00
|
|
20
|
+
D|element+ -> D|element+ :default :00
|
|
21
|
+
D|element -> 1|shared :default :00
|
|
22
|
+
D|element+ -> D|shared+ :default :00
|
|
23
|
+
D|element+ -> D|element+ ^ D|shared+ :default :00
|
|
24
|
+
D|void -> D|element+ :default :00
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
#include <string.h>
|
|
2
|
+
#include <stdio.h>
|
|
3
|
+
#include <stdlib.h>
|
|
4
|
+
#include <math.h>
|
|
5
|
+
#include <sys/time.h>
|
|
6
|
+
#include <CL/cl.h>
|
|
7
|
+
|
|
8
|
+
#define BONES_MIN(a,b) ((a<b) ? a : b)
|
|
9
|
+
#define BONES_MAX(a,b) ((a>b) ? a : b)
|
|
10
|
+
#define DIV_CEIL(a,b) ((a+b-1)/b)
|
|
11
|
+
#define DIV_FLOOR(a,b) (a/b)
|
|
12
|
+
|
|
13
|
+
// Multiple iterations for kernel measurements
|
|
14
|
+
#define ITERS 1
|
|
15
|
+
|
|
16
|
+
// Load the OpenCL kernel from file.
//
// Reads the complete contents of 'bones_filename' into a newly allocated,
// NUL-terminated string. The caller owns the returned buffer and must
// free() it. Returns NULL (after printing a message) when the file cannot
// be opened, its size cannot be determined or memory cannot be allocated.
char * get_source(const char* bones_filename) {
  FILE* bones_fp = fopen(bones_filename,"r");
  if (bones_fp == NULL) {
    printf("--- Error: Could not open kernel file '%s'\n", bones_filename);
    fflush(stdout);
    return NULL;
  }

  // Determine the size of the file by seeking to its end
  long bones_size = -1;
  if (fseek(bones_fp,0,SEEK_END) == 0) {
    bones_size = ftell(bones_fp);
  }
  if (bones_size < 0) {
    printf("--- Error: Could not determine the size of kernel file '%s'\n", bones_filename);
    fflush(stdout);
    fclose(bones_fp);
    return NULL;
  }
  rewind(bones_fp);

  // One extra byte for the string terminator
  char *bones_source = (char *)malloc(sizeof(char)*((size_t)bones_size+1));
  if (bones_source == NULL) {
    printf("--- Error: Out of host memory while loading '%s'\n", bones_filename);
    fflush(stdout);
    fclose(bones_fp);
    return NULL;
  }

  // fread may deliver fewer bytes than the file size (e.g. because of
  // newline translation in text mode), so terminate the string after the
  // bytes that were actually read instead of after 'bones_size' bytes.
  size_t bones_read = fread(bones_source,1,sizeof(char)*(size_t)bones_size,bones_fp);
  bones_source[bones_read] = '\0';
  fclose(bones_fp);
  return bones_source;
}
|
|
28
|
+
|
|
29
|
+
// Print an error if it occurs
|
|
30
|
+
void error_check(cl_int bones_errors) {
|
|
31
|
+
if(bones_errors != CL_SUCCESS) {
|
|
32
|
+
switch (bones_errors) {
|
|
33
|
+
case CL_DEVICE_NOT_FOUND: printf("--- Error: Device not found.\n"); break;
|
|
34
|
+
case CL_DEVICE_NOT_AVAILABLE: printf("--- Error: Device not available\n"); break;
|
|
35
|
+
case CL_COMPILER_NOT_AVAILABLE: printf("--- Error: Compiler not available\n"); break;
|
|
36
|
+
case CL_MEM_OBJECT_ALLOCATION_FAILURE: printf("--- Error: Memory object allocation failure\n"); break;
|
|
37
|
+
case CL_OUT_OF_RESOURCES: printf("--- Error: Out of resources\n"); break;
|
|
38
|
+
case CL_OUT_OF_HOST_MEMORY: printf("--- Error: Out of host memory\n"); break;
|
|
39
|
+
case CL_PROFILING_INFO_NOT_AVAILABLE: printf("--- Error: Profiling information not available\n"); break;
|
|
40
|
+
case CL_MEM_COPY_OVERLAP: printf("--- Error: Memory copy overlap\n"); break;
|
|
41
|
+
case CL_IMAGE_FORMAT_MISMATCH: printf("--- Error: Image format mismatch\n"); break;
|
|
42
|
+
case CL_IMAGE_FORMAT_NOT_SUPPORTED: printf("--- Error: Image format not supported\n"); break;
|
|
43
|
+
case CL_BUILD_PROGRAM_FAILURE: printf("--- Error: Program build failure\n"); break;
|
|
44
|
+
case CL_MAP_FAILURE: printf("--- Error: Map failure\n"); break;
|
|
45
|
+
case CL_INVALID_VALUE: printf("--- Error: Invalid value\n"); break;
|
|
46
|
+
case CL_INVALID_DEVICE_TYPE: printf("--- Error: Invalid device type\n"); break;
|
|
47
|
+
case CL_INVALID_PLATFORM: printf("--- Error: Invalid platform\n"); break;
|
|
48
|
+
case CL_INVALID_DEVICE: printf("--- Error: Invalid device\n"); break;
|
|
49
|
+
case CL_INVALID_CONTEXT: printf("--- Error: Invalid context\n"); break;
|
|
50
|
+
case CL_INVALID_QUEUE_PROPERTIES: printf("--- Error: Invalid queue properties\n"); break;
|
|
51
|
+
case CL_INVALID_COMMAND_QUEUE: printf("--- Error: Invalid command queue\n"); break;
|
|
52
|
+
case CL_INVALID_HOST_PTR: printf("--- Error: Invalid host pointer\n"); break;
|
|
53
|
+
case CL_INVALID_MEM_OBJECT: printf("--- Error: Invalid memory object\n"); break;
|
|
54
|
+
case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: printf("--- Error: Invalid image format descriptor\n"); break;
|
|
55
|
+
case CL_INVALID_IMAGE_SIZE: printf("--- Error: Invalid image size\n"); break;
|
|
56
|
+
case CL_INVALID_SAMPLER: printf("--- Error: Invalid sampler\n"); break;
|
|
57
|
+
case CL_INVALID_BINARY: printf("--- Error: Invalid binary\n"); break;
|
|
58
|
+
case CL_INVALID_BUILD_OPTIONS: printf("--- Error: Invalid build options\n"); break;
|
|
59
|
+
case CL_INVALID_PROGRAM: printf("--- Error: Invalid program\n"); break;
|
|
60
|
+
case CL_INVALID_PROGRAM_EXECUTABLE: printf("--- Error: Invalid program executable\n"); break;
|
|
61
|
+
case CL_INVALID_KERNEL_NAME: printf("--- Error: Invalid kernel name\n"); break;
|
|
62
|
+
case CL_INVALID_KERNEL_DEFINITION: printf("--- Error: Invalid kernel definition\n"); break;
|
|
63
|
+
case CL_INVALID_KERNEL: printf("--- Error: Invalid kernel\n"); break;
|
|
64
|
+
case CL_INVALID_ARG_INDEX: printf("--- Error: Invalid argument index\n"); break;
|
|
65
|
+
case CL_INVALID_ARG_VALUE: printf("--- Error: Invalid argument value\n"); break;
|
|
66
|
+
case CL_INVALID_ARG_SIZE: printf("--- Error: Invalid argument size\n"); break;
|
|
67
|
+
case CL_INVALID_KERNEL_ARGS: printf("--- Error: Invalid kernel arguments\n"); break;
|
|
68
|
+
case CL_INVALID_WORK_DIMENSION: printf("--- Error: Invalid work dimensionsension\n"); break;
|
|
69
|
+
case CL_INVALID_WORK_GROUP_SIZE: printf("--- Error: Invalid work group size\n"); break;
|
|
70
|
+
case CL_INVALID_WORK_ITEM_SIZE: printf("--- Error: Invalid work item size\n"); break;
|
|
71
|
+
case CL_INVALID_GLOBAL_OFFSET: printf("--- Error: Invalid global offset\n"); break;
|
|
72
|
+
case CL_INVALID_EVENT_WAIT_LIST: printf("--- Error: Invalid event wait list\n"); break;
|
|
73
|
+
case CL_INVALID_EVENT: printf("--- Error: Invalid event\n"); break;
|
|
74
|
+
case CL_INVALID_OPERATION: printf("--- Error: Invalid operation\n"); break;
|
|
75
|
+
case CL_INVALID_GL_OBJECT: printf("--- Error: Invalid OpenGL object\n"); break;
|
|
76
|
+
case CL_INVALID_BUFFER_SIZE: printf("--- Error: Invalid buffer size\n"); break;
|
|
77
|
+
case CL_INVALID_MIP_LEVEL: printf("--- Error: Invalid mip-map level\n"); break;
|
|
78
|
+
default: printf("--- Error: Unknown with code %d\n", bones_errors);
|
|
79
|
+
}
|
|
80
|
+
fflush(stdout); exit(0);
|
|
81
|
+
}
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
// Use a global variable for the device ID, context and command queue
|
|
85
|
+
cl_device_id bones_device;
|
|
86
|
+
cl_context bones_context;
|
|
87
|
+
cl_command_queue bones_queue;
|
|
88
|
+
|
|
89
|
+
// Use a global variable to store the name and the binary for the last program
|
|
90
|
+
char bones_last_program[1024];
|
|
91
|
+
cl_program bones_program;
|
|
92
|
+
|
|
93
|
+
// Function to initialize the OpenCL platform (create to ensure fair measurements afterwards)
|
|
94
|
+
void bones_initialize_target(void) {
|
|
95
|
+
cl_int bones_errors;
|
|
96
|
+
|
|
97
|
+
// Get OpenCL platform count
|
|
98
|
+
cl_uint bones_num_platforms;
|
|
99
|
+
bones_errors = clGetPlatformIDs(0,NULL,&bones_num_platforms); error_check(bones_errors);
|
|
100
|
+
if (bones_num_platforms == 0) { printf("Error: No OpenCL platforms found.\n"); exit(1); }
|
|
101
|
+
|
|
102
|
+
// Get all OpenCL platform IDs
|
|
103
|
+
cl_platform_id bones_platform_ids[10];
|
|
104
|
+
bones_errors = clGetPlatformIDs(bones_num_platforms,bones_platform_ids,NULL); error_check(bones_errors);
|
|
105
|
+
|
|
106
|
+
// Select the AMD APP platform
|
|
107
|
+
char bones_buffer[1024];
|
|
108
|
+
cl_uint bones_platform;
|
|
109
|
+
for(cl_uint bones_platform_id=0; bones_platform_id<bones_num_platforms; bones_platform_id++) {
|
|
110
|
+
clGetPlatformInfo(bones_platform_ids[bones_platform_id], CL_PLATFORM_NAME, 1024, bones_buffer, NULL);
|
|
111
|
+
if(strstr(bones_buffer,"AMD") != NULL) { bones_platform = bones_platform_id; break; }
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
// Get a CPU device on the platform
|
|
115
|
+
bones_errors = clGetDeviceIDs(bones_platform_ids[bones_platform], CL_DEVICE_TYPE_CPU, 1, &bones_device, NULL); error_check(bones_errors);
|
|
116
|
+
bones_errors = clGetDeviceInfo(bones_device, CL_DEVICE_NAME, sizeof(bones_buffer), bones_buffer, NULL); error_check(bones_errors);
|
|
117
|
+
|
|
118
|
+
// Create a context
|
|
119
|
+
bones_context = clCreateContext(0,1,&bones_device,NULL,NULL,&bones_errors); error_check(bones_errors);
|
|
120
|
+
|
|
121
|
+
// Create a command queue
|
|
122
|
+
bones_queue = clCreateCommandQueue(bones_context,bones_device,CL_QUEUE_PROFILING_ENABLE,&bones_errors); error_check(bones_errors);
|
|
123
|
+
|
|
124
|
+
// Create space on the device
|
|
125
|
+
cl_mem bones_device_data = clCreateBuffer(bones_context,CL_MEM_READ_WRITE,4,NULL,&bones_errors); error_check(bones_errors);
|
|
126
|
+
|
|
127
|
+
// Copy something to the device
|
|
128
|
+
bones_device_data = clCreateBuffer(bones_context,CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,4,bones_buffer,NULL);
|
|
129
|
+
|
|
130
|
+
// Clean-up the OpenCL context
|
|
131
|
+
strcpy(bones_last_program,"");
|
|
132
|
+
clReleaseMemObject(bones_device_data);
|
|
133
|
+
clReleaseContext(bones_context);
|
|
134
|
+
fflush(stdout);
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
// Declaration of the original function
|
|
138
|
+
int bones_main(void);
|
|
139
|
+
|
|
140
|
+
// New main function for initialisation and clean-up
|
|
141
|
+
int main(void) {
|
|
142
|
+
|
|
143
|
+
// Initialisation
|
|
144
|
+
bones_initialize_target();
|
|
145
|
+
|
|
146
|
+
// Original main function
|
|
147
|
+
int bones_return = bones_main();
|
|
148
|
+
|
|
149
|
+
// Clean-up
|
|
150
|
+
clReleaseCommandQueue(bones_queue);
|
|
151
|
+
clReleaseProgram(bones_program);
|
|
152
|
+
clReleaseContext(bones_context);
|
|
153
|
+
return bones_return;
|
|
154
|
+
}
|
|
155
|
+
|
|
File without changes
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
|
|
2
|
+
// Perform a zero-copy of <array> from device to host
|
|
3
|
+
//void* bones_pointer_to_<array> = clEnqueueMapBuffer(bones_queue,device_<array>,CL_TRUE,CL_MAP_READ,<offset>,<variable_dimensions>*sizeof(<type>),0,NULL,NULL,&bones_errors); error_check(bones_errors);
|
|
4
|
+
//clEnqueueUnmapMemObject(bones_queue,device_<array>,bones_pointer_to_<array>,0,NULL,NULL);
|
|
5
|
+
|
|
6
|
+
// Perform a copy of <array> from device to host
|
|
7
|
+
clEnqueueReadBuffer(bones_queue,device_<array>,CL_TRUE,(<offset>)*sizeof(<type>),<variable_dimensions>*sizeof(<type>),<array><flatten>+<offset>,0,NULL,NULL);
|
|
8
|
+
clFinish(bones_queue);
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
|
|
2
|
+
// Create a device pointer for <array> (zero-copy)
|
|
3
|
+
//cl_mem device_<array> = clCreateBuffer(bones_context,CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR,<variable_dimensions>*sizeof(<type>),<array><flatten>,&bones_errors); error_check(bones_errors);
|
|
4
|
+
|
|
5
|
+
// Create a device pointer for <array>
|
|
6
|
+
cl_mem device_<array> = clCreateBuffer(bones_context,CL_MEM_READ_WRITE,<variable_dimensions>*sizeof(<type>),NULL,&bones_errors); error_check(bones_errors);
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
fflush(stdout);
|
|
2
|
+
cl_int bones_errors;
|
|
3
|
+
cl_event bones_event;
|
|
4
|
+
|
|
5
|
+
// Only compile if this program is different from the last one
|
|
6
|
+
if (strcmp(bones_last_program,"<algorithm_filename>") != 0) {
|
|
7
|
+
strcpy(bones_last_program,"<algorithm_filename>");
|
|
8
|
+
|
|
9
|
+
// Load and compile the kernel
|
|
10
|
+
char *bones_source = get_source("<algorithm_filename>_device.cl");
|
|
11
|
+
bones_program = clCreateProgramWithSource(bones_context,1,(const char **)&bones_source,NULL,&bones_errors); error_check(bones_errors);
|
|
12
|
+
bones_errors = clBuildProgram(bones_program,0,NULL,"-cl-single-precision-constant",NULL,NULL);
|
|
13
|
+
|
|
14
|
+
// Get and print the compiler log
|
|
15
|
+
char* bones_log;
|
|
16
|
+
size_t bones_log_size;
|
|
17
|
+
clGetProgramBuildInfo(bones_program,bones_device,CL_PROGRAM_BUILD_LOG,0,NULL,&bones_log_size);
|
|
18
|
+
bones_log = (char*)malloc((bones_log_size+1)*sizeof(char));
|
|
19
|
+
clGetProgramBuildInfo(bones_program,bones_device,CL_PROGRAM_BUILD_LOG,bones_log_size,bones_log, NULL);
|
|
20
|
+
bones_log[bones_log_size] = '\0';
|
|
21
|
+
//if (strcmp(bones_log,"\n") != 0 && strcmp(bones_log,"") != 0) { printf("--------- \n--- Compilation log:\n--------- \n%s\n",bones_log); }
|
|
22
|
+
free(bones_log);
|
|
23
|
+
error_check(bones_errors);
|
|
24
|
+
}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
|
|
2
|
+
// End the timer for the measurement of the kernel and memory copy execution time
|
|
3
|
+
#if (ITERS == 1)
|
|
4
|
+
clFinish(bones_queue);
|
|
5
|
+
struct timeval bones_end_time1;
|
|
6
|
+
gettimeofday(&bones_end_time1, NULL);
|
|
7
|
+
float bones_timer1 = 0.001 * (1000000*(bones_end_time1.tv_sec-bones_start_time1.tv_sec)+bones_end_time1.tv_usec-bones_start_time1.tv_usec);
|
|
8
|
+
printf(">>>\t\t (<algorithm_basename>): Execution time [kernel+memcpy]: %.3lf ms \n", bones_timer1);
|
|
9
|
+
#endif
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
|
|
2
|
+
// Start the timer for the measurement of the kernel execution time
|
|
3
|
+
clFinish(bones_queue);
|
|
4
|
+
for (int bones_iter=0; bones_iter<ITERS; bones_iter++) {
|
|
5
|
+
|
|
6
|
+
// Flush the CPU cache (for measurement purposes only)
|
|
7
|
+
const int bones_flush_size = 4*1024*1024; // (4MB)
|
|
8
|
+
int bones_flush_i;
|
|
9
|
+
int bones_flush_j;
|
|
10
|
+
char *bones_flush_c = (char *)malloc(bones_flush_size);
|
|
11
|
+
for (bones_flush_i=0; bones_flush_i<10; bones_flush_i++) {
|
|
12
|
+
for (bones_flush_j=0; bones_flush_j<bones_flush_size; bones_flush_j++) {
|
|
13
|
+
bones_flush_c[bones_flush_j] = bones_flush_i*bones_flush_j;
|
|
14
|
+
}
|
|
15
|
+
}
|
|
16
|
+
free(bones_flush_c);
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
|
|
2
|
+
}
|
|
3
|
+
|
|
4
|
+
// Stop the timer for the measurement of the kernel execution time
|
|
5
|
+
clFinish(bones_queue);
|
|
6
|
+
cl_ulong end2, start2;
|
|
7
|
+
bones_errors = clWaitForEvents(1, &bones_event); error_check(bones_errors);
|
|
8
|
+
bones_errors = clGetEventProfilingInfo(bones_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end2, 0); error_check(bones_errors);
|
|
9
|
+
bones_errors = clGetEventProfilingInfo(bones_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start2, 0); error_check(bones_errors);
|
|
10
|
+
float bones_timer2 = 0.000001 * (end2-start2);
|
|
11
|
+
printf(">>>\t\t (<algorithm_basename>): Execution time [kernel ]: %.3lf ms \n", bones_timer2);
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
|
|
2
|
+
// Store the initial value
|
|
3
|
+
cl_mem bones_initial_value = clCreateBuffer(bones_context,CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,sizeof(<out0_type>),<out0_name>,&bones_errors); error_check(bones_errors);
|
|
4
|
+
|
|
5
|
+
// Create the kernels
|
|
6
|
+
cl_kernel bones_kernel_<algorithm_name>_0 = clCreateKernel(bones_program, "bones_kernel_<algorithm_name>_0", &bones_errors); error_check(bones_errors);
|
|
7
|
+
cl_kernel bones_kernel_<algorithm_name>_1 = clCreateKernel(bones_program, "bones_kernel_<algorithm_name>_1", &bones_errors); error_check(bones_errors);
|
|
8
|
+
cl_kernel bones_kernel_<algorithm_name>_2 = clCreateKernel(bones_program, "bones_kernel_<algorithm_name>_2", &bones_errors); error_check(bones_errors);
|
|
9
|
+
|
|
10
|
+
// Run either one kernel or multiple kernels
|
|
11
|
+
if (<in0_dimensions> <= 512) {
|
|
12
|
+
|
|
13
|
+
// Set all the arguments to the kernel function
|
|
14
|
+
int bones_num_args = 3;
|
|
15
|
+
int bones_dimensions = <in0_dimensions>;
|
|
16
|
+
clSetKernelArg(bones_kernel_<algorithm_name>_0,0,sizeof(bones_dimensions),(void*)&bones_dimensions);
|
|
17
|
+
clSetKernelArg(bones_kernel_<algorithm_name>_0,1,sizeof(<in0_devicename>),(void*)&<in0_devicename>);
|
|
18
|
+
clSetKernelArg(bones_kernel_<algorithm_name>_0,2,sizeof(<out0_devicename>),(void*)&<out0_devicename>);
|
|
19
|
+
<kernel_argument_list_constants>
|
|
20
|
+
// Start only one kernel
|
|
21
|
+
const int bones_num_threads = DIV_CEIL(<in0_dimensions>,2);
|
|
22
|
+
size_t bones_local_worksize1[] = {bones_num_threads};
|
|
23
|
+
size_t bones_global_worksize1[] = {bones_num_threads};
|
|
24
|
+
bones_errors = clEnqueueNDRangeKernel(bones_queue,bones_kernel_<algorithm_name>_0,1,NULL,bones_global_worksize1,bones_local_worksize1,0,NULL,&bones_event); error_check(bones_errors);
|
|
25
|
+
|
|
26
|
+
}
|
|
27
|
+
else {
|
|
28
|
+
|
|
29
|
+
// Allocate space for an intermediate array
|
|
30
|
+
cl_mem bones_device_temp = clCreateBuffer(bones_context,CL_MEM_READ_WRITE,128*sizeof(<out0_type>),NULL,&bones_errors); error_check(bones_errors);
|
|
31
|
+
|
|
32
|
+
// Set all the arguments to the kernel function
|
|
33
|
+
int bones_num_args = 3;
|
|
34
|
+
int bones_dimensions = <in0_dimensions>;
|
|
35
|
+
clSetKernelArg(bones_kernel_<algorithm_name>_0,0,sizeof(bones_dimensions),(void*)&bones_dimensions);
|
|
36
|
+
clSetKernelArg(bones_kernel_<algorithm_name>_0,1,sizeof(<in0_devicename>),(void*)&<in0_devicename>);
|
|
37
|
+
clSetKernelArg(bones_kernel_<algorithm_name>_0,2,sizeof(bones_device_temp),(void*)&bones_device_temp);
|
|
38
|
+
<kernel_argument_list_constants>
|
|
39
|
+
// Start the first kernel
|
|
40
|
+
size_t bones_local_worksize1[] = {256};
|
|
41
|
+
size_t bones_global_worksize1[] = {256*128};
|
|
42
|
+
bones_errors = clEnqueueNDRangeKernel(bones_queue,bones_kernel_<algorithm_name>_0,1,NULL,bones_global_worksize1,bones_local_worksize1,0,NULL,&bones_event); error_check(bones_errors);
|
|
43
|
+
|
|
44
|
+
// Set all the arguments to the kernel function
|
|
45
|
+
clSetKernelArg(bones_kernel_<algorithm_name>_1,0,sizeof(bones_device_temp),(void*)&bones_device_temp);
|
|
46
|
+
clSetKernelArg(bones_kernel_<algorithm_name>_1,1,sizeof(<out0_devicename>),(void*)&<out0_devicename>);
|
|
47
|
+
// Start the second kernel
|
|
48
|
+
size_t bones_local_worksize2[] = {128};
|
|
49
|
+
size_t bones_global_worksize2[] = {128};
|
|
50
|
+
bones_errors = clEnqueueNDRangeKernel(bones_queue,bones_kernel_<algorithm_name>_1,1,NULL,bones_global_worksize2,bones_local_worksize2,0,NULL,&bones_event); error_check(bones_errors);
|
|
51
|
+
clReleaseMemObject(bones_device_temp);
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
// Set all the arguments to the kernel function
|
|
55
|
+
clSetKernelArg(bones_kernel_<algorithm_name>_2,0,sizeof(bones_initial_value),(void*)&bones_initial_value);
|
|
56
|
+
clSetKernelArg(bones_kernel_<algorithm_name>_2,1,sizeof(<out0_devicename>),(void*)&<out0_devicename>);
|
|
57
|
+
// Perform the last computation (only needed if there is an initial value)
|
|
58
|
+
size_t bones_local_worksize3[] = {1};
|
|
59
|
+
size_t bones_global_worksize3[] = {1};
|
|
60
|
+
bones_errors = clEnqueueNDRangeKernel(bones_queue,bones_kernel_<algorithm_name>_2,1,NULL,bones_global_worksize3,bones_local_worksize3,0,NULL,&bones_event); error_check(bones_errors);
|
|
61
|
+
clReleaseMemObject(bones_initial_value);
|
|
62
|
+
|
|
63
|
+
// Synchronize and clean-up the kernels
|
|
64
|
+
clFinish(bones_queue);
|
|
65
|
+
clReleaseKernel(bones_kernel_<algorithm_name>_0);
|
|
66
|
+
clReleaseKernel(bones_kernel_<algorithm_name>_1);
|
|
67
|
+
clReleaseKernel(bones_kernel_<algorithm_name>_2);
|