bones-compiler 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +117 -0
- data/LICENSE +9 -0
- data/README.rdoc +126 -0
- data/Rakefile +107 -0
- data/VERSION +1 -0
- data/bin/bones +20 -0
- data/examples/applications/ffos.c +552 -0
- data/examples/benchmarks/2mm.c +70 -0
- data/examples/benchmarks/3mm.c +81 -0
- data/examples/benchmarks/adi.c +81 -0
- data/examples/benchmarks/atax.c +65 -0
- data/examples/benchmarks/bicg.c +67 -0
- data/examples/benchmarks/cholesky.c +64 -0
- data/examples/benchmarks/common.h +168 -0
- data/examples/benchmarks/correlation.c +97 -0
- data/examples/benchmarks/covariance.c +77 -0
- data/examples/benchmarks/doitgen.c +63 -0
- data/examples/benchmarks/durbin.c +76 -0
- data/examples/benchmarks/dynprog.c +67 -0
- data/examples/benchmarks/fdtd-2d-apml.c +114 -0
- data/examples/benchmarks/fdtd-2d.c +74 -0
- data/examples/benchmarks/floyd-warshall.c +50 -0
- data/examples/benchmarks/gemm.c +69 -0
- data/examples/benchmarks/gemver.c +89 -0
- data/examples/benchmarks/gesummv.c +64 -0
- data/examples/benchmarks/gramschmidt.c +84 -0
- data/examples/benchmarks/jacobi-1d-imper.c +55 -0
- data/examples/benchmarks/jacobi-2d-imper.c +61 -0
- data/examples/benchmarks/lu.c +57 -0
- data/examples/benchmarks/ludcmp.c +91 -0
- data/examples/benchmarks/mvt.c +65 -0
- data/examples/benchmarks/overview.txt +38 -0
- data/examples/benchmarks/reg_detect.c +82 -0
- data/examples/benchmarks/saxpy.c +45 -0
- data/examples/benchmarks/seidel-2d.c +51 -0
- data/examples/benchmarks/symm.c +74 -0
- data/examples/benchmarks/syr2k.c +65 -0
- data/examples/benchmarks/syrk.c +62 -0
- data/examples/benchmarks/trisolv.c +57 -0
- data/examples/benchmarks/trmm.c +57 -0
- data/examples/chunk/example1.c +54 -0
- data/examples/chunk/example2.c +44 -0
- data/examples/chunk/example3.c +59 -0
- data/examples/chunk/example4.c +55 -0
- data/examples/chunk/example5.c +52 -0
- data/examples/element/example1.c +46 -0
- data/examples/element/example10.c +50 -0
- data/examples/element/example11.c +47 -0
- data/examples/element/example12.c +56 -0
- data/examples/element/example2.c +46 -0
- data/examples/element/example3.c +58 -0
- data/examples/element/example4.c +49 -0
- data/examples/element/example5.c +56 -0
- data/examples/element/example6.c +46 -0
- data/examples/element/example7.c +54 -0
- data/examples/element/example8.c +45 -0
- data/examples/element/example9.c +48 -0
- data/examples/neighbourhood/example1.c +54 -0
- data/examples/neighbourhood/example2.c +55 -0
- data/examples/neighbourhood/example3.c +82 -0
- data/examples/neighbourhood/example4.c +52 -0
- data/examples/shared/example1.c +45 -0
- data/examples/shared/example2.c +51 -0
- data/examples/shared/example3.c +55 -0
- data/examples/shared/example4.c +52 -0
- data/examples/shared/example5.c +48 -0
- data/lib/bones.rb +266 -0
- data/lib/bones/algorithm.rb +541 -0
- data/lib/bones/engine.rb +386 -0
- data/lib/bones/preprocessor.rb +161 -0
- data/lib/bones/species.rb +196 -0
- data/lib/bones/structure.rb +94 -0
- data/lib/bones/variable.rb +169 -0
- data/lib/bones/variablelist.rb +72 -0
- data/lib/castaddon.rb +27 -0
- data/lib/castaddon/index.rb +40 -0
- data/lib/castaddon/node.rb +753 -0
- data/lib/castaddon/type.rb +37 -0
- data/skeletons/CPU-C/common/epilogue.c +0 -0
- data/skeletons/CPU-C/common/globals.c +17 -0
- data/skeletons/CPU-C/common/globals_kernel.c +1 -0
- data/skeletons/CPU-C/common/header.c +0 -0
- data/skeletons/CPU-C/common/mem_copy_D2H.c +0 -0
- data/skeletons/CPU-C/common/mem_copy_H2D.c +0 -0
- data/skeletons/CPU-C/common/mem_epilogue.c +0 -0
- data/skeletons/CPU-C/common/mem_prologue.c +3 -0
- data/skeletons/CPU-C/common/prologue.c +0 -0
- data/skeletons/CPU-C/common/timer_1_start.c +0 -0
- data/skeletons/CPU-C/common/timer_1_stop.c +0 -0
- data/skeletons/CPU-C/common/timer_2_start.c +20 -0
- data/skeletons/CPU-C/common/timer_2_stop.c +8 -0
- data/skeletons/CPU-C/kernel/default.host.c +3 -0
- data/skeletons/CPU-C/kernel/default.kernel.c +15 -0
- data/skeletons/CPU-C/skeletons.txt +24 -0
- data/skeletons/CPU-OPENCL-AMD/common/epilogue.c +6 -0
- data/skeletons/CPU-OPENCL-AMD/common/globals.c +155 -0
- data/skeletons/CPU-OPENCL-AMD/common/globals_kernel.c +4 -0
- data/skeletons/CPU-OPENCL-AMD/common/header.c +0 -0
- data/skeletons/CPU-OPENCL-AMD/common/mem_copy_D2H.c +8 -0
- data/skeletons/CPU-OPENCL-AMD/common/mem_copy_H2D.c +4 -0
- data/skeletons/CPU-OPENCL-AMD/common/mem_epilogue.c +3 -0
- data/skeletons/CPU-OPENCL-AMD/common/mem_prologue.c +6 -0
- data/skeletons/CPU-OPENCL-AMD/common/prologue.c +24 -0
- data/skeletons/CPU-OPENCL-AMD/common/timer_1_start.c +5 -0
- data/skeletons/CPU-OPENCL-AMD/common/timer_1_stop.c +9 -0
- data/skeletons/CPU-OPENCL-AMD/common/timer_2_start.c +16 -0
- data/skeletons/CPU-OPENCL-AMD/common/timer_2_stop.c +11 -0
- data/skeletons/CPU-OPENCL-AMD/kernel/D-element-to-1-shared.host.c +67 -0
- data/skeletons/CPU-OPENCL-AMD/kernel/D-element-to-1-shared.kernel.cl +72 -0
- data/skeletons/CPU-OPENCL-AMD/kernel/default.host.c +14 -0
- data/skeletons/CPU-OPENCL-AMD/kernel/default.kernel.cl +13 -0
- data/skeletons/CPU-OPENCL-AMD/skeletons.txt +26 -0
- data/skeletons/CPU-OPENCL-INTEL/common/epilogue.c +3 -0
- data/skeletons/CPU-OPENCL-INTEL/common/globals.c +154 -0
- data/skeletons/CPU-OPENCL-INTEL/common/globals_kernel.c +4 -0
- data/skeletons/CPU-OPENCL-INTEL/common/header.c +31 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_D2H.c +5 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_H2D.c +3 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_epilogue.c +3 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_prologue.c +4 -0
- data/skeletons/CPU-OPENCL-INTEL/common/prologue.c +24 -0
- data/skeletons/CPU-OPENCL-INTEL/common/timer_1_start.c +5 -0
- data/skeletons/CPU-OPENCL-INTEL/common/timer_1_stop.c +9 -0
- data/skeletons/CPU-OPENCL-INTEL/common/timer_2_start.c +16 -0
- data/skeletons/CPU-OPENCL-INTEL/common/timer_2_stop.c +11 -0
- data/skeletons/CPU-OPENCL-INTEL/kernel/D-element-to-1-shared.host.c +67 -0
- data/skeletons/CPU-OPENCL-INTEL/kernel/D-element-to-1-shared.kernel.cl +72 -0
- data/skeletons/CPU-OPENCL-INTEL/kernel/default.host.c +14 -0
- data/skeletons/CPU-OPENCL-INTEL/kernel/default.kernel.cl +13 -0
- data/skeletons/CPU-OPENCL-INTEL/skeletons.txt +26 -0
- data/skeletons/CPU-OPENMP/common/epilogue.c +0 -0
- data/skeletons/CPU-OPENMP/common/globals.c +37 -0
- data/skeletons/CPU-OPENMP/common/globals_kernel.c +6 -0
- data/skeletons/CPU-OPENMP/common/header.c +0 -0
- data/skeletons/CPU-OPENMP/common/mem_copy_D2H.c +0 -0
- data/skeletons/CPU-OPENMP/common/mem_copy_H2D.c +0 -0
- data/skeletons/CPU-OPENMP/common/mem_epilogue.c +0 -0
- data/skeletons/CPU-OPENMP/common/mem_prologue.c +3 -0
- data/skeletons/CPU-OPENMP/common/prologue.c +0 -0
- data/skeletons/CPU-OPENMP/common/timer_1_start.c +12 -0
- data/skeletons/CPU-OPENMP/common/timer_1_stop.c +0 -0
- data/skeletons/CPU-OPENMP/common/timer_2_start.c +18 -0
- data/skeletons/CPU-OPENMP/common/timer_2_stop.c +8 -0
- data/skeletons/CPU-OPENMP/kernel/D-element-to-1-shared.host.c +27 -0
- data/skeletons/CPU-OPENMP/kernel/D-element-to-1-shared.kernel.c +46 -0
- data/skeletons/CPU-OPENMP/kernel/default.host.c +11 -0
- data/skeletons/CPU-OPENMP/kernel/default.kernel.c +18 -0
- data/skeletons/CPU-OPENMP/skeletons.txt +26 -0
- data/skeletons/GPU-CUDA/common/epilogue.c +0 -0
- data/skeletons/GPU-CUDA/common/globals.c +31 -0
- data/skeletons/GPU-CUDA/common/globals_kernel.c +4 -0
- data/skeletons/GPU-CUDA/common/header.c +0 -0
- data/skeletons/GPU-CUDA/common/mem_copy_D2H.c +3 -0
- data/skeletons/GPU-CUDA/common/mem_copy_H2D.c +3 -0
- data/skeletons/GPU-CUDA/common/mem_epilogue.c +3 -0
- data/skeletons/GPU-CUDA/common/mem_prologue.c +5 -0
- data/skeletons/GPU-CUDA/common/prologue.c +6 -0
- data/skeletons/GPU-CUDA/common/timer_1_start.c +6 -0
- data/skeletons/GPU-CUDA/common/timer_1_stop.c +10 -0
- data/skeletons/GPU-CUDA/common/timer_2_start.c +6 -0
- data/skeletons/GPU-CUDA/common/timer_2_stop.c +10 -0
- data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.kernel.cu +105 -0
- data/skeletons/GPU-CUDA/kernel/D-element-to-1-shared.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/D-element-to-1-shared.kernel.cu +119 -0
- data/skeletons/GPU-CUDA/kernel/D-element-to-N-shared.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/D-element-to-N-shared.kernel.cu +166 -0
- data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.kernel.cu +69 -0
- data/skeletons/GPU-CUDA/kernel/N-neighbourhood-N-to-N-element.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/N-neighbourhood-N-to-N-element.kernel.cu +42 -0
- data/skeletons/GPU-CUDA/kernel/default.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/default.kernel.cu +28 -0
- data/skeletons/GPU-CUDA/skeletons.txt +30 -0
- data/skeletons/GPU-OPENCL-AMD/common/epilogue.c +3 -0
- data/skeletons/GPU-OPENCL-AMD/common/globals.c +155 -0
- data/skeletons/GPU-OPENCL-AMD/common/globals_kernel.c +4 -0
- data/skeletons/GPU-OPENCL-AMD/common/header.c +0 -0
- data/skeletons/GPU-OPENCL-AMD/common/mem_copy_D2H.c +4 -0
- data/skeletons/GPU-OPENCL-AMD/common/mem_copy_H2D.c +4 -0
- data/skeletons/GPU-OPENCL-AMD/common/mem_epilogue.c +3 -0
- data/skeletons/GPU-OPENCL-AMD/common/mem_prologue.c +3 -0
- data/skeletons/GPU-OPENCL-AMD/common/prologue.c +24 -0
- data/skeletons/GPU-OPENCL-AMD/common/timer_1_start.c +5 -0
- data/skeletons/GPU-OPENCL-AMD/common/timer_1_stop.c +9 -0
- data/skeletons/GPU-OPENCL-AMD/common/timer_2_start.c +4 -0
- data/skeletons/GPU-OPENCL-AMD/common/timer_2_stop.c +11 -0
- data/skeletons/GPU-OPENCL-AMD/kernel/D-element-to-1-shared.host.c +67 -0
- data/skeletons/GPU-OPENCL-AMD/kernel/D-element-to-1-shared.kernel.cl +72 -0
- data/skeletons/GPU-OPENCL-AMD/kernel/default.host.c +14 -0
- data/skeletons/GPU-OPENCL-AMD/kernel/default.kernel.cl +13 -0
- data/skeletons/GPU-OPENCL-AMD/skeletons.txt +26 -0
- data/skeletons/verification/header.c +2 -0
- data/skeletons/verification/timer_start.c +4 -0
- data/skeletons/verification/timer_stop.c +6 -0
- data/skeletons/verification/verify_results.c +23 -0
- data/test/bones/test_algorithm.rb +40 -0
- data/test/bones/test_common.rb +54 -0
- data/test/bones/test_preprocessor.rb +46 -0
- data/test/bones/test_species.rb +21 -0
- data/test/bones/test_variable.rb +84 -0
- data/test/test_helper.rb +106 -0
- metadata +303 -0
@@ -0,0 +1,37 @@
|
|
1
|
+
module C
  # Extension of the CAST type class. It bundles a few helper
  # queries that apply to types such as pointers, arrays,
  # structures, floats, integers, etc.
  #
  # The helpers only extend the CAST functionality; they exist
  # to keep the Bones classes themselves tidy.
  class Type

    # Tells whether this type is an array and/or a pointer.
    # Only the exact classes C::Array and C::Pointer count.
    # Returns either true or false.
    def array_or_pointer?
      [C::Array, C::Pointer].include?(self.class)
    end

    # Recursively unwraps arrays and pointers until the
    # underlying type is reached, and returns that type object
    # (one of the CAST types: void, int, float, char, bool,
    # complex or imaginary).
    def type_name
      return self unless array_or_pointer?
      type.type_name
    end

    # Returns the number of dimensions of this type as an integer.
    # Every '*' or '[]' level adds one dimension; a type that is
    # neither an array nor a pointer contributes none. The 'count'
    # argument carries the running total through the recursion.
    def dimensions(count=0)
      if array_or_pointer?
        type.dimensions(count + 1)
      else
        count
      end
    end

  end

end
|
37
|
+
|
File without changes
|
@@ -0,0 +1,17 @@
|
|
1
|
+
|
2
|
+
// Multiple iterations for measurements
|
3
|
+
#define ITERS 1
|
4
|
+
|
5
|
+
// Declaration of the original function
|
6
|
+
int bones_main(void);
|
7
|
+
|
8
|
+
// New main function for initialisation and clean-up
int main(void) {

  // Run the original main function; this target has nothing to set up or
  // tear down around it, so its status is passed straight through
  return bones_main();
}
|
17
|
+
|
@@ -0,0 +1 @@
|
|
1
|
+
#include <math.h>
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
@@ -0,0 +1,20 @@
|
|
1
|
+
// Initialize the timer
|
2
|
+
float bones_timer2 = 0;
|
3
|
+
struct timeval bones_start_time2;
|
4
|
+
struct timeval bones_end_time2;
|
5
|
+
for (int bones_iter=0; bones_iter<ITERS; bones_iter++) {
|
6
|
+
|
7
|
+
// Flush the CPU cache (for measurement purposes only)
|
8
|
+
const int bones_flush_size = 4*1024*1024; // (4MB)
|
9
|
+
int bones_flush_i;
|
10
|
+
int bones_flush_j;
|
11
|
+
char *bones_flush_c = (char *)malloc(bones_flush_size);
|
12
|
+
for (bones_flush_i=0; bones_flush_i<10; bones_flush_i++) {
|
13
|
+
for (bones_flush_j=0; bones_flush_j<bones_flush_size; bones_flush_j++) {
|
14
|
+
bones_flush_c[bones_flush_j] = bones_flush_i*bones_flush_j;
|
15
|
+
}
|
16
|
+
}
|
17
|
+
free(bones_flush_c);
|
18
|
+
|
19
|
+
// Start the timer for the measurement of the kernel execution time
|
20
|
+
gettimeofday(&bones_start_time2, NULL);
|
@@ -0,0 +1,8 @@
|
|
1
|
+
|
2
|
+
// Stop the timer for the measurement of the kernel execution time
|
3
|
+
gettimeofday(&bones_end_time2, NULL);
|
4
|
+
bones_timer2 += 0.001 * (1000000*(bones_end_time2.tv_sec-bones_start_time2.tv_sec)+bones_end_time2.tv_usec-bones_start_time2.tv_usec);
|
5
|
+
}
|
6
|
+
|
7
|
+
// Print the measurement data
|
8
|
+
printf(">>>\t\t (<algorithm_basename>): Execution time [kernel ]: %.3lf ms \n", bones_timer2/((float)ITERS));
|
@@ -0,0 +1,15 @@
|
|
1
|
+
/* STARTDEF
|
2
|
+
void bones_kernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>);
|
3
|
+
ENDDEF */
|
4
|
+
// Start of the <algorithm_name> kernel
|
5
|
+
void bones_kernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
|
6
|
+
for(int bones_global_id=0; bones_global_id<<parallelism>; bones_global_id++) {
|
7
|
+
|
8
|
+
// Calculate the global ID(s) based on the thread id
|
9
|
+
<ids>
|
10
|
+
|
11
|
+
// Perform the main computation
|
12
|
+
<algorithm_code1>
|
13
|
+
}
|
14
|
+
}
|
15
|
+
|
@@ -0,0 +1,24 @@
|
|
1
|
+
###################################################################
|
2
|
+
# Each line holds one mapping from species to skeleton
|
3
|
+
# The ordering is always ['chunk','neighbourhood','element','shared','void']
|
4
|
+
# The pattern 'full' is omitted from matching (will thus always match)
|
5
|
+
# 'D' denotes any number of ranges (e.g. D|element can be any dimension)
|
6
|
+
# 'N' denotes exactly one range (e.g. N,N|element must be 2D)
|
7
|
+
# '+' denotes one or more of these patterns
|
8
|
+
###################################################################
|
9
|
+
D|chunk(D)+ -> D|chunk(D)+ :default :00
|
10
|
+
D|chunk(D)+ -> D|chunk(D)+ ^ D|element+ :default :00
|
11
|
+
D|chunk(D)+ ^ D|element+ -> D|chunk(D)+ :default :00
|
12
|
+
D|chunk(D)+ ^ D|element+ -> D|chunk(D)+ ^ D|element+ :default :00
|
13
|
+
D|chunk(D)+ -> D|element+ :default :00
|
14
|
+
D|chunk(D)+ ^ D|neighbourhood(D)+ ^ D|element+ -> D|element+ :default :00
|
15
|
+
D|chunk(D)+ ^ D|element+ -> D|element+ :default :00
|
16
|
+
N|neighbourhood(N)+ -> N|element+ :default :00
|
17
|
+
D|neighbourhood(D)+ -> D|element+ :default :00
|
18
|
+
D|neighbourhood(D)+ ^ D|element+ -> D|element+ :default :00
|
19
|
+
D|element+ -> D|chunk(D)+ :default :00
|
20
|
+
D|element+ -> D|element+ :default :00
|
21
|
+
D|element -> 1|shared :default :00
|
22
|
+
D|element+ -> D|shared+ :default :00
|
23
|
+
D|element+ -> D|element+ ^ D|shared+ :default :00
|
24
|
+
D|void -> D|element+ :default :00
|
@@ -0,0 +1,155 @@
|
|
1
|
+
#include <string.h>
|
2
|
+
#include <stdio.h>
|
3
|
+
#include <stdlib.h>
|
4
|
+
#include <math.h>
|
5
|
+
#include <sys/time.h>
|
6
|
+
#include <CL/cl.h>
|
7
|
+
|
8
|
+
#define BONES_MIN(a,b) ((a<b) ? a : b)
|
9
|
+
#define BONES_MAX(a,b) ((a>b) ? a : b)
|
10
|
+
#define DIV_CEIL(a,b) ((a+b-1)/b)
|
11
|
+
#define DIV_FLOOR(a,b) (a/b)
|
12
|
+
|
13
|
+
// Multiple iterations for kernel measurements
|
14
|
+
#define ITERS 1
|
15
|
+
|
16
|
+
// Load the OpenCL kernel from file
//
// Reads the complete contents of 'bones_filename' into a freshly allocated,
// NUL-terminated buffer. The caller owns the buffer and releases it with
// free(). Returns NULL (after printing a diagnostic to stderr) when the file
// cannot be opened, its size cannot be determined, or memory runs out.
char * get_source(const char* bones_filename) {
  FILE* bones_fp = fopen(bones_filename,"r");
  if (bones_fp == NULL) {
    fprintf(stderr,"--- Error: Could not open kernel file '%s'\n",bones_filename);
    return NULL;
  }

  // Determine the size of the file by seeking to its end
  long bones_size = -1;
  if (fseek(bones_fp,0,SEEK_END) == 0) { bones_size = ftell(bones_fp); }
  if (bones_size < 0) {
    fprintf(stderr,"--- Error: Could not determine the size of kernel file '%s'\n",bones_filename);
    fclose(bones_fp);
    return NULL;
  }
  rewind(bones_fp);

  // One extra byte for the string terminator
  char *bones_source = (char *)malloc(sizeof(char)*((size_t)bones_size+1));
  if (bones_source == NULL) {
    fprintf(stderr,"--- Error: Out of host memory while loading '%s'\n",bones_filename);
    fclose(bones_fp);
    return NULL;
  }

  // fread may deliver fewer bytes than ftell reported (e.g. newline
  // translation in text mode), so terminate after what was actually read
  size_t bones_read = fread(bones_source,1,sizeof(char)*(size_t)bones_size,bones_fp);
  bones_source[bones_read] = '\0';
  fclose(bones_fp);
  return bones_source;
}
|
28
|
+
|
29
|
+
// Print an error if it occurs
|
30
|
+
void error_check(cl_int bones_errors) {
|
31
|
+
if(bones_errors != CL_SUCCESS) {
|
32
|
+
switch (bones_errors) {
|
33
|
+
case CL_DEVICE_NOT_FOUND: printf("--- Error: Device not found.\n"); break;
|
34
|
+
case CL_DEVICE_NOT_AVAILABLE: printf("--- Error: Device not available\n"); break;
|
35
|
+
case CL_COMPILER_NOT_AVAILABLE: printf("--- Error: Compiler not available\n"); break;
|
36
|
+
case CL_MEM_OBJECT_ALLOCATION_FAILURE: printf("--- Error: Memory object allocation failure\n"); break;
|
37
|
+
case CL_OUT_OF_RESOURCES: printf("--- Error: Out of resources\n"); break;
|
38
|
+
case CL_OUT_OF_HOST_MEMORY: printf("--- Error: Out of host memory\n"); break;
|
39
|
+
case CL_PROFILING_INFO_NOT_AVAILABLE: printf("--- Error: Profiling information not available\n"); break;
|
40
|
+
case CL_MEM_COPY_OVERLAP: printf("--- Error: Memory copy overlap\n"); break;
|
41
|
+
case CL_IMAGE_FORMAT_MISMATCH: printf("--- Error: Image format mismatch\n"); break;
|
42
|
+
case CL_IMAGE_FORMAT_NOT_SUPPORTED: printf("--- Error: Image format not supported\n"); break;
|
43
|
+
case CL_BUILD_PROGRAM_FAILURE: printf("--- Error: Program build failure\n"); break;
|
44
|
+
case CL_MAP_FAILURE: printf("--- Error: Map failure\n"); break;
|
45
|
+
case CL_INVALID_VALUE: printf("--- Error: Invalid value\n"); break;
|
46
|
+
case CL_INVALID_DEVICE_TYPE: printf("--- Error: Invalid device type\n"); break;
|
47
|
+
case CL_INVALID_PLATFORM: printf("--- Error: Invalid platform\n"); break;
|
48
|
+
case CL_INVALID_DEVICE: printf("--- Error: Invalid device\n"); break;
|
49
|
+
case CL_INVALID_CONTEXT: printf("--- Error: Invalid context\n"); break;
|
50
|
+
case CL_INVALID_QUEUE_PROPERTIES: printf("--- Error: Invalid queue properties\n"); break;
|
51
|
+
case CL_INVALID_COMMAND_QUEUE: printf("--- Error: Invalid command queue\n"); break;
|
52
|
+
case CL_INVALID_HOST_PTR: printf("--- Error: Invalid host pointer\n"); break;
|
53
|
+
case CL_INVALID_MEM_OBJECT: printf("--- Error: Invalid memory object\n"); break;
|
54
|
+
case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: printf("--- Error: Invalid image format descriptor\n"); break;
|
55
|
+
case CL_INVALID_IMAGE_SIZE: printf("--- Error: Invalid image size\n"); break;
|
56
|
+
case CL_INVALID_SAMPLER: printf("--- Error: Invalid sampler\n"); break;
|
57
|
+
case CL_INVALID_BINARY: printf("--- Error: Invalid binary\n"); break;
|
58
|
+
case CL_INVALID_BUILD_OPTIONS: printf("--- Error: Invalid build options\n"); break;
|
59
|
+
case CL_INVALID_PROGRAM: printf("--- Error: Invalid program\n"); break;
|
60
|
+
case CL_INVALID_PROGRAM_EXECUTABLE: printf("--- Error: Invalid program executable\n"); break;
|
61
|
+
case CL_INVALID_KERNEL_NAME: printf("--- Error: Invalid kernel name\n"); break;
|
62
|
+
case CL_INVALID_KERNEL_DEFINITION: printf("--- Error: Invalid kernel definition\n"); break;
|
63
|
+
case CL_INVALID_KERNEL: printf("--- Error: Invalid kernel\n"); break;
|
64
|
+
case CL_INVALID_ARG_INDEX: printf("--- Error: Invalid argument index\n"); break;
|
65
|
+
case CL_INVALID_ARG_VALUE: printf("--- Error: Invalid argument value\n"); break;
|
66
|
+
case CL_INVALID_ARG_SIZE: printf("--- Error: Invalid argument size\n"); break;
|
67
|
+
case CL_INVALID_KERNEL_ARGS: printf("--- Error: Invalid kernel arguments\n"); break;
|
68
|
+
case CL_INVALID_WORK_DIMENSION: printf("--- Error: Invalid work dimensionsension\n"); break;
|
69
|
+
case CL_INVALID_WORK_GROUP_SIZE: printf("--- Error: Invalid work group size\n"); break;
|
70
|
+
case CL_INVALID_WORK_ITEM_SIZE: printf("--- Error: Invalid work item size\n"); break;
|
71
|
+
case CL_INVALID_GLOBAL_OFFSET: printf("--- Error: Invalid global offset\n"); break;
|
72
|
+
case CL_INVALID_EVENT_WAIT_LIST: printf("--- Error: Invalid event wait list\n"); break;
|
73
|
+
case CL_INVALID_EVENT: printf("--- Error: Invalid event\n"); break;
|
74
|
+
case CL_INVALID_OPERATION: printf("--- Error: Invalid operation\n"); break;
|
75
|
+
case CL_INVALID_GL_OBJECT: printf("--- Error: Invalid OpenGL object\n"); break;
|
76
|
+
case CL_INVALID_BUFFER_SIZE: printf("--- Error: Invalid buffer size\n"); break;
|
77
|
+
case CL_INVALID_MIP_LEVEL: printf("--- Error: Invalid mip-map level\n"); break;
|
78
|
+
default: printf("--- Error: Unknown with code %d\n", bones_errors);
|
79
|
+
}
|
80
|
+
fflush(stdout); exit(0);
|
81
|
+
}
|
82
|
+
}
|
83
|
+
|
84
|
+
// Use a global variable for the device ID, context and command queue
|
85
|
+
cl_device_id bones_device;
|
86
|
+
cl_context bones_context;
|
87
|
+
cl_command_queue bones_queue;
|
88
|
+
|
89
|
+
// Use a global variable to store the name and the binary for the last program
|
90
|
+
char bones_last_program[1024];
|
91
|
+
cl_program bones_program;
|
92
|
+
|
93
|
+
// Function to initialize the OpenCL platform (create to ensure fair measurements afterwards)
|
94
|
+
void bones_initialize_target(void) {
|
95
|
+
cl_int bones_errors;
|
96
|
+
|
97
|
+
// Get OpenCL platform count
|
98
|
+
cl_uint bones_num_platforms;
|
99
|
+
bones_errors = clGetPlatformIDs(0,NULL,&bones_num_platforms); error_check(bones_errors);
|
100
|
+
if (bones_num_platforms == 0) { printf("Error: No OpenCL platforms found.\n"); exit(1); }
|
101
|
+
|
102
|
+
// Get all OpenCL platform IDs
|
103
|
+
cl_platform_id bones_platform_ids[10];
|
104
|
+
bones_errors = clGetPlatformIDs(bones_num_platforms,bones_platform_ids,NULL); error_check(bones_errors);
|
105
|
+
|
106
|
+
// Select the AMD APP platform
|
107
|
+
char bones_buffer[1024];
|
108
|
+
cl_uint bones_platform;
|
109
|
+
for(cl_uint bones_platform_id=0; bones_platform_id<bones_num_platforms; bones_platform_id++) {
|
110
|
+
clGetPlatformInfo(bones_platform_ids[bones_platform_id], CL_PLATFORM_NAME, 1024, bones_buffer, NULL);
|
111
|
+
if(strstr(bones_buffer,"AMD") != NULL) { bones_platform = bones_platform_id; break; }
|
112
|
+
}
|
113
|
+
|
114
|
+
// Get a CPU device on the platform
|
115
|
+
bones_errors = clGetDeviceIDs(bones_platform_ids[bones_platform], CL_DEVICE_TYPE_CPU, 1, &bones_device, NULL); error_check(bones_errors);
|
116
|
+
bones_errors = clGetDeviceInfo(bones_device, CL_DEVICE_NAME, sizeof(bones_buffer), bones_buffer, NULL); error_check(bones_errors);
|
117
|
+
|
118
|
+
// Create a context
|
119
|
+
bones_context = clCreateContext(0,1,&bones_device,NULL,NULL,&bones_errors); error_check(bones_errors);
|
120
|
+
|
121
|
+
// Create a command queue
|
122
|
+
bones_queue = clCreateCommandQueue(bones_context,bones_device,CL_QUEUE_PROFILING_ENABLE,&bones_errors); error_check(bones_errors);
|
123
|
+
|
124
|
+
// Create space on the device
|
125
|
+
cl_mem bones_device_data = clCreateBuffer(bones_context,CL_MEM_READ_WRITE,4,NULL,&bones_errors); error_check(bones_errors);
|
126
|
+
|
127
|
+
// Copy something to the device
|
128
|
+
bones_device_data = clCreateBuffer(bones_context,CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,4,bones_buffer,NULL);
|
129
|
+
|
130
|
+
// Clean-up the OpenCL context
|
131
|
+
strcpy(bones_last_program,"");
|
132
|
+
clReleaseMemObject(bones_device_data);
|
133
|
+
clReleaseContext(bones_context);
|
134
|
+
fflush(stdout);
|
135
|
+
}
|
136
|
+
|
137
|
+
// Declaration of the original function
|
138
|
+
int bones_main(void);
|
139
|
+
|
140
|
+
// New main function for initialisation and clean-up
|
141
|
+
int main(void) {
|
142
|
+
|
143
|
+
// Initialisation
|
144
|
+
bones_initialize_target();
|
145
|
+
|
146
|
+
// Original main function
|
147
|
+
int bones_return = bones_main();
|
148
|
+
|
149
|
+
// Clean-up
|
150
|
+
clReleaseCommandQueue(bones_queue);
|
151
|
+
clReleaseProgram(bones_program);
|
152
|
+
clReleaseContext(bones_context);
|
153
|
+
return bones_return;
|
154
|
+
}
|
155
|
+
|
File without changes
|
@@ -0,0 +1,8 @@
|
|
1
|
+
|
2
|
+
// Perform a zero-copy of <array> from device to host
|
3
|
+
//void* bones_pointer_to_<array> = clEnqueueMapBuffer(bones_queue,device_<array>,CL_TRUE,CL_MAP_READ,<offset>,<variable_dimensions>*sizeof(<type>),0,NULL,NULL,&bones_errors); error_check(bones_errors);
|
4
|
+
//clEnqueueUnmapMemObject(bones_queue,device_<array>,bones_pointer_to_<array>,0,NULL,NULL);
|
5
|
+
|
6
|
+
// Perform a copy of <array> from device to host
|
7
|
+
clEnqueueReadBuffer(bones_queue,device_<array>,CL_TRUE,(<offset>)*sizeof(<type>),<variable_dimensions>*sizeof(<type>),<array><flatten>+<offset>,0,NULL,NULL);
|
8
|
+
clFinish(bones_queue);
|
@@ -0,0 +1,6 @@
|
|
1
|
+
|
2
|
+
// Create a device pointer for <array> (zero-copy)
|
3
|
+
//cl_mem device_<array> = clCreateBuffer(bones_context,CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR,<variable_dimensions>*sizeof(<type>),<array><flatten>,&bones_errors); error_check(bones_errors);
|
4
|
+
|
5
|
+
// Create a device pointer for <array>
|
6
|
+
cl_mem device_<array> = clCreateBuffer(bones_context,CL_MEM_READ_WRITE,<variable_dimensions>*sizeof(<type>),NULL,&bones_errors); error_check(bones_errors);
|
@@ -0,0 +1,24 @@
|
|
1
|
+
fflush(stdout);
|
2
|
+
cl_int bones_errors;
|
3
|
+
cl_event bones_event;
|
4
|
+
|
5
|
+
// Only compile if this program is different from the last one
|
6
|
+
if (strcmp(bones_last_program,"<algorithm_filename>") != 0) {
|
7
|
+
strcpy(bones_last_program,"<algorithm_filename>");
|
8
|
+
|
9
|
+
// Load and compile the kernel
|
10
|
+
char *bones_source = get_source("<algorithm_filename>_device.cl");
|
11
|
+
bones_program = clCreateProgramWithSource(bones_context,1,(const char **)&bones_source,NULL,&bones_errors); error_check(bones_errors);
|
12
|
+
bones_errors = clBuildProgram(bones_program,0,NULL,"-cl-single-precision-constant",NULL,NULL);
|
13
|
+
|
14
|
+
// Get and print the compiler log
|
15
|
+
char* bones_log;
|
16
|
+
size_t bones_log_size;
|
17
|
+
clGetProgramBuildInfo(bones_program,bones_device,CL_PROGRAM_BUILD_LOG,0,NULL,&bones_log_size);
|
18
|
+
bones_log = (char*)malloc((bones_log_size+1)*sizeof(char));
|
19
|
+
clGetProgramBuildInfo(bones_program,bones_device,CL_PROGRAM_BUILD_LOG,bones_log_size,bones_log, NULL);
|
20
|
+
bones_log[bones_log_size] = '\0';
|
21
|
+
//if (strcmp(bones_log,"\n") != 0 && strcmp(bones_log,"") != 0) { printf("--------- \n--- Compilation log:\n--------- \n%s\n",bones_log); }
|
22
|
+
free(bones_log);
|
23
|
+
error_check(bones_errors);
|
24
|
+
}
|
@@ -0,0 +1,9 @@
|
|
1
|
+
|
2
|
+
// End the timer for the measurement of the kernel and memory copy execution time
|
3
|
+
#if (ITERS == 1)
|
4
|
+
clFinish(bones_queue);
|
5
|
+
struct timeval bones_end_time1;
|
6
|
+
gettimeofday(&bones_end_time1, NULL);
|
7
|
+
float bones_timer1 = 0.001 * (1000000*(bones_end_time1.tv_sec-bones_start_time1.tv_sec)+bones_end_time1.tv_usec-bones_start_time1.tv_usec);
|
8
|
+
printf(">>>\t\t (<algorithm_basename>): Execution time [kernel+memcpy]: %.3lf ms \n", bones_timer1);
|
9
|
+
#endif
|
@@ -0,0 +1,16 @@
|
|
1
|
+
|
2
|
+
// Start the timer for the measurement of the kernel execution time
|
3
|
+
clFinish(bones_queue);
|
4
|
+
for (int bones_iter=0; bones_iter<ITERS; bones_iter++) {
|
5
|
+
|
6
|
+
// Flush the CPU cache (for measurement purposes only)
|
7
|
+
const int bones_flush_size = 4*1024*1024; // (4MB)
|
8
|
+
int bones_flush_i;
|
9
|
+
int bones_flush_j;
|
10
|
+
char *bones_flush_c = (char *)malloc(bones_flush_size);
|
11
|
+
for (bones_flush_i=0; bones_flush_i<10; bones_flush_i++) {
|
12
|
+
for (bones_flush_j=0; bones_flush_j<bones_flush_size; bones_flush_j++) {
|
13
|
+
bones_flush_c[bones_flush_j] = bones_flush_i*bones_flush_j;
|
14
|
+
}
|
15
|
+
}
|
16
|
+
free(bones_flush_c);
|
@@ -0,0 +1,11 @@
|
|
1
|
+
|
2
|
+
}
|
3
|
+
|
4
|
+
// Stop the timer for the measurement of the kernel execution time
|
5
|
+
clFinish(bones_queue);
|
6
|
+
cl_ulong end2, start2;
|
7
|
+
bones_errors = clWaitForEvents(1, &bones_event); error_check(bones_errors);
|
8
|
+
bones_errors = clGetEventProfilingInfo(bones_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end2, 0); error_check(bones_errors);
|
9
|
+
bones_errors = clGetEventProfilingInfo(bones_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start2, 0); error_check(bones_errors);
|
10
|
+
float bones_timer2 = 0.000001 * (end2-start2);
|
11
|
+
printf(">>>\t\t (<algorithm_basename>): Execution time [kernel ]: %.3lf ms \n", bones_timer2);
|
@@ -0,0 +1,67 @@
|
|
1
|
+
|
2
|
+
// Store the initial value
|
3
|
+
cl_mem bones_initial_value = clCreateBuffer(bones_context,CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,sizeof(<out0_type>),<out0_name>,&bones_errors); error_check(bones_errors);
|
4
|
+
|
5
|
+
// Create the kernels
|
6
|
+
cl_kernel bones_kernel_<algorithm_name>_0 = clCreateKernel(bones_program, "bones_kernel_<algorithm_name>_0", &bones_errors); error_check(bones_errors);
|
7
|
+
cl_kernel bones_kernel_<algorithm_name>_1 = clCreateKernel(bones_program, "bones_kernel_<algorithm_name>_1", &bones_errors); error_check(bones_errors);
|
8
|
+
cl_kernel bones_kernel_<algorithm_name>_2 = clCreateKernel(bones_program, "bones_kernel_<algorithm_name>_2", &bones_errors); error_check(bones_errors);
|
9
|
+
|
10
|
+
// Run either one kernel or multiple kernels
|
11
|
+
if (<in0_dimensions> <= 512) {
|
12
|
+
|
13
|
+
// Set all the arguments to the kernel function
|
14
|
+
int bones_num_args = 3;
|
15
|
+
int bones_dimensions = <in0_dimensions>;
|
16
|
+
clSetKernelArg(bones_kernel_<algorithm_name>_0,0,sizeof(bones_dimensions),(void*)&bones_dimensions);
|
17
|
+
clSetKernelArg(bones_kernel_<algorithm_name>_0,1,sizeof(<in0_devicename>),(void*)&<in0_devicename>);
|
18
|
+
clSetKernelArg(bones_kernel_<algorithm_name>_0,2,sizeof(<out0_devicename>),(void*)&<out0_devicename>);
|
19
|
+
<kernel_argument_list_constants>
|
20
|
+
// Start only one kernel
|
21
|
+
const int bones_num_threads = DIV_CEIL(<in0_dimensions>,2);
|
22
|
+
size_t bones_local_worksize1[] = {bones_num_threads};
|
23
|
+
size_t bones_global_worksize1[] = {bones_num_threads};
|
24
|
+
bones_errors = clEnqueueNDRangeKernel(bones_queue,bones_kernel_<algorithm_name>_0,1,NULL,bones_global_worksize1,bones_local_worksize1,0,NULL,&bones_event); error_check(bones_errors);
|
25
|
+
|
26
|
+
}
|
27
|
+
else {
|
28
|
+
|
29
|
+
// Allocate space for an intermediate array
|
30
|
+
cl_mem bones_device_temp = clCreateBuffer(bones_context,CL_MEM_READ_WRITE,128*sizeof(<out0_type>),NULL,&bones_errors); error_check(bones_errors);
|
31
|
+
|
32
|
+
// Set all the arguments to the kernel function
|
33
|
+
int bones_num_args = 3;
|
34
|
+
int bones_dimensions = <in0_dimensions>;
|
35
|
+
clSetKernelArg(bones_kernel_<algorithm_name>_0,0,sizeof(bones_dimensions),(void*)&bones_dimensions);
|
36
|
+
clSetKernelArg(bones_kernel_<algorithm_name>_0,1,sizeof(<in0_devicename>),(void*)&<in0_devicename>);
|
37
|
+
clSetKernelArg(bones_kernel_<algorithm_name>_0,2,sizeof(bones_device_temp),(void*)&bones_device_temp);
|
38
|
+
<kernel_argument_list_constants>
|
39
|
+
// Start the first kernel
|
40
|
+
size_t bones_local_worksize1[] = {256};
|
41
|
+
size_t bones_global_worksize1[] = {256*128};
|
42
|
+
bones_errors = clEnqueueNDRangeKernel(bones_queue,bones_kernel_<algorithm_name>_0,1,NULL,bones_global_worksize1,bones_local_worksize1,0,NULL,&bones_event); error_check(bones_errors);
|
43
|
+
|
44
|
+
// Set all the arguments to the kernel function
|
45
|
+
clSetKernelArg(bones_kernel_<algorithm_name>_1,0,sizeof(bones_device_temp),(void*)&bones_device_temp);
|
46
|
+
clSetKernelArg(bones_kernel_<algorithm_name>_1,1,sizeof(<out0_devicename>),(void*)&<out0_devicename>);
|
47
|
+
// Start the second kernel
|
48
|
+
size_t bones_local_worksize2[] = {128};
|
49
|
+
size_t bones_global_worksize2[] = {128};
|
50
|
+
bones_errors = clEnqueueNDRangeKernel(bones_queue,bones_kernel_<algorithm_name>_1,1,NULL,bones_global_worksize2,bones_local_worksize2,0,NULL,&bones_event); error_check(bones_errors);
|
51
|
+
clReleaseMemObject(bones_device_temp);
|
52
|
+
}
|
53
|
+
|
54
|
+
// Set all the arguments to the kernel function
|
55
|
+
clSetKernelArg(bones_kernel_<algorithm_name>_2,0,sizeof(bones_initial_value),(void*)&bones_initial_value);
|
56
|
+
clSetKernelArg(bones_kernel_<algorithm_name>_2,1,sizeof(<out0_devicename>),(void*)&<out0_devicename>);
|
57
|
+
// Perform the last computation (only needed if there is an initial value)
|
58
|
+
size_t bones_local_worksize3[] = {1};
|
59
|
+
size_t bones_global_worksize3[] = {1};
|
60
|
+
bones_errors = clEnqueueNDRangeKernel(bones_queue,bones_kernel_<algorithm_name>_2,1,NULL,bones_global_worksize3,bones_local_worksize3,0,NULL,&bones_event); error_check(bones_errors);
|
61
|
+
clReleaseMemObject(bones_initial_value);
|
62
|
+
|
63
|
+
// Synchronize and clean-up the kernels
|
64
|
+
clFinish(bones_queue);
|
65
|
+
clReleaseKernel(bones_kernel_<algorithm_name>_0);
|
66
|
+
clReleaseKernel(bones_kernel_<algorithm_name>_1);
|
67
|
+
clReleaseKernel(bones_kernel_<algorithm_name>_2);
|