bones-compiler 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +117 -0
- data/LICENSE +9 -0
- data/README.rdoc +126 -0
- data/Rakefile +107 -0
- data/VERSION +1 -0
- data/bin/bones +20 -0
- data/examples/applications/ffos.c +552 -0
- data/examples/benchmarks/2mm.c +70 -0
- data/examples/benchmarks/3mm.c +81 -0
- data/examples/benchmarks/adi.c +81 -0
- data/examples/benchmarks/atax.c +65 -0
- data/examples/benchmarks/bicg.c +67 -0
- data/examples/benchmarks/cholesky.c +64 -0
- data/examples/benchmarks/common.h +168 -0
- data/examples/benchmarks/correlation.c +97 -0
- data/examples/benchmarks/covariance.c +77 -0
- data/examples/benchmarks/doitgen.c +63 -0
- data/examples/benchmarks/durbin.c +76 -0
- data/examples/benchmarks/dynprog.c +67 -0
- data/examples/benchmarks/fdtd-2d-apml.c +114 -0
- data/examples/benchmarks/fdtd-2d.c +74 -0
- data/examples/benchmarks/floyd-warshall.c +50 -0
- data/examples/benchmarks/gemm.c +69 -0
- data/examples/benchmarks/gemver.c +89 -0
- data/examples/benchmarks/gesummv.c +64 -0
- data/examples/benchmarks/gramschmidt.c +84 -0
- data/examples/benchmarks/jacobi-1d-imper.c +55 -0
- data/examples/benchmarks/jacobi-2d-imper.c +61 -0
- data/examples/benchmarks/lu.c +57 -0
- data/examples/benchmarks/ludcmp.c +91 -0
- data/examples/benchmarks/mvt.c +65 -0
- data/examples/benchmarks/overview.txt +38 -0
- data/examples/benchmarks/reg_detect.c +82 -0
- data/examples/benchmarks/saxpy.c +45 -0
- data/examples/benchmarks/seidel-2d.c +51 -0
- data/examples/benchmarks/symm.c +74 -0
- data/examples/benchmarks/syr2k.c +65 -0
- data/examples/benchmarks/syrk.c +62 -0
- data/examples/benchmarks/trisolv.c +57 -0
- data/examples/benchmarks/trmm.c +57 -0
- data/examples/chunk/example1.c +54 -0
- data/examples/chunk/example2.c +44 -0
- data/examples/chunk/example3.c +59 -0
- data/examples/chunk/example4.c +55 -0
- data/examples/chunk/example5.c +52 -0
- data/examples/element/example1.c +46 -0
- data/examples/element/example10.c +50 -0
- data/examples/element/example11.c +47 -0
- data/examples/element/example12.c +56 -0
- data/examples/element/example2.c +46 -0
- data/examples/element/example3.c +58 -0
- data/examples/element/example4.c +49 -0
- data/examples/element/example5.c +56 -0
- data/examples/element/example6.c +46 -0
- data/examples/element/example7.c +54 -0
- data/examples/element/example8.c +45 -0
- data/examples/element/example9.c +48 -0
- data/examples/neighbourhood/example1.c +54 -0
- data/examples/neighbourhood/example2.c +55 -0
- data/examples/neighbourhood/example3.c +82 -0
- data/examples/neighbourhood/example4.c +52 -0
- data/examples/shared/example1.c +45 -0
- data/examples/shared/example2.c +51 -0
- data/examples/shared/example3.c +55 -0
- data/examples/shared/example4.c +52 -0
- data/examples/shared/example5.c +48 -0
- data/lib/bones.rb +266 -0
- data/lib/bones/algorithm.rb +541 -0
- data/lib/bones/engine.rb +386 -0
- data/lib/bones/preprocessor.rb +161 -0
- data/lib/bones/species.rb +196 -0
- data/lib/bones/structure.rb +94 -0
- data/lib/bones/variable.rb +169 -0
- data/lib/bones/variablelist.rb +72 -0
- data/lib/castaddon.rb +27 -0
- data/lib/castaddon/index.rb +40 -0
- data/lib/castaddon/node.rb +753 -0
- data/lib/castaddon/type.rb +37 -0
- data/skeletons/CPU-C/common/epilogue.c +0 -0
- data/skeletons/CPU-C/common/globals.c +17 -0
- data/skeletons/CPU-C/common/globals_kernel.c +1 -0
- data/skeletons/CPU-C/common/header.c +0 -0
- data/skeletons/CPU-C/common/mem_copy_D2H.c +0 -0
- data/skeletons/CPU-C/common/mem_copy_H2D.c +0 -0
- data/skeletons/CPU-C/common/mem_epilogue.c +0 -0
- data/skeletons/CPU-C/common/mem_prologue.c +3 -0
- data/skeletons/CPU-C/common/prologue.c +0 -0
- data/skeletons/CPU-C/common/timer_1_start.c +0 -0
- data/skeletons/CPU-C/common/timer_1_stop.c +0 -0
- data/skeletons/CPU-C/common/timer_2_start.c +20 -0
- data/skeletons/CPU-C/common/timer_2_stop.c +8 -0
- data/skeletons/CPU-C/kernel/default.host.c +3 -0
- data/skeletons/CPU-C/kernel/default.kernel.c +15 -0
- data/skeletons/CPU-C/skeletons.txt +24 -0
- data/skeletons/CPU-OPENCL-AMD/common/epilogue.c +6 -0
- data/skeletons/CPU-OPENCL-AMD/common/globals.c +155 -0
- data/skeletons/CPU-OPENCL-AMD/common/globals_kernel.c +4 -0
- data/skeletons/CPU-OPENCL-AMD/common/header.c +0 -0
- data/skeletons/CPU-OPENCL-AMD/common/mem_copy_D2H.c +8 -0
- data/skeletons/CPU-OPENCL-AMD/common/mem_copy_H2D.c +4 -0
- data/skeletons/CPU-OPENCL-AMD/common/mem_epilogue.c +3 -0
- data/skeletons/CPU-OPENCL-AMD/common/mem_prologue.c +6 -0
- data/skeletons/CPU-OPENCL-AMD/common/prologue.c +24 -0
- data/skeletons/CPU-OPENCL-AMD/common/timer_1_start.c +5 -0
- data/skeletons/CPU-OPENCL-AMD/common/timer_1_stop.c +9 -0
- data/skeletons/CPU-OPENCL-AMD/common/timer_2_start.c +16 -0
- data/skeletons/CPU-OPENCL-AMD/common/timer_2_stop.c +11 -0
- data/skeletons/CPU-OPENCL-AMD/kernel/D-element-to-1-shared.host.c +67 -0
- data/skeletons/CPU-OPENCL-AMD/kernel/D-element-to-1-shared.kernel.cl +72 -0
- data/skeletons/CPU-OPENCL-AMD/kernel/default.host.c +14 -0
- data/skeletons/CPU-OPENCL-AMD/kernel/default.kernel.cl +13 -0
- data/skeletons/CPU-OPENCL-AMD/skeletons.txt +26 -0
- data/skeletons/CPU-OPENCL-INTEL/common/epilogue.c +3 -0
- data/skeletons/CPU-OPENCL-INTEL/common/globals.c +154 -0
- data/skeletons/CPU-OPENCL-INTEL/common/globals_kernel.c +4 -0
- data/skeletons/CPU-OPENCL-INTEL/common/header.c +31 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_D2H.c +5 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_H2D.c +3 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_epilogue.c +3 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_prologue.c +4 -0
- data/skeletons/CPU-OPENCL-INTEL/common/prologue.c +24 -0
- data/skeletons/CPU-OPENCL-INTEL/common/timer_1_start.c +5 -0
- data/skeletons/CPU-OPENCL-INTEL/common/timer_1_stop.c +9 -0
- data/skeletons/CPU-OPENCL-INTEL/common/timer_2_start.c +16 -0
- data/skeletons/CPU-OPENCL-INTEL/common/timer_2_stop.c +11 -0
- data/skeletons/CPU-OPENCL-INTEL/kernel/D-element-to-1-shared.host.c +67 -0
- data/skeletons/CPU-OPENCL-INTEL/kernel/D-element-to-1-shared.kernel.cl +72 -0
- data/skeletons/CPU-OPENCL-INTEL/kernel/default.host.c +14 -0
- data/skeletons/CPU-OPENCL-INTEL/kernel/default.kernel.cl +13 -0
- data/skeletons/CPU-OPENCL-INTEL/skeletons.txt +26 -0
- data/skeletons/CPU-OPENMP/common/epilogue.c +0 -0
- data/skeletons/CPU-OPENMP/common/globals.c +37 -0
- data/skeletons/CPU-OPENMP/common/globals_kernel.c +6 -0
- data/skeletons/CPU-OPENMP/common/header.c +0 -0
- data/skeletons/CPU-OPENMP/common/mem_copy_D2H.c +0 -0
- data/skeletons/CPU-OPENMP/common/mem_copy_H2D.c +0 -0
- data/skeletons/CPU-OPENMP/common/mem_epilogue.c +0 -0
- data/skeletons/CPU-OPENMP/common/mem_prologue.c +3 -0
- data/skeletons/CPU-OPENMP/common/prologue.c +0 -0
- data/skeletons/CPU-OPENMP/common/timer_1_start.c +12 -0
- data/skeletons/CPU-OPENMP/common/timer_1_stop.c +0 -0
- data/skeletons/CPU-OPENMP/common/timer_2_start.c +18 -0
- data/skeletons/CPU-OPENMP/common/timer_2_stop.c +8 -0
- data/skeletons/CPU-OPENMP/kernel/D-element-to-1-shared.host.c +27 -0
- data/skeletons/CPU-OPENMP/kernel/D-element-to-1-shared.kernel.c +46 -0
- data/skeletons/CPU-OPENMP/kernel/default.host.c +11 -0
- data/skeletons/CPU-OPENMP/kernel/default.kernel.c +18 -0
- data/skeletons/CPU-OPENMP/skeletons.txt +26 -0
- data/skeletons/GPU-CUDA/common/epilogue.c +0 -0
- data/skeletons/GPU-CUDA/common/globals.c +31 -0
- data/skeletons/GPU-CUDA/common/globals_kernel.c +4 -0
- data/skeletons/GPU-CUDA/common/header.c +0 -0
- data/skeletons/GPU-CUDA/common/mem_copy_D2H.c +3 -0
- data/skeletons/GPU-CUDA/common/mem_copy_H2D.c +3 -0
- data/skeletons/GPU-CUDA/common/mem_epilogue.c +3 -0
- data/skeletons/GPU-CUDA/common/mem_prologue.c +5 -0
- data/skeletons/GPU-CUDA/common/prologue.c +6 -0
- data/skeletons/GPU-CUDA/common/timer_1_start.c +6 -0
- data/skeletons/GPU-CUDA/common/timer_1_stop.c +10 -0
- data/skeletons/GPU-CUDA/common/timer_2_start.c +6 -0
- data/skeletons/GPU-CUDA/common/timer_2_stop.c +10 -0
- data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.kernel.cu +105 -0
- data/skeletons/GPU-CUDA/kernel/D-element-to-1-shared.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/D-element-to-1-shared.kernel.cu +119 -0
- data/skeletons/GPU-CUDA/kernel/D-element-to-N-shared.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/D-element-to-N-shared.kernel.cu +166 -0
- data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.kernel.cu +69 -0
- data/skeletons/GPU-CUDA/kernel/N-neighbourhood-N-to-N-element.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/N-neighbourhood-N-to-N-element.kernel.cu +42 -0
- data/skeletons/GPU-CUDA/kernel/default.host.c +3 -0
- data/skeletons/GPU-CUDA/kernel/default.kernel.cu +28 -0
- data/skeletons/GPU-CUDA/skeletons.txt +30 -0
- data/skeletons/GPU-OPENCL-AMD/common/epilogue.c +3 -0
- data/skeletons/GPU-OPENCL-AMD/common/globals.c +155 -0
- data/skeletons/GPU-OPENCL-AMD/common/globals_kernel.c +4 -0
- data/skeletons/GPU-OPENCL-AMD/common/header.c +0 -0
- data/skeletons/GPU-OPENCL-AMD/common/mem_copy_D2H.c +4 -0
- data/skeletons/GPU-OPENCL-AMD/common/mem_copy_H2D.c +4 -0
- data/skeletons/GPU-OPENCL-AMD/common/mem_epilogue.c +3 -0
- data/skeletons/GPU-OPENCL-AMD/common/mem_prologue.c +3 -0
- data/skeletons/GPU-OPENCL-AMD/common/prologue.c +24 -0
- data/skeletons/GPU-OPENCL-AMD/common/timer_1_start.c +5 -0
- data/skeletons/GPU-OPENCL-AMD/common/timer_1_stop.c +9 -0
- data/skeletons/GPU-OPENCL-AMD/common/timer_2_start.c +4 -0
- data/skeletons/GPU-OPENCL-AMD/common/timer_2_stop.c +11 -0
- data/skeletons/GPU-OPENCL-AMD/kernel/D-element-to-1-shared.host.c +67 -0
- data/skeletons/GPU-OPENCL-AMD/kernel/D-element-to-1-shared.kernel.cl +72 -0
- data/skeletons/GPU-OPENCL-AMD/kernel/default.host.c +14 -0
- data/skeletons/GPU-OPENCL-AMD/kernel/default.kernel.cl +13 -0
- data/skeletons/GPU-OPENCL-AMD/skeletons.txt +26 -0
- data/skeletons/verification/header.c +2 -0
- data/skeletons/verification/timer_start.c +4 -0
- data/skeletons/verification/timer_stop.c +6 -0
- data/skeletons/verification/verify_results.c +23 -0
- data/test/bones/test_algorithm.rb +40 -0
- data/test/bones/test_common.rb +54 -0
- data/test/bones/test_preprocessor.rb +46 -0
- data/test/bones/test_species.rb +21 -0
- data/test/bones/test_variable.rb +84 -0
- data/test/test_helper.rb +106 -0
- metadata +303 -0
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
|
|
2
|
+
// Start of the <algorithm_name> kernel (main, not unrolled kernel)
|
|
3
|
+
__kernel void bones_kernel_<algorithm_name>_0(int bones_input_size, __global <in0_type><in0_devicepointer> <in0_name>, __global <out0_type><out0_devicepointer> <out0_name>, <argument_definition>) {
|
|
4
|
+
const int bones_threadblock_work = DIV_CEIL(bones_input_size,get_num_groups(0));
|
|
5
|
+
const int bones_parallel_work = BONES_MIN(get_local_size(0),bones_threadblock_work);
|
|
6
|
+
const int bones_sequential_work = DIV_CEIL(bones_threadblock_work,bones_parallel_work);
|
|
7
|
+
const int bones_local_id = get_local_id(0);
|
|
8
|
+
const int bones_global_id = get_global_id(0);
|
|
9
|
+
<ids>
|
|
10
|
+
int bones_iter_id = <in0_flatindex>;
|
|
11
|
+
|
|
12
|
+
// Load data into thread private memory and perform the first computation(s) sequentially
|
|
13
|
+
<in0_type> bones_temporary = <in0_name>[bones_iter_id];
|
|
14
|
+
<in0_type> bones_private_memory = <algorithm_code3>;
|
|
15
|
+
for(int c=1; c<bones_sequential_work; c++) {
|
|
16
|
+
bones_iter_id = bones_iter_id + bones_parallel_work*get_num_groups(0)<factors>;
|
|
17
|
+
if (bones_iter_id <= <in0_to>) {
|
|
18
|
+
bones_temporary = <in0_name>[bones_iter_id];
|
|
19
|
+
bones_private_memory = <algorithm_code1>;
|
|
20
|
+
}
|
|
21
|
+
}
|
|
22
|
+
// Initialize the local memory
|
|
23
|
+
volatile __local <in0_type> bones_local_memory[256];
|
|
24
|
+
bones_local_memory[bones_local_id] = bones_private_memory;
|
|
25
|
+
barrier(CLK_LOCAL_MEM_FENCE);
|
|
26
|
+
|
|
27
|
+
// Perform the remainder of the computations in parallel using a parallel reduction tree
|
|
28
|
+
int bones_offset_id;
|
|
29
|
+
for (int c=256; c>=2; c=c>>1) {
|
|
30
|
+
if ((2*bones_parallel_work > c) && (get_local_id(0) < c/2)) {
|
|
31
|
+
bones_offset_id = get_local_id(0)+c/2;
|
|
32
|
+
if (bones_offset_id < bones_parallel_work) {
|
|
33
|
+
bones_local_memory[bones_local_id] = <algorithm_code2>;
|
|
34
|
+
}
|
|
35
|
+
}
|
|
36
|
+
barrier(CLK_LOCAL_MEM_FENCE);
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
// Write the final result back to the global memory
|
|
40
|
+
if (get_local_id(0) == 0) { <out0_name>[get_group_id(0)] = bones_local_memory[0]; }
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
// Start of the <algorithm_name> kernel (secondary, not unrolled kernel)
|
|
44
|
+
__kernel void bones_kernel_<algorithm_name>_1(__global <in0_type><in0_devicepointer> <in0_name>, __global <out0_type><out0_devicepointer> <out0_name>) {
|
|
45
|
+
const int bones_local_id = get_local_id(0);
|
|
46
|
+
const int bones_global_id = get_local_id(0);
|
|
47
|
+
|
|
48
|
+
// Initialize the local memory
|
|
49
|
+
volatile __local <in0_type> bones_local_memory[128];
|
|
50
|
+
bones_local_memory[bones_local_id] = <in0_name>[bones_global_id];
|
|
51
|
+
barrier(CLK_LOCAL_MEM_FENCE);
|
|
52
|
+
|
|
53
|
+
// Perform reduction using a parallel reduction tree
|
|
54
|
+
int bones_offset_id;
|
|
55
|
+
for (int c=128; c>=2; c=c>>1) {
|
|
56
|
+
if (get_local_id(0) < c/2) {
|
|
57
|
+
bones_offset_id = get_local_id(0)+c/2;
|
|
58
|
+
bones_local_memory[bones_local_id] = <algorithm_code2>;
|
|
59
|
+
}
|
|
60
|
+
barrier(CLK_LOCAL_MEM_FENCE);
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
// Write the final result back to the global memory
|
|
64
|
+
if (get_local_id(0) == 0) { <out0_name>[0] = bones_local_memory[0]; }
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
// Start of the <algorithm_name> kernel (final, initial value kernel)
|
|
68
|
+
__kernel void bones_kernel_<algorithm_name>_2(__global <out0_type><out0_devicepointer> bones_initial_value, __global <out0_type><out0_devicepointer> <out0_name>) {
|
|
69
|
+
<out0_type> bones_private_memory = <out0_name>[0];
|
|
70
|
+
<out0_type> bones_temporary = bones_initial_value[0];
|
|
71
|
+
<out0_name>[0] = <algorithm_code4>;
|
|
72
|
+
}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
|
|
2
|
+
// Create the kernel
|
|
3
|
+
cl_kernel bones_kernel_<algorithm_name>_0 = clCreateKernel(bones_program, "bones_kernel_<algorithm_name>_0", &bones_errors); error_check(bones_errors);
|
|
4
|
+
|
|
5
|
+
// Set all the arguments to the kernel function
|
|
6
|
+
int bones_num_args = 0;
|
|
7
|
+
<kernel_argument_list>
|
|
8
|
+
// Start the kernel
|
|
9
|
+
size_t bones_global_worksize[] = {DIV_CEIL(<parallelism>,8)*8};
|
|
10
|
+
bones_errors = clEnqueueNDRangeKernel(bones_queue,bones_kernel_<algorithm_name>_0,1,NULL,bones_global_worksize,NULL,0,NULL,&bones_event); error_check(bones_errors);
|
|
11
|
+
|
|
12
|
+
// Synchronize and clean-up the kernel
|
|
13
|
+
clFinish(bones_queue);
|
|
14
|
+
clReleaseKernel(bones_kernel_<algorithm_name>_0);
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
|
|
2
|
+
// Start of the <algorithm_name> kernel
|
|
3
|
+
__kernel void bones_kernel_<algorithm_name>_0(<devicedefinitionsopencl>, <argument_definition>) {
|
|
4
|
+
const int bones_global_id = get_global_id(0);
|
|
5
|
+
if (bones_global_id < (<parallelism>)) {
|
|
6
|
+
|
|
7
|
+
// Calculate the global ID(s) based on the thread id
|
|
8
|
+
<ids>
|
|
9
|
+
|
|
10
|
+
// Start the computation
|
|
11
|
+
<algorithm_code1>
|
|
12
|
+
}
|
|
13
|
+
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
###################################################################
|
|
2
|
+
# Each line holds one mapping from species to skeleton
|
|
3
|
+
# The ordering is always ['chunk','neighbourhood','element','shared','void']
|
|
4
|
+
# The pattern 'full' is omitted from matching (will thus always match)
|
|
5
|
+
# 'D' denotes any number of ranges (e.g. D|element can be of any dimension)
|
|
6
|
+
# 'N' denotes exactly one range (e.g. N,N|element must be 2D)
|
|
7
|
+
# '+' denotes one or more of these patterns
|
|
8
|
+
###################################################################
|
|
9
|
+
D|chunk(D)+ -> D|chunk(D)+ :default :00
|
|
10
|
+
D|chunk(D)+ -> D|chunk(D)+ ^ D|element+ :default :00
|
|
11
|
+
D|chunk(D)+ ^ D|element+ -> D|chunk(D)+ :default :00
|
|
12
|
+
D|chunk(D)+ ^ D|element+ -> D|chunk(D)+ ^ D|element+ :default :00
|
|
13
|
+
D|chunk(D)+ -> D|element+ :default :00
|
|
14
|
+
D|chunk(D)+ ^ D|neighbourhood(D)+ ^ D|element+ -> D|element+ :default :00
|
|
15
|
+
D|chunk(D)+ ^ D|element+ -> D|element+ :default :00
|
|
16
|
+
N|neighbourhood(N)+ -> N|element+ :default :00
|
|
17
|
+
D|neighbourhood(D)+ -> D|element+ :default :00
|
|
18
|
+
D|neighbourhood(D)+ ^ D|element+ -> D|element+ :default :00
|
|
19
|
+
D|element+ -> D|chunk(D)+ :default :00
|
|
20
|
+
D|element+ -> D|element+ :default :00
|
|
21
|
+
D|element -> 1|shared :D-element-to-1-shared :02 03 04 05
|
|
22
|
+
D|void -> D|element+ :default :00
|
|
23
|
+
|
|
24
|
+
#D|element+ -> D|shared+ :default :09
|
|
25
|
+
#D|element+ -> D|element+ ^ D|shared+ :default :09
|
|
26
|
+
|
|
File without changes
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
#include <omp.h>
|
|
2
|
+
#include <stdlib.h>
|
|
3
|
+
|
|
4
|
+
// Smaller of a and b. Both arguments are parenthesised so that operator
// expressions (e.g. BONES_MIN(x & m, y)) expand with the intended precedence.
// Note: the selected argument is evaluated twice, so avoid side effects.
#define BONES_MIN(a,b) (((a) < (b)) ? (a) : (b))
|
|
5
|
+
// Larger of a and b. Both arguments are parenthesised so that operator
// expressions (e.g. BONES_MAX(x & m, y)) expand with the intended precedence.
// Note: the selected argument is evaluated twice, so avoid side effects.
#define BONES_MAX(a,b) (((a) > (b)) ? (a) : (b))
|
|
6
|
+
// Integer division of a by b, rounded up (intended for positive operands).
// Arguments are parenthesised so that compound expressions such as
// DIV_CEIL(n, x+y) divide by the whole divisor. b is evaluated twice.
#define DIV_CEIL(a,b) (((a)+(b)-1)/(b))
|
|
7
|
+
// Integer division of a by b using C's truncating '/'.
// Arguments are parenthesised so that compound expressions such as
// DIV_FLOOR(n, x+y) divide by the whole divisor.
#define DIV_FLOOR(a,b) ((a)/(b))
|
|
8
|
+
|
|
9
|
+
// Multiple iterations for kernel measurements
|
|
10
|
+
#define ITERS 1
|
|
11
|
+
|
|
12
|
+
// Function to initialize the CPU platform (for fair measurements)
|
|
13
|
+
// Initialise the OpenMP target before any measured code runs.
// Requests one thread per processor reported by omp_get_num_procs() and then
// executes an otherwise empty parallel region (presumably so that the runtime
// sets up its threads ahead of the timed kernels).
void bones_initialize_target(void) {
  omp_set_num_threads(omp_get_num_procs());
  #pragma omp parallel
  {
    // The thread id itself is not needed; the call only gives each thread
    // something to execute inside the region.
    (void)omp_get_thread_num();
  }
}
|
|
21
|
+
|
|
22
|
+
// Declaration of the original function
|
|
23
|
+
int bones_main(void);
|
|
24
|
+
|
|
25
|
+
// New main function for initialisation and clean-up
|
|
26
|
+
int main(void) {
  // Prepare the target first, so the original program starts on an
  // initialised platform.
  bones_initialize_target();

  // Run the original program (its main was renamed to bones_main) and pass
  // its return value through unchanged; there is nothing to clean up here.
  const int bones_exit_status = bones_main();
  return bones_exit_status;
}
|
|
37
|
+
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
|
|
2
|
+
// Flush the CPU cache (for measurement purposes only)
|
|
3
|
+
const int bones_flush_size = 4*1024*1024; // (16MB)
|
|
4
|
+
int bones_flush_i;
|
|
5
|
+
int bones_flush_j;
|
|
6
|
+
char *bones_flush_c = (char *)malloc(bones_flush_size);
|
|
7
|
+
for (bones_flush_i=0; bones_flush_i<10; bones_flush_i++) {
|
|
8
|
+
for (bones_flush_j=0; bones_flush_j<bones_flush_size; bones_flush_j++) {
|
|
9
|
+
bones_flush_c[bones_flush_j] = bones_flush_i*bones_flush_j;
|
|
10
|
+
}
|
|
11
|
+
}
|
|
12
|
+
free(bones_flush_c);
|
|
File without changes
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
// Initialize the timer
|
|
2
|
+
float bones_timer2 = 0;
|
|
3
|
+
struct timeval bones_start_time2;
|
|
4
|
+
struct timeval bones_end_time2;
|
|
5
|
+
for (int bones_iter=0; bones_iter<ITERS; bones_iter++) {
|
|
6
|
+
|
|
7
|
+
// Flush the CPU cache (for measurement purposes only)
|
|
8
|
+
const int bones_flush_size = 4*1024*1024; // (16MB)
|
|
9
|
+
char *bones_flush_c = (char *)malloc(bones_flush_size);
|
|
10
|
+
for (int i=0; i<10; i++) {
|
|
11
|
+
for (int j=0; j<bones_flush_size; j++) {
|
|
12
|
+
bones_flush_c[j] = i*j;
|
|
13
|
+
}
|
|
14
|
+
}
|
|
15
|
+
free(bones_flush_c);
|
|
16
|
+
|
|
17
|
+
// Start the timer for the measurement of the kernel execution time
|
|
18
|
+
gettimeofday(&bones_start_time2, NULL);
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
|
|
2
|
+
// Stop the timer for the measurement of the kernel execution time
|
|
3
|
+
gettimeofday(&bones_end_time2, NULL);
|
|
4
|
+
bones_timer2 += 0.001 * (1000000*(bones_end_time2.tv_sec-bones_start_time2.tv_sec)+bones_end_time2.tv_usec-bones_start_time2.tv_usec);
|
|
5
|
+
}
|
|
6
|
+
|
|
7
|
+
// Print the measurement data
|
|
8
|
+
printf(">>>\t\t (<algorithm_basename>): Execution time [kernel ]: %.3lf ms \n", bones_timer2/((float)ITERS));
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
|
|
2
|
+
// Host side of the reduction skeleton: every logical thread produces one
// partial result (kernel _0), the partial results are combined by a single
// thread (kernel _1), and the value originally stored in the output is
// folded in last (kernel _2). Nothing is done for an empty input.
if (<in0_dimensions> > 0) {

  // Store the initial value
  <out0_type> bones_initial_value = <out0_name>[0];

  // Create a temporary array to store intermediate data (one slot per logical thread)
  int bones_thread_count = BONES_MIN(omp_get_num_procs(),<in0_dimensions>);
  <out0_type>* bones_temporary = (<out0_type>*)malloc(bones_thread_count*sizeof(<out0_type>));
  if (bones_temporary == NULL) {
    // Without the scratch array no partial result can be stored; stop
    // explicitly rather than writing through a NULL pointer.
    abort();
  }

  // Run multiple OpenMP threads.
  // The logical thread ids are distributed with a work-sharing loop, so every
  // slot of bones_temporary is written even when the runtime provides fewer
  // threads than were requested with omp_set_num_threads().
  omp_set_num_threads(bones_thread_count);
  #pragma omp parallel for schedule(static,1)
  for (int bones_thread_id=0; bones_thread_id<bones_thread_count; bones_thread_id++) {

    // Perform the major part of the computation in parallel
    bones_kernel_<algorithm_name>_0(bones_thread_id, bones_thread_count, <in0_dimensions>, <in_devicenames>, bones_temporary, <argument_name>);
  }

  // Compute the second part of the algorithm with only one thread
  bones_kernel_<algorithm_name>_1(bones_thread_count, bones_temporary, <out_devicenames>, <argument_name>);
  free(bones_temporary);

  // Perform the last computation (only needed if there is an initial value)
  bones_kernel_<algorithm_name>_2(bones_initial_value,<out0_name>,<argument_name>);
}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
/* STARTDEF
|
|
2
|
+
void bones_kernel_<algorithm_name>_0(int bones_thread_id, int bones_thread_count, int bones_size, <devicedefinitions>, <argument_definition>);
|
|
3
|
+
void bones_kernel_<algorithm_name>_1(int bones_size, <devicedefinitions>, <argument_definition>);
|
|
4
|
+
void bones_kernel_<algorithm_name>_2(<out0_type> bones_initial_value, <out0_type><out0_devicepointer> <out0_name>, <argument_definition>);
|
|
5
|
+
ENDDEF */
|
|
6
|
+
// Start of the <algorithm_name> kernel (main part)
|
|
7
|
+
void bones_kernel_<algorithm_name>_0(int bones_thread_id, int bones_thread_count, int bones_size, <devicedefinitions>, <argument_definition>) {
|
|
8
|
+
const int bones_work = DIV_CEIL(bones_size,bones_thread_count);
|
|
9
|
+
const int bones_global_id = bones_thread_id;
|
|
10
|
+
<ids>
|
|
11
|
+
int bones_iter_id = <in0_flatindex>;
|
|
12
|
+
|
|
13
|
+
// Use a thread private memory to perform the per-thread computation(s)
|
|
14
|
+
<in0_type> bones_temporary = <in0_name>[bones_iter_id];
|
|
15
|
+
<in0_type> bones_private_memory = <algorithm_code2>;
|
|
16
|
+
for(int c=1; c<bones_work; c++) {
|
|
17
|
+
bones_iter_id = bones_iter_id + bones_thread_count<factors>;
|
|
18
|
+
if (bones_iter_id <= <in0_to>) {
|
|
19
|
+
bones_temporary = <in0_name>[bones_iter_id];
|
|
20
|
+
bones_private_memory = <algorithm_code1>;
|
|
21
|
+
}
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
// Store the result
|
|
25
|
+
<out0_name>[bones_thread_id] = bones_private_memory;
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
// Start of the <algorithm_name> kernel (secondary part)
|
|
29
|
+
void bones_kernel_<algorithm_name>_1(int bones_size, <devicedefinitions>, <argument_definition>) {
|
|
30
|
+
|
|
31
|
+
// Use a private memory to perform the sequential computation(s)
|
|
32
|
+
<in0_type> bones_private_memory = <in0_name>[0];
|
|
33
|
+
for(int bones_iter_id=1; bones_iter_id<bones_size; bones_iter_id++) {
|
|
34
|
+
bones_private_memory = bones_private_memory + <in0_name>[bones_iter_id];
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
// Store the result
|
|
38
|
+
<out0_name>[0] = bones_private_memory;
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
// Start of the <algorithm_name> kernel (final, initial value kernel)
|
|
42
|
+
void bones_kernel_<algorithm_name>_2(<out0_type> bones_initial_value, <out0_type><out0_devicepointer> <out0_name>, <argument_definition>) {
|
|
43
|
+
<out0_type> bones_private_memory = <out0_name>[0];
|
|
44
|
+
<out0_type> bones_temporary = bones_initial_value;
|
|
45
|
+
<out0_name>[0] = <algorithm_code3>;
|
|
46
|
+
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
|
|
2
|
+
// Run multiple OpenMP threads
|
|
3
|
+
int bones_thread_count = omp_get_num_procs();
|
|
4
|
+
omp_set_num_threads(bones_thread_count);
|
|
5
|
+
#pragma omp parallel
|
|
6
|
+
{
|
|
7
|
+
int bones_thread_id = omp_get_thread_num();
|
|
8
|
+
|
|
9
|
+
// Start the kernel
|
|
10
|
+
bones_kernel_<algorithm_name>_0(bones_thread_id, bones_thread_count, <devicenames>, <argument_name>);
|
|
11
|
+
}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
/* STARTDEF
|
|
2
|
+
void bones_kernel_<algorithm_name>_0(int bones_thread_id, int bones_thread_count, <devicedefinitions>, <argument_definition>);
|
|
3
|
+
ENDDEF */
|
|
4
|
+
// Start of the <algorithm_name> kernel
|
|
5
|
+
void bones_kernel_<algorithm_name>_0(int bones_thread_id, int bones_thread_count, <devicedefinitions>, <argument_definition>) {
|
|
6
|
+
int bones_workload = DIV_CEIL(<parallelism>,bones_thread_count);
|
|
7
|
+
int bones_start = bones_thread_id*bones_workload;
|
|
8
|
+
int bones_end = BONES_MIN((bones_thread_id+1)*bones_workload,<parallelism>);
|
|
9
|
+
for(int bones_global_id=bones_start; bones_global_id<bones_end; bones_global_id++) {
|
|
10
|
+
|
|
11
|
+
// Calculate the global ID(s) based on the thread id
|
|
12
|
+
<ids>
|
|
13
|
+
|
|
14
|
+
// Perform the main computation
|
|
15
|
+
<algorithm_code1>
|
|
16
|
+
}
|
|
17
|
+
}
|
|
18
|
+
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
###################################################################
|
|
2
|
+
# Each line holds one mapping from species to skeleton
|
|
3
|
+
# The ordering is always ['chunk','neighbourhood','element','shared','void']
|
|
4
|
+
# The pattern 'full' is omitted from matching (will thus always match)
|
|
5
|
+
# 'D' denotes any number of ranges (e.g. D|element can be of any dimension)
|
|
6
|
+
# 'N' denotes exactly one range (e.g. N,N|element must be 2D)
|
|
7
|
+
# '+' denotes one or more of these patterns
|
|
8
|
+
###################################################################
|
|
9
|
+
D|chunk(D)+ -> D|chunk(D)+ :default :00
|
|
10
|
+
D|chunk(D)+ -> D|chunk(D)+ ^ D|element+ :default :00
|
|
11
|
+
D|chunk(D)+ ^ D|element+ -> D|chunk(D)+ :default :00
|
|
12
|
+
D|chunk(D)+ ^ D|element+ -> D|chunk(D)+ ^ D|element+ :default :00
|
|
13
|
+
D|chunk(D)+ -> D|element+ :default :00
|
|
14
|
+
D|chunk(D)+ ^ D|neighbourhood(D)+ ^ D|element+ -> D|element+ :default :00
|
|
15
|
+
D|chunk(D)+ ^ D|element+ -> D|element+ :default :00
|
|
16
|
+
N|neighbourhood(N)+ -> N|element+ :default :00
|
|
17
|
+
D|neighbourhood(D)+ -> D|element+ :default :00
|
|
18
|
+
D|neighbourhood(D)+ ^ D|element+ -> D|element+ :default :00
|
|
19
|
+
D|element+ -> D|chunk(D)+ :default :00
|
|
20
|
+
D|element+ -> D|element+ :default :00
|
|
21
|
+
D|element -> 1|shared :D-element-to-1-shared :02 04 05
|
|
22
|
+
D|void -> D|element+ :default :00
|
|
23
|
+
|
|
24
|
+
#D|element+ -> D|shared+ :default :00
|
|
25
|
+
#D|element+ -> D|element+ ^ D|shared+ :default :00
|
|
26
|
+
|
|
File without changes
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
#include <stdio.h>
|
|
2
|
+
#include <cuda_runtime.h>
|
|
3
|
+
|
|
4
|
+
// Smaller of a and b. Both arguments are parenthesised so that operator
// expressions (e.g. BONES_MIN(x & m, y)) expand with the intended precedence.
// Note: the selected argument is evaluated twice, so avoid side effects.
#define BONES_MIN(a,b) (((a) < (b)) ? (a) : (b))
|
|
5
|
+
// Larger of a and b. Both arguments are parenthesised so that operator
// expressions (e.g. BONES_MAX(x & m, y)) expand with the intended precedence.
// Note: the selected argument is evaluated twice, so avoid side effects.
#define BONES_MAX(a,b) (((a) > (b)) ? (a) : (b))
|
|
6
|
+
// Integer division of a by b, rounded up (intended for positive operands).
// Arguments are parenthesised so that compound expressions such as
// DIV_CEIL(n, x+y) divide by the whole divisor. b is evaluated twice.
#define DIV_CEIL(a,b) (((a)+(b)-1)/(b))
|
|
7
|
+
// Integer division of a by b using C's truncating '/'.
// Arguments are parenthesised so that compound expressions such as
// DIV_FLOOR(n, x+y) divide by the whole divisor.
#define DIV_FLOOR(a,b) ((a)/(b))
|
|
8
|
+
|
|
9
|
+
// Function to initialize the GPU (for fair measurements)
|
|
10
|
+
// Initialise the GPU before any measured code runs by performing a small
// device allocation that is released again immediately.
// The warm-up is best-effort: a failing CUDA call is reported on stderr
// instead of being silently ignored, and the function then simply returns.
void bones_initialize_target(void) {
  int* bones_temporary = 0;
  cudaError_t bones_status = cudaMalloc((void**)&bones_temporary, sizeof(int));
  if (bones_status != cudaSuccess) {
    fprintf(stderr, "bones_initialize_target: cudaMalloc failed: %s\n", cudaGetErrorString(bones_status));
    return;
  }
  bones_status = cudaFree(bones_temporary);
  if (bones_status != cudaSuccess) {
    fprintf(stderr, "bones_initialize_target: cudaFree failed: %s\n", cudaGetErrorString(bones_status));
  }
}
|
|
15
|
+
|
|
16
|
+
// Declaration of the original function
|
|
17
|
+
int bones_main(void);
|
|
18
|
+
|
|
19
|
+
// New main function for initialisation and clean-up
|
|
20
|
+
int main(void) {
  // Bring up the target device before the original program starts.
  bones_initialize_target();

  // Delegate to the original program (its main was renamed to bones_main)
  // and forward its return value; no further clean-up is performed here.
  const int bones_exit_status = bones_main();
  return bones_exit_status;
}
|
|
31
|
+
|
|
File without changes
|