bones-compiler 1.1.0 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/CHANGELOG +37 -0
- data/LICENSE +1 -1
- data/README.rdoc +95 -70
- data/Rakefile +78 -3
- data/VERSION +1 -1
- data/bin/adarwin +17 -0
- data/examples/benchmarks/PolyBench/2mm.c +104 -0
- data/examples/benchmarks/{3mm.c → PolyBench/3mm.c} +5 -2
- data/examples/benchmarks/{adi.c → PolyBench/adi.c} +6 -3
- data/examples/benchmarks/{atax.c → PolyBench/atax.c} +5 -2
- data/examples/benchmarks/{bicg.c → PolyBench/bicg.c} +5 -2
- data/examples/benchmarks/{cholesky.c → PolyBench/cholesky.c} +3 -0
- data/examples/benchmarks/{common.h → PolyBench/common.h} +2 -2
- data/examples/benchmarks/{correlation.c → PolyBench/correlation.c} +16 -7
- data/examples/benchmarks/{covariance.c → PolyBench/covariance.c} +7 -2
- data/examples/benchmarks/{doitgen.c → PolyBench/doitgen.c} +5 -2
- data/examples/benchmarks/{durbin.c → PolyBench/durbin.c} +3 -0
- data/examples/benchmarks/{dynprog.c → PolyBench/dynprog.c} +3 -0
- data/examples/benchmarks/{fdtd-2d-apml.c → PolyBench/fdtd-2d-apml.c} +3 -0
- data/examples/benchmarks/{fdtd-2d.c → PolyBench/fdtd-2d.c} +5 -2
- data/examples/benchmarks/{floyd-warshall.c → PolyBench/floyd-warshall.c} +3 -0
- data/examples/benchmarks/{gemm.c → PolyBench/gemm.c} +5 -2
- data/examples/benchmarks/{gemver.c → PolyBench/gemver.c} +5 -2
- data/examples/benchmarks/{gesummv.c → PolyBench/gesummv.c} +5 -2
- data/examples/benchmarks/{gramschmidt.c → PolyBench/gramschmidt.c} +3 -0
- data/examples/benchmarks/{jacobi-1d-imper.c → PolyBench/jacobi-1d-imper.c} +10 -2
- data/examples/benchmarks/{jacobi-2d-imper.c → PolyBench/jacobi-2d-imper.c} +8 -3
- data/examples/benchmarks/{lu.c → PolyBench/lu.c} +3 -0
- data/examples/benchmarks/{ludcmp.c → PolyBench/ludcmp.c} +3 -0
- data/examples/benchmarks/{mvt.c → PolyBench/mvt.c} +6 -2
- data/examples/benchmarks/{reg_detect.c → PolyBench/reg_detect.c} +3 -0
- data/examples/benchmarks/{seidel-2d.c → PolyBench/seidel-2d.c} +3 -0
- data/examples/benchmarks/{symm.c → PolyBench/symm.c} +3 -0
- data/examples/benchmarks/{syr2k.c → PolyBench/syr2k.c} +5 -2
- data/examples/benchmarks/{syrk.c → PolyBench/syrk.c} +7 -4
- data/examples/benchmarks/{trisolv.c → PolyBench/trisolv.c} +3 -0
- data/examples/benchmarks/{trmm.c → PolyBench/trmm.c} +3 -0
- data/examples/benchmarks/Rodinia/cfd.c +180 -0
- data/examples/benchmarks/Rodinia/hotspot.c +228 -0
- data/examples/benchmarks/Rodinia/kmeans.c +164 -0
- data/examples/benchmarks/Rodinia/srad.c +188 -0
- data/examples/benchmarks/other/common.h +0 -0
- data/examples/benchmarks/other/dct.c +58 -0
- data/examples/benchmarks/other/mm.c +50 -0
- data/examples/benchmarks/{saxpy.c → other/saxpy.c} +11 -7
- data/examples/chunk/{example1.c → example01.c} +0 -0
- data/examples/chunk/{example2.c → example02.c} +0 -0
- data/examples/chunk/{example3.c → example03.c} +0 -0
- data/examples/chunk/{example4.c → example04.c} +0 -0
- data/examples/chunk/{example5.c → example05.c} +0 -0
- data/examples/chunk/example06.c +45 -0
- data/examples/chunk/example07.c +49 -0
- data/examples/dependences/example01.c +42 -0
- data/examples/dependences/example02.c +40 -0
- data/examples/dependences/example03.c +43 -0
- data/examples/dependences/example04.c +44 -0
- data/examples/dependences/example05.c +42 -0
- data/examples/element/{example1.c → example01.c} +0 -0
- data/examples/element/{example2.c → example02.c} +2 -2
- data/examples/element/{example3.c → example03.c} +0 -0
- data/examples/element/{example4.c → example04.c} +0 -0
- data/examples/element/{example5.c → example05.c} +0 -0
- data/examples/element/{example6.c → example06.c} +0 -0
- data/examples/element/{example7.c → example07.c} +0 -0
- data/examples/element/{example8.c → example08.c} +0 -0
- data/examples/element/{example9.c → example09.c} +0 -0
- data/examples/element/example13.c +73 -0
- data/examples/fusion/example01.c +68 -0
- data/examples/fusion/example02.c +73 -0
- data/examples/fusion/example03.c +72 -0
- data/examples/fusion/example04.c +61 -0
- data/examples/fusion/example05.c +55 -0
- data/examples/neighbourhood/{example1.c → example01.c} +0 -0
- data/examples/neighbourhood/{example2.c → example02.c} +0 -0
- data/examples/neighbourhood/{example3.c → example03.c} +0 -0
- data/examples/neighbourhood/{example4.c → example04.c} +0 -0
- data/examples/neighbourhood/example05.c +44 -0
- data/examples/shared/{example1.c → example01.c} +0 -0
- data/examples/shared/{example2.c → example02.c} +0 -0
- data/examples/shared/{example3.c → example03.c} +0 -0
- data/examples/shared/{example4.c → example04.c} +0 -0
- data/examples/shared/{example5.c → example05.c} +0 -0
- data/lib/adarwin.rb +62 -0
- data/lib/adarwin/dependences.rb +268 -0
- data/lib/adarwin/engine.rb +277 -0
- data/lib/adarwin/fusion.rb +174 -0
- data/lib/adarwin/interval.rb +57 -0
- data/lib/adarwin/memorycopies.rb +153 -0
- data/lib/adarwin/nest.rb +225 -0
- data/lib/adarwin/preprocessor.rb +76 -0
- data/lib/adarwin/reference.rb +261 -0
- data/lib/bones.rb +4 -55
- data/lib/bones/algorithm.rb +77 -40
- data/lib/bones/copy.rb +26 -0
- data/lib/bones/engine.rb +147 -31
- data/lib/bones/preprocessor.rb +92 -12
- data/lib/bones/species.rb +4 -3
- data/lib/bones/structure.rb +14 -4
- data/lib/castaddon.rb +11 -6
- data/lib/castaddon/node_adarwin.rb +245 -0
- data/lib/castaddon/node_bones.rb +316 -0
- data/lib/castaddon/node_common.rb +289 -0
- data/lib/castaddon/transformations.rb +236 -0
- data/lib/common.rb +216 -0
- data/skeletons/CPU-C/common/header.c +3 -0
- data/skeletons/CPU-C/common/mem_global.c +0 -0
- data/skeletons/CPU-C/common/timer_2_start.c +11 -13
- data/skeletons/CPU-C/common/timer_2_stop.c +1 -1
- data/skeletons/CPU-C/common/timer_globals.c +29 -0
- data/skeletons/CPU-OPENCL-INTEL/common/globals.c +1 -1
- data/skeletons/CPU-OPENCL-INTEL/common/header.c +3 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_D2H.c +7 -2
- data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_H2D.c +4 -2
- data/skeletons/CPU-OPENCL-INTEL/common/mem_global.c +0 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_prologue.c +6 -3
- data/skeletons/CPU-OPENCL-INTEL/common/timer_2_stop.c +1 -1
- data/skeletons/CPU-OPENCL-INTEL/common/timer_globals.c +24 -0
- data/skeletons/CPU-OPENMP/common/globals.c +1 -0
- data/skeletons/CPU-OPENMP/common/header.c +3 -0
- data/skeletons/CPU-OPENMP/common/mem_global.c +0 -0
- data/skeletons/CPU-OPENMP/common/timer_1_start.c +0 -12
- data/skeletons/CPU-OPENMP/common/timer_2_stop.c +1 -1
- data/skeletons/CPU-OPENMP/common/timer_globals.c +33 -0
- data/skeletons/GPU-CUDA/common/globals.c +27 -3
- data/skeletons/GPU-CUDA/common/header.c +2 -0
- data/skeletons/GPU-CUDA/common/mem_async_alloc.c +6 -0
- data/skeletons/GPU-CUDA/common/mem_async_copyin.c +6 -0
- data/skeletons/GPU-CUDA/common/mem_async_copyout.c +6 -0
- data/skeletons/GPU-CUDA/common/mem_async_free.c +6 -0
- data/skeletons/GPU-CUDA/common/mem_copy_D2H.c +2 -1
- data/skeletons/GPU-CUDA/common/mem_copy_H2D.c +2 -1
- data/skeletons/GPU-CUDA/common/mem_global.c +1 -0
- data/skeletons/GPU-CUDA/common/mem_prologue.c +1 -2
- data/skeletons/GPU-CUDA/common/scheduler.c +86 -0
- data/skeletons/GPU-CUDA/common/timer_2_start.c +2 -4
- data/skeletons/GPU-CUDA/common/timer_2_stop.c +3 -5
- data/skeletons/GPU-CUDA/common/timer_globals.c +26 -0
- data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.kernel.cu +5 -7
- data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.kernel.cu +4 -6
- data/skeletons/GPU-CUDA/kernel/default.host.c +1 -1
- data/skeletons/GPU-CUDA/kernel/default.kernel.cu +6 -8
- data/skeletons/GPU-CUDA/skeletons.txt +6 -5
- data/{examples/benchmarks/2mm.c → test/examples/benchmarks/PolyBench/2mm_species.c} +19 -15
- data/test/examples/benchmarks/PolyBench/3mm_species.c +82 -0
- data/test/examples/benchmarks/PolyBench/adi_species.c +89 -0
- data/test/examples/benchmarks/PolyBench/atax_species.c +69 -0
- data/test/examples/benchmarks/PolyBench/bicg_species.c +71 -0
- data/test/examples/benchmarks/PolyBench/cholesky_species.c +68 -0
- data/test/examples/benchmarks/PolyBench/correlation_species.c +97 -0
- data/test/examples/benchmarks/PolyBench/covariance_species.c +78 -0
- data/test/examples/benchmarks/PolyBench/doitgen_species.c +67 -0
- data/test/examples/benchmarks/PolyBench/durbin_species.c +80 -0
- data/test/examples/benchmarks/PolyBench/dynprog_species.c +71 -0
- data/test/examples/benchmarks/PolyBench/fdtd-2d-apml_species.c +112 -0
- data/test/examples/benchmarks/PolyBench/fdtd-2d_species.c +78 -0
- data/test/examples/benchmarks/PolyBench/floyd-warshall_species.c +54 -0
- data/test/examples/benchmarks/PolyBench/gemm_species.c +73 -0
- data/test/examples/benchmarks/PolyBench/gemver_species.c +93 -0
- data/test/examples/benchmarks/PolyBench/gesummv_species.c +68 -0
- data/test/examples/benchmarks/PolyBench/gramschmidt_species.c +78 -0
- data/test/examples/benchmarks/PolyBench/jacobi-1d-imper_species.c +59 -0
- data/test/examples/benchmarks/PolyBench/jacobi-2d-imper_species.c +65 -0
- data/test/examples/benchmarks/PolyBench/lu_species.c +57 -0
- data/test/examples/benchmarks/PolyBench/ludcmp_species.c +89 -0
- data/test/examples/benchmarks/PolyBench/mvt_species.c +69 -0
- data/test/examples/benchmarks/PolyBench/reg_detect_species.c +86 -0
- data/test/examples/benchmarks/PolyBench/seidel-2d_species.c +53 -0
- data/test/examples/benchmarks/PolyBench/symm_species.c +74 -0
- data/test/examples/benchmarks/PolyBench/syr2k_species.c +69 -0
- data/test/examples/benchmarks/PolyBench/syrk_species.c +66 -0
- data/test/examples/benchmarks/PolyBench/trisolv_species.c +61 -0
- data/test/examples/benchmarks/PolyBench/trmm_species.c +61 -0
- data/test/examples/chunk/example01_species.c +58 -0
- data/test/examples/chunk/example02_species.c +48 -0
- data/test/examples/chunk/example03_species.c +63 -0
- data/test/examples/chunk/example04_species.c +58 -0
- data/test/examples/chunk/example05_species.c +56 -0
- data/test/examples/chunk/example06_species.c +49 -0
- data/test/examples/chunk/example07_species.c +53 -0
- data/test/examples/dependences/example01_species.c +46 -0
- data/test/examples/dependences/example02_species.c +44 -0
- data/test/examples/dependences/example03_species.c +47 -0
- data/test/examples/dependences/example04_species.c +48 -0
- data/test/examples/dependences/example05_species.c +46 -0
- data/test/examples/element/example01_species.c +50 -0
- data/test/examples/element/example02_species.c +50 -0
- data/test/examples/element/example03_species.c +62 -0
- data/test/examples/element/example04_species.c +53 -0
- data/test/examples/element/example05_species.c +59 -0
- data/test/examples/element/example06_species.c +50 -0
- data/test/examples/element/example07_species.c +58 -0
- data/test/examples/element/example08_species.c +49 -0
- data/test/examples/element/example09_species.c +52 -0
- data/test/examples/element/example10_species.c +54 -0
- data/test/examples/element/example11_species.c +51 -0
- data/test/examples/element/example12_species.c +60 -0
- data/test/examples/element/example13_species.c +77 -0
- data/test/examples/neighbourhood/example01_species.c +57 -0
- data/test/examples/neighbourhood/example02_species.c +56 -0
- data/test/examples/neighbourhood/example03_species.c +83 -0
- data/test/examples/neighbourhood/example04_species.c +55 -0
- data/test/examples/neighbourhood/example05_species.c +48 -0
- data/test/examples/shared/example01_species.c +49 -0
- data/test/examples/shared/example02_species.c +55 -0
- data/test/examples/shared/example03_species.c +59 -0
- data/test/examples/shared/example04_species.c +56 -0
- data/test/examples/shared/example05_species.c +52 -0
- metadata +193 -73
- data/examples/benchmarks/overview.txt +0 -38
- data/lib/castaddon/node.rb +0 -753
|
@@ -1,24 +1,42 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
1
|
+
|
|
2
|
+
////////////////////////////////////////
|
|
3
|
+
//////////// Globals ///////////////////
|
|
4
|
+
////////////////////////////////////////
|
|
3
5
|
|
|
4
6
|
#define BONES_MIN(a,b) ((a<b) ? a : b)
|
|
5
7
|
#define BONES_MAX(a,b) ((a>b) ? a : b)
|
|
6
8
|
#define DIV_CEIL(a,b) ((a+b-1)/b)
|
|
7
9
|
#define DIV_FLOOR(a,b) (a/b)
|
|
8
10
|
|
|
9
|
-
//
|
|
11
|
+
// CUDA timers
|
|
12
|
+
cudaEvent_t bones_start2;
|
|
13
|
+
cudaEvent_t bones_stop2;
|
|
14
|
+
|
|
15
|
+
// Function to initialize the GPU (for fair measurements, streams, timers)
|
|
10
16
|
void bones_initialize_target(void) {
|
|
11
17
|
int* bones_temporary = 0;
|
|
12
18
|
cudaMalloc((void**)&bones_temporary, sizeof(int));
|
|
13
19
|
cudaFree(bones_temporary);
|
|
20
|
+
cudaStreamCreate(&kernel_stream);
|
|
21
|
+
cudaEventCreate(&bones_start2);
|
|
22
|
+
cudaEventCreate(&bones_stop2);
|
|
14
23
|
}
|
|
15
24
|
|
|
16
25
|
// Declaration of the original function
|
|
17
26
|
int bones_main(void);
|
|
18
27
|
|
|
28
|
+
////////////////////////////////////////
|
|
29
|
+
//////////// Main function /////////////
|
|
30
|
+
////////////////////////////////////////
|
|
31
|
+
|
|
19
32
|
// New main function for initialisation and clean-up
|
|
20
33
|
int main(void) {
|
|
21
34
|
|
|
35
|
+
// Initialisation of the scheduler
|
|
36
|
+
bones_initialize_scheduler();
|
|
37
|
+
pthread_t bones_scheduler_thread;
|
|
38
|
+
pthread_create(&bones_scheduler_thread, NULL, bones_scheduler, NULL);
|
|
39
|
+
|
|
22
40
|
// Initialisation of the target
|
|
23
41
|
bones_initialize_target();
|
|
24
42
|
|
|
@@ -26,6 +44,12 @@ int main(void) {
|
|
|
26
44
|
int bones_return = bones_main();
|
|
27
45
|
|
|
28
46
|
// Clean-up
|
|
47
|
+
bones_scheduler_done = 1;
|
|
48
|
+
pthread_join(bones_scheduler_thread, NULL);
|
|
49
|
+
cudaStreamDestroy(kernel_stream);
|
|
29
50
|
return bones_return;
|
|
30
51
|
}
|
|
31
52
|
|
|
53
|
+
////////////////////////////////////////
|
|
54
|
+
////////// Accelerated functions ///////
|
|
55
|
+
////////////////////////////////////////
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
|
|
2
|
+
// Copy <array> from device to host
|
|
3
|
+
void bones_copy<direction>_<id>_<array>(<definition>) {
|
|
4
|
+
cudaStreamSynchronize(kernel_stream);
|
|
5
|
+
bones_memcpy(<array><flatten>+<offset>, device_<array>+<offset>, <variable_dimensions>*sizeof(<type>), cudaMemcpyDeviceToHost, <state>, <index>);
|
|
6
|
+
}
|
|
@@ -1,3 +1,4 @@
|
|
|
1
1
|
|
|
2
2
|
// Copy <array> from device to host
|
|
3
|
-
|
|
3
|
+
bones_memcpy(<array><flatten>+<offset>, device_<array>+<offset>, <variable_dimensions>*sizeof(<type>), cudaMemcpyDeviceToHost, <state>, <state>);
|
|
4
|
+
bones_synchronize(<state>);
|
|
@@ -1,3 +1,4 @@
|
|
|
1
1
|
|
|
2
2
|
// Copy <array> to the device
|
|
3
|
-
|
|
3
|
+
bones_memcpy(device_<array>, <array><flatten>, <variable_dimensions>*sizeof(<type>), cudaMemcpyHostToDevice, <state>, <state>);
|
|
4
|
+
bones_synchronize(<state>);
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
<type>* device_<array>;
|
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
|
|
2
2
|
// Create space for <array> on the device
|
|
3
|
-
<type>* device_<array> = 0;
|
|
4
3
|
cudaMalloc((void**)&device_<array>, <variable_dimensions>*sizeof(<type>));
|
|
5
|
-
cudaMemset((void*)device_<array>, 0, <variable_dimensions>*sizeof(<type>));
|
|
4
|
+
//cudaMemset((void*)device_<array>, 0, <variable_dimensions>*sizeof(<type>));
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
|
|
2
|
+
#include <stdio.h>
#include <pthread.h>
#include <unistd.h>
|
|
4
|
+
|
|
5
|
+
////////////////////////////////////////
|
|
6
|
+
////////// Thread scheduler ////////////
|
|
7
|
+
////////////////////////////////////////
|
|
8
|
+
|
|
9
|
+
// Memory copy and kernel streams
|
|
10
|
+
cudaStream_t kernel_stream;
|
|
11
|
+
cudaStream_t memory_stream;
|
|
12
|
+
|
|
13
|
+
// Task structure
|
|
14
|
+
typedef struct {
|
|
15
|
+
void *dst;
|
|
16
|
+
void *src;
|
|
17
|
+
int size;
|
|
18
|
+
enum cudaMemcpyKind direction;
|
|
19
|
+
int deadline;
|
|
20
|
+
volatile int status;
|
|
21
|
+
} Task;
|
|
22
|
+
|
|
23
|
+
// Task list
|
|
24
|
+
#define BONES_MAX_TASKS 100
|
|
25
|
+
Task tasks[BONES_MAX_TASKS];
|
|
26
|
+
|
|
27
|
+
// Scheduler status
|
|
28
|
+
volatile int bones_scheduler_done;
|
|
29
|
+
|
|
30
|
+
// Create synchronisation points
|
|
31
|
+
void bones_synchronize(int deadline) {
|
|
32
|
+
cudaStreamSynchronize(kernel_stream);
|
|
33
|
+
printf("Reached: syncpoint %d [worker]\n",deadline); fflush(stdout);
|
|
34
|
+
for (int t = 0; t <= BONES_MAX_TASKS; t++) {
|
|
35
|
+
if (tasks[t].deadline == deadline && tasks[t].status == 1) {
|
|
36
|
+
while(tasks[t].status != 2) { }
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
printf("Reached: syncpoint %d [all]\n",deadline); fflush(stdout);
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
// Add a new task
|
|
43
|
+
void bones_memcpy(void *dst, void *src, int size, enum cudaMemcpyKind direction, int deadline, int task_id) {
|
|
44
|
+
Task new_task = { .dst = dst, .src = src, .size = size, .direction = direction, .deadline = deadline, .status = 1 };
|
|
45
|
+
tasks[task_id] = new_task;
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
// Perform a task (CUDA memory copy)
|
|
49
|
+
// Perform one task: copy current_task.size bytes from current_task.src to
// current_task.dst on memory_stream and wait until the copy has finished.
//
//   current_task: the task to execute (passed by value)
//
// Failures reported by the CUDA runtime are printed to stderr instead of being
// dropped silently; the function itself still returns normally.
void bones_scheduler_copy(Task current_task) {
  // NOTE(review): the reason for this 400 microsecond delay is not evident
  // from this file; it is kept unchanged.
  usleep(400);

  cudaError_t copy_result = cudaMemcpyAsync(current_task.dst, current_task.src, current_task.size, current_task.direction, memory_stream);
  cudaError_t sync_result = cudaStreamSynchronize(memory_stream);

  if (copy_result != cudaSuccess) {
    fprintf(stderr, "bones_scheduler_copy: cudaMemcpyAsync failed: %s\n", cudaGetErrorString(copy_result));
  }
  if (sync_result != cudaSuccess) {
    fprintf(stderr, "bones_scheduler_copy: cudaStreamSynchronize failed: %s\n", cudaGetErrorString(sync_result));
  }
}
|
|
54
|
+
|
|
55
|
+
// Initialize the scheduler
|
|
56
|
+
void bones_initialize_scheduler(void) {
|
|
57
|
+
bones_scheduler_done = 0;
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
// The scheduler (infinite loop)
|
|
61
|
+
#define LARGE_INT 1000
|
|
62
|
+
void* bones_scheduler(void* ptr) {
|
|
63
|
+
cudaStreamCreate(&memory_stream);
|
|
64
|
+
while (bones_scheduler_done != 1) {
|
|
65
|
+
|
|
66
|
+
// Find the ready task with the earliest deadline
|
|
67
|
+
int found_deadline = LARGE_INT;
|
|
68
|
+
int found_task = LARGE_INT;
|
|
69
|
+
for (int t = 0; t <= BONES_MAX_TASKS; t++) {
|
|
70
|
+
if (tasks[t].status == 1) {
|
|
71
|
+
if (tasks[t].deadline < found_deadline) {
|
|
72
|
+
found_task = t;
|
|
73
|
+
found_deadline = tasks[t].deadline;
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
// Perform the found task
|
|
79
|
+
if (found_task != LARGE_INT) {
|
|
80
|
+
printf("Performing task %d, dl %d [scheduler]\n",found_task,tasks[found_task].deadline);
|
|
81
|
+
bones_scheduler_copy(tasks[found_task]);
|
|
82
|
+
tasks[found_task].status = 2;
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
cudaStreamDestroy(memory_stream);
|
|
86
|
+
}
|
|
@@ -1,6 +1,4 @@
|
|
|
1
1
|
|
|
2
2
|
// Start the timer for the measurement of the kernel execution time
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
cudaEventCreate(&bones_start2);
|
|
6
|
-
cudaEventRecord(bones_start2,0);
|
|
3
|
+
//cudaStreamSynchronize(kernel_stream);
|
|
4
|
+
cudaEventRecord(bones_start2,kernel_stream);
|
|
@@ -1,10 +1,8 @@
|
|
|
1
1
|
|
|
2
2
|
// Stop the timer for the measurement of the kernel execution time
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
cudaEventCreate(&bones_stop2);
|
|
6
|
-
cudaEventRecord(bones_stop2,0);
|
|
3
|
+
//cudaStreamSynchronize(kernel_stream);
|
|
4
|
+
cudaEventRecord(bones_stop2,kernel_stream);
|
|
7
5
|
cudaEventSynchronize(bones_stop2);
|
|
8
6
|
float bones_timer2 = 0;
|
|
9
7
|
cudaEventElapsedTime(&bones_timer2,bones_start2,bones_stop2);
|
|
10
|
-
printf(">>>\t\t
|
|
8
|
+
printf(">>>\t\t Execution time [kernel <algorithm_basename>]: %.3lf ms \n", bones_timer2);
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
|
|
2
|
+
////////////////////////////////////////
|
|
3
|
+
//////////// Timers ////////////////////
|
|
4
|
+
////////////////////////////////////////
|
|
5
|
+
|
|
6
|
+
// Timer
|
|
7
|
+
cudaEvent_t bones_start1;
|
|
8
|
+
|
|
9
|
+
// Start the timer for the measurement of the whole scop
|
|
10
|
+
void bones_timer_start() {
|
|
11
|
+
cudaDeviceSynchronize();
|
|
12
|
+
cudaEventCreate(&bones_start1);
|
|
13
|
+
cudaEventRecord(bones_start1,kernel_stream);
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
// End the timer for the measurement of the whole scop
|
|
17
|
+
void bones_timer_stop() {
|
|
18
|
+
cudaDeviceSynchronize();
|
|
19
|
+
cudaEvent_t bones_stop1;
|
|
20
|
+
cudaEventCreate(&bones_stop1);
|
|
21
|
+
cudaEventRecord(bones_stop1,kernel_stream);
|
|
22
|
+
cudaEventSynchronize(bones_stop1);
|
|
23
|
+
float bones_timer1 = 0;
|
|
24
|
+
cudaEventElapsedTime(&bones_timer1,bones_start1,bones_stop1);
|
|
25
|
+
printf(">>>\t\t Execution time [full scop]: %.3lf ms \n", bones_timer1);
|
|
26
|
+
}
|
|
@@ -72,11 +72,9 @@ __global__ void bones_kernel_<algorithm_name>_2(<in1_type><in1_devicepointer> <i
|
|
|
72
72
|
// Function to start the kernel
|
|
73
73
|
extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
|
|
74
74
|
int bones_block_size;
|
|
75
|
-
if (<parallelism> >= 64*512) { bones_block_size = 512;}
|
|
76
|
-
else if (<parallelism> >= 64*256) { bones_block_size = 256;}
|
|
77
|
-
else
|
|
78
|
-
else if (<parallelism> >= 64*64 ) { bones_block_size = 64; }
|
|
79
|
-
else { bones_block_size = 32; }
|
|
75
|
+
if (<parallelism> >= 64*512 ) { bones_block_size = 512; }
|
|
76
|
+
else if (<parallelism> >= 64*256 ) { bones_block_size = 256; }
|
|
77
|
+
else { bones_block_size = 128; }
|
|
80
78
|
|
|
81
79
|
// First perform some pre-shuffling (for the first input)
|
|
82
80
|
<in0_type>* shuffled_<in0_name> = 0;
|
|
@@ -86,7 +84,7 @@ extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argumen
|
|
|
86
84
|
bones_kernel_<algorithm_name>_1<<< bones_grid1, bones_threads1 >>>(<in0_name>, shuffled_<in0_name>, <argument_name>);
|
|
87
85
|
<in0_type>* temp_<in0_name> = <in0_name>;
|
|
88
86
|
<in0_name> = shuffled_<in0_name>;
|
|
89
|
-
cudaFree(temp_<in0_name>);
|
|
87
|
+
//cudaFree(temp_<in0_name>);
|
|
90
88
|
|
|
91
89
|
// First perform some pre-shuffling (for the second input)
|
|
92
90
|
<in0_type>* shuffled_<in1_name> = 0;
|
|
@@ -96,7 +94,7 @@ extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argumen
|
|
|
96
94
|
bones_kernel_<algorithm_name>_2<<< bones_grid2, bones_threads2 >>>(<in1_name>, shuffled_<in1_name>, <argument_name>);
|
|
97
95
|
<in1_type>* temp_<in1_name> = <in1_name>;
|
|
98
96
|
<in1_name> = shuffled_<in1_name>;
|
|
99
|
-
cudaFree(temp_<in1_name>);
|
|
97
|
+
//cudaFree(temp_<in1_name>);
|
|
100
98
|
|
|
101
99
|
// Then run the original kernel
|
|
102
100
|
dim3 bones_threads0(bones_block_size);
|
|
@@ -46,11 +46,9 @@ __global__ void bones_kernel_<algorithm_name>_1(<in0_type><in0_devicepointer> <i
|
|
|
46
46
|
// Function to start the kernel
|
|
47
47
|
extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
|
|
48
48
|
int bones_block_size;
|
|
49
|
-
if (<parallelism> >= 64*512) { bones_block_size = 512;}
|
|
50
|
-
else if (<parallelism> >= 64*256) { bones_block_size = 256;}
|
|
51
|
-
else
|
|
52
|
-
else if (<parallelism> >= 64*64 ) { bones_block_size = 64; }
|
|
53
|
-
else { bones_block_size = 32; }
|
|
49
|
+
if (<parallelism> >= 64*512 ) { bones_block_size = 512; }
|
|
50
|
+
else if (<parallelism> >= 64*256 ) { bones_block_size = 256; }
|
|
51
|
+
else { bones_block_size = 128; }
|
|
54
52
|
|
|
55
53
|
// First perform some pre-shuffling
|
|
56
54
|
<in0_type>* shuffled_<in0_name> = 0;
|
|
@@ -60,7 +58,7 @@ extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argumen
|
|
|
60
58
|
bones_kernel_<algorithm_name>_1<<< bones_grid1, bones_threads1 >>>(<in0_name>, shuffled_<in0_name>, <argument_name>);
|
|
61
59
|
<in0_type>* temp_<in0_name> = <in0_name>;
|
|
62
60
|
<in0_name> = shuffled_<in0_name>;
|
|
63
|
-
cudaFree(temp_<in0_name>);
|
|
61
|
+
//cudaFree(temp_<in0_name>);
|
|
64
62
|
|
|
65
63
|
// Then run the original kernel
|
|
66
64
|
dim3 bones_threads0(bones_block_size);
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
/* STARTDEF
|
|
2
|
-
void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>);
|
|
2
|
+
void bones_prekernel_<algorithm_name>_0(cudaStream_t kernel_stream, <devicedefinitions>, <argument_definition>);
|
|
3
3
|
ENDDEF */
|
|
4
4
|
// Start of the <algorithm_name> kernel
|
|
5
5
|
__global__ void bones_kernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
|
|
@@ -15,14 +15,12 @@ __global__ void bones_kernel_<algorithm_name>_0(<devicedefinitions>, <argument_d
|
|
|
15
15
|
}
|
|
16
16
|
|
|
17
17
|
// Function to start the kernel
|
|
18
|
-
extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
|
|
18
|
+
extern "C" void bones_prekernel_<algorithm_name>_0(cudaStream_t kernel_stream, <devicedefinitions>, <argument_definition>) {
|
|
19
19
|
int bones_block_size;
|
|
20
|
-
if (<parallelism> >= 64*512) { bones_block_size = 512;}
|
|
21
|
-
else if (<parallelism> >= 64*256) { bones_block_size = 256;}
|
|
22
|
-
else
|
|
23
|
-
else if (<parallelism> >= 64*64 ) { bones_block_size = 64; }
|
|
24
|
-
else { bones_block_size = 32; }
|
|
20
|
+
if (<parallelism> >= 64*512 ) { bones_block_size = 512; }
|
|
21
|
+
else if (<parallelism> >= 64*256 ) { bones_block_size = 256; }
|
|
22
|
+
else { bones_block_size = 128; }
|
|
25
23
|
dim3 bones_threads(bones_block_size);
|
|
26
24
|
dim3 bones_grid(DIV_CEIL(<parallelism>,bones_block_size));
|
|
27
|
-
bones_kernel_<algorithm_name>_0<<< bones_grid, bones_threads >>>(<names>, <argument_name>);
|
|
25
|
+
bones_kernel_<algorithm_name>_0<<< bones_grid, bones_threads, 0, kernel_stream >>>(<names>, <argument_name>);
|
|
28
26
|
}
|
|
@@ -19,12 +19,13 @@ N,N|chunk(D)+ ^ N,N|element+ -> N,N|element+ :defa
|
|
|
19
19
|
D|chunk(D)+ -> D|element+ :default :00
|
|
20
20
|
D|chunk(D)+ ^ D|neighbourhood(D)+ ^ D|element+ -> D|element+ :default :00
|
|
21
21
|
D|chunk(D)+ ^ D|element+ -> D|element+ :default :00
|
|
22
|
-
|
|
23
|
-
D|neighbourhood(D)+
|
|
24
|
-
D|neighbourhood(D)+ ^ D|element+ -> D|element+ :default :00
|
|
22
|
+
D|neighbourhood(D)+ -> D|element+ :default :40
|
|
23
|
+
D|neighbourhood(D)+ ^ D|element+ -> D|element+ :default :40
|
|
25
24
|
D|element+ -> D|chunk(D)+ :default :00
|
|
26
|
-
D|element+ -> D|element+ :default :
|
|
25
|
+
D|element+ -> D|element+ :default :40
|
|
27
26
|
D|element -> 1|shared :D-element-to-1-shared :02 03 04 05
|
|
28
27
|
D|element+ -> D|shared+ :default :08
|
|
29
28
|
D|element+ -> D|element+ ^ D|shared+ :default :08
|
|
30
|
-
D|void -> D|element+ :default :
|
|
29
|
+
D|void -> D|element+ :default :40
|
|
30
|
+
|
|
31
|
+
N|neighbourhood(N)+ -> N|element+ :N-neighbourhood-N-to-N-element :10
|
|
@@ -42,26 +42,30 @@ int main(void) {
|
|
|
42
42
|
for (i=0; i<NI; i++) { for (j=0; j<NL; j++) { D[i][j] = ((float) i*(j+2)) / NK; } }
|
|
43
43
|
|
|
44
44
|
// Perform the computation (E := alpha*A*B*C + beta*D)
|
|
45
|
-
#pragma
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
for (
|
|
50
|
-
tmp[i][j]
|
|
45
|
+
#pragma scop
|
|
46
|
+
{
|
|
47
|
+
#pragma species kernel A[0:NI-1,0:NK-1]|chunk(0:0,0:NK-1) ^ B[0:NK-1,0:NJ-1]|chunk(0:NK-1,0:0) -> tmp[0:NI-1,0:NJ-1]|element
|
|
48
|
+
for (i = 0; i < NI; i++) {
|
|
49
|
+
for (j = 0; j < NJ; j++) {
|
|
50
|
+
tmp[i][j] = 0;
|
|
51
|
+
for (k = 0; k < NK; k++) {
|
|
52
|
+
tmp[i][j] += alpha * A[i][k] * B[k][j];
|
|
53
|
+
}
|
|
51
54
|
}
|
|
52
55
|
}
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
56
|
+
#pragma species endkernel 2mm_k1
|
|
57
|
+
#pragma species kernel D[0:NI-1,0:NL-1]|element ^ tmp[0:NI-1,0:NJ-1]|chunk(0:0,0:NJ-1) ^ C[0:NJ-1,0:NL-1]|chunk(0:NJ-1,0:0) -> D[0:NI-1,0:NL-1]|element
|
|
58
|
+
for (i = 0; i < NI; i++) {
|
|
59
|
+
for (j = 0; j < NL; j++) {
|
|
60
|
+
D[i][j] *= beta;
|
|
61
|
+
for (k = 0; k < NJ; k++) {
|
|
62
|
+
D[i][j] += tmp[i][k] * C[k][j];
|
|
63
|
+
}
|
|
61
64
|
}
|
|
62
65
|
}
|
|
66
|
+
#pragma species endkernel 2mm_k2
|
|
63
67
|
}
|
|
64
|
-
#pragma
|
|
68
|
+
#pragma endscop
|
|
65
69
|
|
|
66
70
|
// Clean-up and exit the function
|
|
67
71
|
fflush(stdout);
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
//
|
|
2
|
+
// This file is part of the Bones source-to-source compiler examples. The C-code
|
|
3
|
+
// is largely identical in terms of functionality and variable naming to the code
|
|
4
|
+
// found in PolyBench/C version 3.2. For more information on PolyBench/C or Bones
|
|
5
|
+
// please use the contact information below.
|
|
6
|
+
//
|
|
7
|
+
// == More information on PolyBench/C
|
|
8
|
+
// Contact............Louis-Noel Pouchet <pouchet@cse.ohio-state.edu>
|
|
9
|
+
// Web address........http://polybench.sourceforge.net/
|
|
10
|
+
//
|
|
11
|
+
// == More information on Bones
|
|
12
|
+
// Contact............Cedric Nugteren <c.nugteren@tue.nl>
|
|
13
|
+
// Web address........http://parse.ele.tue.nl/bones/
|
|
14
|
+
//
|
|
15
|
+
// == File information
|
|
16
|
+
// Filename...........benchmark/3mm.c
|
|
17
|
+
// Author.............Cedric Nugteren
|
|
18
|
+
// Last modified on...03-April-2012
|
|
19
|
+
//
|
|
20
|
+
|
|
21
|
+
#include "common.h"
|
|
22
|
+
|
|
23
|
+
// This is '3mm', a 3 matrix multiply kernel
|
|
24
|
+
int main(void) {
|
|
25
|
+
int i,j,k;
|
|
26
|
+
|
|
27
|
+
// Declare arrays on the stack
|
|
28
|
+
float A[NI][NK];
|
|
29
|
+
float B[NK][NJ];
|
|
30
|
+
float C[NJ][NM];
|
|
31
|
+
float D[NM][NL];
|
|
32
|
+
float E[NI][NJ];
|
|
33
|
+
float F[NJ][NL];
|
|
34
|
+
float G[NI][NL];
|
|
35
|
+
|
|
36
|
+
// Set the input data
|
|
37
|
+
for (i=0; i<NI; i++) { for (j=0; j<NK; j++) { A[i][j] = ((float) i*j) / NI; } }
|
|
38
|
+
for (i=0; i<NK; i++) { for (j=0; j<NJ; j++) { B[i][j] = ((float) i*(j+1)) / NJ; } }
|
|
39
|
+
for (i=0; i<NL; i++) { for (j=0; j<NJ; j++) { C[i][j] = ((float) i*(j+3)) / NL; } }
|
|
40
|
+
for (i=0; i<NI; i++) { for (j=0; j<NL; j++) { D[i][j] = ((float) i*(j+2)) / NK; } }
|
|
41
|
+
|
|
42
|
+
// Perform the computation (G := E*F, with E := A*B and F := C*D)
|
|
43
|
+
#pragma scop
|
|
44
|
+
{
|
|
45
|
+
#pragma species kernel A[0:NI-1,0:NK-1]|chunk(0:0,0:NK-1) ^ B[0:NK-1,0:NJ-1]|chunk(0:NK-1,0:0) -> E[0:NI-1,0:NJ-1]|element
|
|
46
|
+
for (i = 0; i < NI; i++) {
|
|
47
|
+
for (j = 0; j < NJ; j++) {
|
|
48
|
+
E[i][j] = 0;
|
|
49
|
+
for (k = 0; k < NK; k++) {
|
|
50
|
+
E[i][j] += A[i][k] * B[k][j];
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
#pragma species endkernel 3mm_k1
|
|
55
|
+
#pragma species kernel C[0:NJ-1,0:NM-1]|chunk(0:0,0:NM-1) ^ D[0:NM-1,0:NL-1]|chunk(0:NM-1,0:0) -> F[0:NJ-1,0:NL-1]|element
|
|
56
|
+
for (i = 0; i < NJ; i++) {
|
|
57
|
+
for (j = 0; j < NL; j++) {
|
|
58
|
+
F[i][j] = 0;
|
|
59
|
+
for (k = 0; k < NM; k++) {
|
|
60
|
+
F[i][j] += C[i][k] * D[k][j];
|
|
61
|
+
}
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
#pragma species endkernel 3mm_k2
|
|
65
|
+
#pragma species kernel E[0:NI-1,0:NJ-1]|chunk(0:0,0:NJ-1) ^ F[0:NJ-1,0:NL-1]|chunk(0:NJ-1,0:0) -> G[0:NI-1,0:NL-1]|element
|
|
66
|
+
for (i = 0; i < NI; i++) {
|
|
67
|
+
for (j = 0; j < NL; j++) {
|
|
68
|
+
G[i][j] = 0;
|
|
69
|
+
for (k = 0; k < NJ; k++) {
|
|
70
|
+
G[i][j] += E[i][k] * F[k][j];
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
#pragma species endkernel 3mm_k3
|
|
75
|
+
}
|
|
76
|
+
#pragma endscop
|
|
77
|
+
|
|
78
|
+
// Clean-up and exit the function
|
|
79
|
+
fflush(stdout);
|
|
80
|
+
return 0;
|
|
81
|
+
}
|
|
82
|
+
|