bones-compiler 1.1.0 → 1.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/CHANGELOG +37 -0
- data/LICENSE +1 -1
- data/README.rdoc +95 -70
- data/Rakefile +78 -3
- data/VERSION +1 -1
- data/bin/adarwin +17 -0
- data/examples/benchmarks/PolyBench/2mm.c +104 -0
- data/examples/benchmarks/{3mm.c → PolyBench/3mm.c} +5 -2
- data/examples/benchmarks/{adi.c → PolyBench/adi.c} +6 -3
- data/examples/benchmarks/{atax.c → PolyBench/atax.c} +5 -2
- data/examples/benchmarks/{bicg.c → PolyBench/bicg.c} +5 -2
- data/examples/benchmarks/{cholesky.c → PolyBench/cholesky.c} +3 -0
- data/examples/benchmarks/{common.h → PolyBench/common.h} +2 -2
- data/examples/benchmarks/{correlation.c → PolyBench/correlation.c} +16 -7
- data/examples/benchmarks/{covariance.c → PolyBench/covariance.c} +7 -2
- data/examples/benchmarks/{doitgen.c → PolyBench/doitgen.c} +5 -2
- data/examples/benchmarks/{durbin.c → PolyBench/durbin.c} +3 -0
- data/examples/benchmarks/{dynprog.c → PolyBench/dynprog.c} +3 -0
- data/examples/benchmarks/{fdtd-2d-apml.c → PolyBench/fdtd-2d-apml.c} +3 -0
- data/examples/benchmarks/{fdtd-2d.c → PolyBench/fdtd-2d.c} +5 -2
- data/examples/benchmarks/{floyd-warshall.c → PolyBench/floyd-warshall.c} +3 -0
- data/examples/benchmarks/{gemm.c → PolyBench/gemm.c} +5 -2
- data/examples/benchmarks/{gemver.c → PolyBench/gemver.c} +5 -2
- data/examples/benchmarks/{gesummv.c → PolyBench/gesummv.c} +5 -2
- data/examples/benchmarks/{gramschmidt.c → PolyBench/gramschmidt.c} +3 -0
- data/examples/benchmarks/{jacobi-1d-imper.c → PolyBench/jacobi-1d-imper.c} +10 -2
- data/examples/benchmarks/{jacobi-2d-imper.c → PolyBench/jacobi-2d-imper.c} +8 -3
- data/examples/benchmarks/{lu.c → PolyBench/lu.c} +3 -0
- data/examples/benchmarks/{ludcmp.c → PolyBench/ludcmp.c} +3 -0
- data/examples/benchmarks/{mvt.c → PolyBench/mvt.c} +6 -2
- data/examples/benchmarks/{reg_detect.c → PolyBench/reg_detect.c} +3 -0
- data/examples/benchmarks/{seidel-2d.c → PolyBench/seidel-2d.c} +3 -0
- data/examples/benchmarks/{symm.c → PolyBench/symm.c} +3 -0
- data/examples/benchmarks/{syr2k.c → PolyBench/syr2k.c} +5 -2
- data/examples/benchmarks/{syrk.c → PolyBench/syrk.c} +7 -4
- data/examples/benchmarks/{trisolv.c → PolyBench/trisolv.c} +3 -0
- data/examples/benchmarks/{trmm.c → PolyBench/trmm.c} +3 -0
- data/examples/benchmarks/Rodinia/cfd.c +180 -0
- data/examples/benchmarks/Rodinia/hotspot.c +228 -0
- data/examples/benchmarks/Rodinia/kmeans.c +164 -0
- data/examples/benchmarks/Rodinia/srad.c +188 -0
- data/examples/benchmarks/other/common.h +0 -0
- data/examples/benchmarks/other/dct.c +58 -0
- data/examples/benchmarks/other/mm.c +50 -0
- data/examples/benchmarks/{saxpy.c → other/saxpy.c} +11 -7
- data/examples/chunk/{example1.c → example01.c} +0 -0
- data/examples/chunk/{example2.c → example02.c} +0 -0
- data/examples/chunk/{example3.c → example03.c} +0 -0
- data/examples/chunk/{example4.c → example04.c} +0 -0
- data/examples/chunk/{example5.c → example05.c} +0 -0
- data/examples/chunk/example06.c +45 -0
- data/examples/chunk/example07.c +49 -0
- data/examples/dependences/example01.c +42 -0
- data/examples/dependences/example02.c +40 -0
- data/examples/dependences/example03.c +43 -0
- data/examples/dependences/example04.c +44 -0
- data/examples/dependences/example05.c +42 -0
- data/examples/element/{example1.c → example01.c} +0 -0
- data/examples/element/{example2.c → example02.c} +2 -2
- data/examples/element/{example3.c → example03.c} +0 -0
- data/examples/element/{example4.c → example04.c} +0 -0
- data/examples/element/{example5.c → example05.c} +0 -0
- data/examples/element/{example6.c → example06.c} +0 -0
- data/examples/element/{example7.c → example07.c} +0 -0
- data/examples/element/{example8.c → example08.c} +0 -0
- data/examples/element/{example9.c → example09.c} +0 -0
- data/examples/element/example13.c +73 -0
- data/examples/fusion/example01.c +68 -0
- data/examples/fusion/example02.c +73 -0
- data/examples/fusion/example03.c +72 -0
- data/examples/fusion/example04.c +61 -0
- data/examples/fusion/example05.c +55 -0
- data/examples/neighbourhood/{example1.c → example01.c} +0 -0
- data/examples/neighbourhood/{example2.c → example02.c} +0 -0
- data/examples/neighbourhood/{example3.c → example03.c} +0 -0
- data/examples/neighbourhood/{example4.c → example04.c} +0 -0
- data/examples/neighbourhood/example05.c +44 -0
- data/examples/shared/{example1.c → example01.c} +0 -0
- data/examples/shared/{example2.c → example02.c} +0 -0
- data/examples/shared/{example3.c → example03.c} +0 -0
- data/examples/shared/{example4.c → example04.c} +0 -0
- data/examples/shared/{example5.c → example05.c} +0 -0
- data/lib/adarwin.rb +62 -0
- data/lib/adarwin/dependences.rb +268 -0
- data/lib/adarwin/engine.rb +277 -0
- data/lib/adarwin/fusion.rb +174 -0
- data/lib/adarwin/interval.rb +57 -0
- data/lib/adarwin/memorycopies.rb +153 -0
- data/lib/adarwin/nest.rb +225 -0
- data/lib/adarwin/preprocessor.rb +76 -0
- data/lib/adarwin/reference.rb +261 -0
- data/lib/bones.rb +4 -55
- data/lib/bones/algorithm.rb +77 -40
- data/lib/bones/copy.rb +26 -0
- data/lib/bones/engine.rb +147 -31
- data/lib/bones/preprocessor.rb +92 -12
- data/lib/bones/species.rb +4 -3
- data/lib/bones/structure.rb +14 -4
- data/lib/castaddon.rb +11 -6
- data/lib/castaddon/node_adarwin.rb +245 -0
- data/lib/castaddon/node_bones.rb +316 -0
- data/lib/castaddon/node_common.rb +289 -0
- data/lib/castaddon/transformations.rb +236 -0
- data/lib/common.rb +216 -0
- data/skeletons/CPU-C/common/header.c +3 -0
- data/skeletons/CPU-C/common/mem_global.c +0 -0
- data/skeletons/CPU-C/common/timer_2_start.c +11 -13
- data/skeletons/CPU-C/common/timer_2_stop.c +1 -1
- data/skeletons/CPU-C/common/timer_globals.c +29 -0
- data/skeletons/CPU-OPENCL-INTEL/common/globals.c +1 -1
- data/skeletons/CPU-OPENCL-INTEL/common/header.c +3 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_D2H.c +7 -2
- data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_H2D.c +4 -2
- data/skeletons/CPU-OPENCL-INTEL/common/mem_global.c +0 -0
- data/skeletons/CPU-OPENCL-INTEL/common/mem_prologue.c +6 -3
- data/skeletons/CPU-OPENCL-INTEL/common/timer_2_stop.c +1 -1
- data/skeletons/CPU-OPENCL-INTEL/common/timer_globals.c +24 -0
- data/skeletons/CPU-OPENMP/common/globals.c +1 -0
- data/skeletons/CPU-OPENMP/common/header.c +3 -0
- data/skeletons/CPU-OPENMP/common/mem_global.c +0 -0
- data/skeletons/CPU-OPENMP/common/timer_1_start.c +0 -12
- data/skeletons/CPU-OPENMP/common/timer_2_stop.c +1 -1
- data/skeletons/CPU-OPENMP/common/timer_globals.c +33 -0
- data/skeletons/GPU-CUDA/common/globals.c +27 -3
- data/skeletons/GPU-CUDA/common/header.c +2 -0
- data/skeletons/GPU-CUDA/common/mem_async_alloc.c +6 -0
- data/skeletons/GPU-CUDA/common/mem_async_copyin.c +6 -0
- data/skeletons/GPU-CUDA/common/mem_async_copyout.c +6 -0
- data/skeletons/GPU-CUDA/common/mem_async_free.c +6 -0
- data/skeletons/GPU-CUDA/common/mem_copy_D2H.c +2 -1
- data/skeletons/GPU-CUDA/common/mem_copy_H2D.c +2 -1
- data/skeletons/GPU-CUDA/common/mem_global.c +1 -0
- data/skeletons/GPU-CUDA/common/mem_prologue.c +1 -2
- data/skeletons/GPU-CUDA/common/scheduler.c +86 -0
- data/skeletons/GPU-CUDA/common/timer_2_start.c +2 -4
- data/skeletons/GPU-CUDA/common/timer_2_stop.c +3 -5
- data/skeletons/GPU-CUDA/common/timer_globals.c +26 -0
- data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.kernel.cu +5 -7
- data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.kernel.cu +4 -6
- data/skeletons/GPU-CUDA/kernel/default.host.c +1 -1
- data/skeletons/GPU-CUDA/kernel/default.kernel.cu +6 -8
- data/skeletons/GPU-CUDA/skeletons.txt +6 -5
- data/{examples/benchmarks/2mm.c → test/examples/benchmarks/PolyBench/2mm_species.c} +19 -15
- data/test/examples/benchmarks/PolyBench/3mm_species.c +82 -0
- data/test/examples/benchmarks/PolyBench/adi_species.c +89 -0
- data/test/examples/benchmarks/PolyBench/atax_species.c +69 -0
- data/test/examples/benchmarks/PolyBench/bicg_species.c +71 -0
- data/test/examples/benchmarks/PolyBench/cholesky_species.c +68 -0
- data/test/examples/benchmarks/PolyBench/correlation_species.c +97 -0
- data/test/examples/benchmarks/PolyBench/covariance_species.c +78 -0
- data/test/examples/benchmarks/PolyBench/doitgen_species.c +67 -0
- data/test/examples/benchmarks/PolyBench/durbin_species.c +80 -0
- data/test/examples/benchmarks/PolyBench/dynprog_species.c +71 -0
- data/test/examples/benchmarks/PolyBench/fdtd-2d-apml_species.c +112 -0
- data/test/examples/benchmarks/PolyBench/fdtd-2d_species.c +78 -0
- data/test/examples/benchmarks/PolyBench/floyd-warshall_species.c +54 -0
- data/test/examples/benchmarks/PolyBench/gemm_species.c +73 -0
- data/test/examples/benchmarks/PolyBench/gemver_species.c +93 -0
- data/test/examples/benchmarks/PolyBench/gesummv_species.c +68 -0
- data/test/examples/benchmarks/PolyBench/gramschmidt_species.c +78 -0
- data/test/examples/benchmarks/PolyBench/jacobi-1d-imper_species.c +59 -0
- data/test/examples/benchmarks/PolyBench/jacobi-2d-imper_species.c +65 -0
- data/test/examples/benchmarks/PolyBench/lu_species.c +57 -0
- data/test/examples/benchmarks/PolyBench/ludcmp_species.c +89 -0
- data/test/examples/benchmarks/PolyBench/mvt_species.c +69 -0
- data/test/examples/benchmarks/PolyBench/reg_detect_species.c +86 -0
- data/test/examples/benchmarks/PolyBench/seidel-2d_species.c +53 -0
- data/test/examples/benchmarks/PolyBench/symm_species.c +74 -0
- data/test/examples/benchmarks/PolyBench/syr2k_species.c +69 -0
- data/test/examples/benchmarks/PolyBench/syrk_species.c +66 -0
- data/test/examples/benchmarks/PolyBench/trisolv_species.c +61 -0
- data/test/examples/benchmarks/PolyBench/trmm_species.c +61 -0
- data/test/examples/chunk/example01_species.c +58 -0
- data/test/examples/chunk/example02_species.c +48 -0
- data/test/examples/chunk/example03_species.c +63 -0
- data/test/examples/chunk/example04_species.c +58 -0
- data/test/examples/chunk/example05_species.c +56 -0
- data/test/examples/chunk/example06_species.c +49 -0
- data/test/examples/chunk/example07_species.c +53 -0
- data/test/examples/dependences/example01_species.c +46 -0
- data/test/examples/dependences/example02_species.c +44 -0
- data/test/examples/dependences/example03_species.c +47 -0
- data/test/examples/dependences/example04_species.c +48 -0
- data/test/examples/dependences/example05_species.c +46 -0
- data/test/examples/element/example01_species.c +50 -0
- data/test/examples/element/example02_species.c +50 -0
- data/test/examples/element/example03_species.c +62 -0
- data/test/examples/element/example04_species.c +53 -0
- data/test/examples/element/example05_species.c +59 -0
- data/test/examples/element/example06_species.c +50 -0
- data/test/examples/element/example07_species.c +58 -0
- data/test/examples/element/example08_species.c +49 -0
- data/test/examples/element/example09_species.c +52 -0
- data/test/examples/element/example10_species.c +54 -0
- data/test/examples/element/example11_species.c +51 -0
- data/test/examples/element/example12_species.c +60 -0
- data/test/examples/element/example13_species.c +77 -0
- data/test/examples/neighbourhood/example01_species.c +57 -0
- data/test/examples/neighbourhood/example02_species.c +56 -0
- data/test/examples/neighbourhood/example03_species.c +83 -0
- data/test/examples/neighbourhood/example04_species.c +55 -0
- data/test/examples/neighbourhood/example05_species.c +48 -0
- data/test/examples/shared/example01_species.c +49 -0
- data/test/examples/shared/example02_species.c +55 -0
- data/test/examples/shared/example03_species.c +59 -0
- data/test/examples/shared/example04_species.c +56 -0
- data/test/examples/shared/example05_species.c +52 -0
- metadata +193 -73
- data/examples/benchmarks/overview.txt +0 -38
- data/lib/castaddon/node.rb +0 -753
@@ -1,24 +1,42 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
|
2
|
+
////////////////////////////////////////
|
3
|
+
//////////// Globals ///////////////////
|
4
|
+
////////////////////////////////////////
|
3
5
|
|
4
6
|
// Helper macros. Every argument and the whole expansion is parenthesised so
// that expression arguments (e.g. DIV_CEIL(n, block+1)) expand with the
// intended precedence. Arguments may still be evaluated more than once, so do
// not pass expressions with side effects.
#define BONES_MIN(a,b) (((a)<(b)) ? (a) : (b))
#define BONES_MAX(a,b) (((a)>(b)) ? (a) : (b))
#define DIV_CEIL(a,b) (((a)+(b)-1)/(b))
#define DIV_FLOOR(a,b) ((a)/(b))
|
8
10
|
|
9
|
-
//
|
11
|
+
// CUDA timers
|
12
|
+
cudaEvent_t bones_start2;
|
13
|
+
cudaEvent_t bones_stop2;
|
14
|
+
|
15
|
+
// Function to initialize the GPU (for fair measurements, streams, timers)
|
10
16
|
void bones_initialize_target(void) {
|
11
17
|
int* bones_temporary = 0;
|
12
18
|
cudaMalloc((void**)&bones_temporary, sizeof(int));
|
13
19
|
cudaFree(bones_temporary);
|
20
|
+
cudaStreamCreate(&kernel_stream);
|
21
|
+
cudaEventCreate(&bones_start2);
|
22
|
+
cudaEventCreate(&bones_stop2);
|
14
23
|
}
|
15
24
|
|
16
25
|
// Declaration of the original function
|
17
26
|
int bones_main(void);
|
18
27
|
|
28
|
+
////////////////////////////////////////
|
29
|
+
//////////// Main function /////////////
|
30
|
+
////////////////////////////////////////
|
31
|
+
|
19
32
|
// New main function for initialisation and clean-up
|
20
33
|
int main(void) {
|
21
34
|
|
35
|
+
// Initialisation of the scheduler
|
36
|
+
bones_initialize_scheduler();
|
37
|
+
pthread_t bones_scheduler_thread;
|
38
|
+
pthread_create(&bones_scheduler_thread, NULL, bones_scheduler, NULL);
|
39
|
+
|
22
40
|
// Initialisation of the target
|
23
41
|
bones_initialize_target();
|
24
42
|
|
@@ -26,6 +44,12 @@ int main(void) {
|
|
26
44
|
int bones_return = bones_main();
|
27
45
|
|
28
46
|
// Clean-up
|
47
|
+
bones_scheduler_done = 1;
|
48
|
+
pthread_join(bones_scheduler_thread, NULL);
|
49
|
+
cudaStreamDestroy(kernel_stream);
|
29
50
|
return bones_return;
|
30
51
|
}
|
31
52
|
|
53
|
+
////////////////////////////////////////
|
54
|
+
////////// Accelerated functions ///////
|
55
|
+
////////////////////////////////////////
|
@@ -0,0 +1,6 @@
|
|
1
|
+
|
2
|
+
// Copy <array> from device to host
|
3
|
+
void bones_copy<direction>_<id>_<array>(<definition>) {
|
4
|
+
cudaStreamSynchronize(kernel_stream);
|
5
|
+
bones_memcpy(<array><flatten>+<offset>, device_<array>+<offset>, <variable_dimensions>*sizeof(<type>), cudaMemcpyDeviceToHost, <state>, <index>);
|
6
|
+
}
|
@@ -1,3 +1,4 @@
|
|
1
1
|
|
2
2
|
// Copy <array> from device to host
|
3
|
-
|
3
|
+
bones_memcpy(<array><flatten>+<offset>, device_<array>+<offset>, <variable_dimensions>*sizeof(<type>), cudaMemcpyDeviceToHost, <state>, <state>);
|
4
|
+
bones_synchronize(<state>);
|
@@ -1,3 +1,4 @@
|
|
1
1
|
|
2
2
|
// Copy <array> to the device
|
3
|
-
|
3
|
+
bones_memcpy(device_<array>, <array><flatten>, <variable_dimensions>*sizeof(<type>), cudaMemcpyHostToDevice, <state>, <state>);
|
4
|
+
bones_synchronize(<state>);
|
@@ -0,0 +1 @@
|
|
1
|
+
<type>* device_<array>;
|
@@ -1,5 +1,4 @@
|
|
1
1
|
|
2
2
|
// Create space for <array> on the device
|
3
|
-
<type>* device_<array> = 0;
|
4
3
|
cudaMalloc((void**)&device_<array>, <variable_dimensions>*sizeof(<type>));
|
5
|
-
cudaMemset((void*)device_<array>, 0, <variable_dimensions>*sizeof(<type>));
|
4
|
+
//cudaMemset((void*)device_<array>, 0, <variable_dimensions>*sizeof(<type>));
|
@@ -0,0 +1,86 @@
|
|
1
|
+
|
2
|
+
#include <stdio.h>
|
3
|
+
#include <pthread.h>
|
4
|
+
|
5
|
+
////////////////////////////////////////
|
6
|
+
////////// Thread scheduler ////////////
|
7
|
+
////////////////////////////////////////
|
8
|
+
|
9
|
+
// Memory copy and kernel streams
|
10
|
+
cudaStream_t kernel_stream;
|
11
|
+
cudaStream_t memory_stream;
|
12
|
+
|
13
|
+
// Task structure
|
14
|
+
typedef struct {
|
15
|
+
void *dst;
|
16
|
+
void *src;
|
17
|
+
int size;
|
18
|
+
enum cudaMemcpyKind direction;
|
19
|
+
int deadline;
|
20
|
+
volatile int status;
|
21
|
+
} Task;
|
22
|
+
|
23
|
+
// Task list
|
24
|
+
#define BONES_MAX_TASKS 100
|
25
|
+
Task tasks[BONES_MAX_TASKS];
|
26
|
+
|
27
|
+
// Scheduler status
|
28
|
+
volatile int bones_scheduler_done;
|
29
|
+
|
30
|
+
// Create synchronisation points
|
31
|
+
void bones_synchronize(int deadline) {
|
32
|
+
cudaStreamSynchronize(kernel_stream);
|
33
|
+
printf("Reached: syncpoint %d [worker]\n",deadline); fflush(stdout);
|
34
|
+
for (int t = 0; t <= BONES_MAX_TASKS; t++) {
|
35
|
+
if (tasks[t].deadline == deadline && tasks[t].status == 1) {
|
36
|
+
while(tasks[t].status != 2) { }
|
37
|
+
}
|
38
|
+
}
|
39
|
+
printf("Reached: syncpoint %d [all]\n",deadline); fflush(stdout);
|
40
|
+
}
|
41
|
+
|
42
|
+
// Add a new task
|
43
|
+
void bones_memcpy(void *dst, void *src, int size, enum cudaMemcpyKind direction, int deadline, int task_id) {
|
44
|
+
Task new_task = { .dst = dst, .src = src, .size = size, .direction = direction, .deadline = deadline, .status = 1 };
|
45
|
+
tasks[task_id] = new_task;
|
46
|
+
}
|
47
|
+
|
48
|
+
// Perform a task (CUDA memory copy)
|
49
|
+
// Carry out a single task: sleep briefly, issue the copy on the memory stream
// and wait until that stream has finished.
//   current_task: the task to perform (passed by value)
void bones_scheduler_copy(Task current_task) {
  usleep(400);
  cudaMemcpyAsync(current_task.dst,
                  current_task.src,
                  current_task.size,
                  current_task.direction,
                  memory_stream);
  // Return only after the copy on the memory stream has completed
  cudaStreamSynchronize(memory_stream);
}
|
54
|
+
|
55
|
+
// Initialize the scheduler
|
56
|
+
void bones_initialize_scheduler(void) {
|
57
|
+
bones_scheduler_done = 0;
|
58
|
+
}
|
59
|
+
|
60
|
+
// The scheduler (infinite loop)
|
61
|
+
#define LARGE_INT 1000
|
62
|
+
void* bones_scheduler(void* ptr) {
|
63
|
+
cudaStreamCreate(&memory_stream);
|
64
|
+
while (bones_scheduler_done != 1) {
|
65
|
+
|
66
|
+
// Find the ready task with the earliest deadline
|
67
|
+
int found_deadline = LARGE_INT;
|
68
|
+
int found_task = LARGE_INT;
|
69
|
+
for (int t = 0; t <= BONES_MAX_TASKS; t++) {
|
70
|
+
if (tasks[t].status == 1) {
|
71
|
+
if (tasks[t].deadline < found_deadline) {
|
72
|
+
found_task = t;
|
73
|
+
found_deadline = tasks[t].deadline;
|
74
|
+
}
|
75
|
+
}
|
76
|
+
}
|
77
|
+
|
78
|
+
// Perform the found task
|
79
|
+
if (found_task != LARGE_INT) {
|
80
|
+
printf("Performing task %d, dl %d [scheduler]\n",found_task,tasks[found_task].deadline);
|
81
|
+
bones_scheduler_copy(tasks[found_task]);
|
82
|
+
tasks[found_task].status = 2;
|
83
|
+
}
|
84
|
+
}
|
85
|
+
cudaStreamDestroy(memory_stream);
|
86
|
+
}
|
@@ -1,6 +1,4 @@
|
|
1
1
|
|
2
2
|
// Start the timer for the measurement of the kernel execution time
|
3
|
-
|
4
|
-
|
5
|
-
cudaEventCreate(&bones_start2);
|
6
|
-
cudaEventRecord(bones_start2,0);
|
3
|
+
//cudaStreamSynchronize(kernel_stream);
|
4
|
+
cudaEventRecord(bones_start2,kernel_stream);
|
@@ -1,10 +1,8 @@
|
|
1
1
|
|
2
2
|
// Stop the timer for the measurement of the kernel execution time
|
3
|
-
|
4
|
-
|
5
|
-
cudaEventCreate(&bones_stop2);
|
6
|
-
cudaEventRecord(bones_stop2,0);
|
3
|
+
//cudaStreamSynchronize(kernel_stream);
|
4
|
+
cudaEventRecord(bones_stop2,kernel_stream);
|
7
5
|
cudaEventSynchronize(bones_stop2);
|
8
6
|
float bones_timer2 = 0;
|
9
7
|
cudaEventElapsedTime(&bones_timer2,bones_start2,bones_stop2);
|
10
|
-
printf(">>>\t\t
|
8
|
+
printf(">>>\t\t Execution time [kernel <algorithm_basename>]: %.3lf ms \n", bones_timer2);
|
@@ -0,0 +1,26 @@
|
|
1
|
+
|
2
|
+
////////////////////////////////////////
|
3
|
+
//////////// Timers ////////////////////
|
4
|
+
////////////////////////////////////////
|
5
|
+
|
6
|
+
// Timer
|
7
|
+
cudaEvent_t bones_start1;
|
8
|
+
|
9
|
+
// Start the timer for the measurement of the whole scop
|
10
|
+
void bones_timer_start() {
|
11
|
+
cudaDeviceSynchronize();
|
12
|
+
cudaEventCreate(&bones_start1);
|
13
|
+
cudaEventRecord(bones_start1,kernel_stream);
|
14
|
+
}
|
15
|
+
|
16
|
+
// End the timer for the measurement of the whole scop
|
17
|
+
void bones_timer_stop() {
|
18
|
+
cudaDeviceSynchronize();
|
19
|
+
cudaEvent_t bones_stop1;
|
20
|
+
cudaEventCreate(&bones_stop1);
|
21
|
+
cudaEventRecord(bones_stop1,kernel_stream);
|
22
|
+
cudaEventSynchronize(bones_stop1);
|
23
|
+
float bones_timer1 = 0;
|
24
|
+
cudaEventElapsedTime(&bones_timer1,bones_start1,bones_stop1);
|
25
|
+
printf(">>>\t\t Execution time [full scop]: %.3lf ms \n", bones_timer1);
|
26
|
+
}
|
@@ -72,11 +72,9 @@ __global__ void bones_kernel_<algorithm_name>_2(<in1_type><in1_devicepointer> <i
|
|
72
72
|
// Function to start the kernel
|
73
73
|
extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
|
74
74
|
int bones_block_size;
|
75
|
-
if (<parallelism> >= 64*512) { bones_block_size = 512;}
|
76
|
-
else if (<parallelism> >= 64*256) { bones_block_size = 256;}
|
77
|
-
else
|
78
|
-
else if (<parallelism> >= 64*64 ) { bones_block_size = 64; }
|
79
|
-
else { bones_block_size = 32; }
|
75
|
+
if (<parallelism> >= 64*512 ) { bones_block_size = 512; }
|
76
|
+
else if (<parallelism> >= 64*256 ) { bones_block_size = 256; }
|
77
|
+
else { bones_block_size = 128; }
|
80
78
|
|
81
79
|
// First perform some pre-shuffling (for the first input)
|
82
80
|
<in0_type>* shuffled_<in0_name> = 0;
|
@@ -86,7 +84,7 @@ extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argumen
|
|
86
84
|
bones_kernel_<algorithm_name>_1<<< bones_grid1, bones_threads1 >>>(<in0_name>, shuffled_<in0_name>, <argument_name>);
|
87
85
|
<in0_type>* temp_<in0_name> = <in0_name>;
|
88
86
|
<in0_name> = shuffled_<in0_name>;
|
89
|
-
cudaFree(temp_<in0_name>);
|
87
|
+
//cudaFree(temp_<in0_name>);
|
90
88
|
|
91
89
|
// First perform some pre-shuffling (for the second input)
|
92
90
|
<in0_type>* shuffled_<in1_name> = 0;
|
@@ -96,7 +94,7 @@ extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argumen
|
|
96
94
|
bones_kernel_<algorithm_name>_2<<< bones_grid2, bones_threads2 >>>(<in1_name>, shuffled_<in1_name>, <argument_name>);
|
97
95
|
<in1_type>* temp_<in1_name> = <in1_name>;
|
98
96
|
<in1_name> = shuffled_<in1_name>;
|
99
|
-
cudaFree(temp_<in1_name>);
|
97
|
+
//cudaFree(temp_<in1_name>);
|
100
98
|
|
101
99
|
// Then run the original kernel
|
102
100
|
dim3 bones_threads0(bones_block_size);
|
@@ -46,11 +46,9 @@ __global__ void bones_kernel_<algorithm_name>_1(<in0_type><in0_devicepointer> <i
|
|
46
46
|
// Function to start the kernel
|
47
47
|
extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
|
48
48
|
int bones_block_size;
|
49
|
-
if (<parallelism> >= 64*512) { bones_block_size = 512;}
|
50
|
-
else if (<parallelism> >= 64*256) { bones_block_size = 256;}
|
51
|
-
else
|
52
|
-
else if (<parallelism> >= 64*64 ) { bones_block_size = 64; }
|
53
|
-
else { bones_block_size = 32; }
|
49
|
+
if (<parallelism> >= 64*512 ) { bones_block_size = 512; }
|
50
|
+
else if (<parallelism> >= 64*256 ) { bones_block_size = 256; }
|
51
|
+
else { bones_block_size = 128; }
|
54
52
|
|
55
53
|
// First perform some pre-shuffling
|
56
54
|
<in0_type>* shuffled_<in0_name> = 0;
|
@@ -60,7 +58,7 @@ extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argumen
|
|
60
58
|
bones_kernel_<algorithm_name>_1<<< bones_grid1, bones_threads1 >>>(<in0_name>, shuffled_<in0_name>, <argument_name>);
|
61
59
|
<in0_type>* temp_<in0_name> = <in0_name>;
|
62
60
|
<in0_name> = shuffled_<in0_name>;
|
63
|
-
cudaFree(temp_<in0_name>);
|
61
|
+
//cudaFree(temp_<in0_name>);
|
64
62
|
|
65
63
|
// Then run the original kernel
|
66
64
|
dim3 bones_threads0(bones_block_size);
|
@@ -1,5 +1,5 @@
|
|
1
1
|
/* STARTDEF
|
2
|
-
void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>);
|
2
|
+
void bones_prekernel_<algorithm_name>_0(cudaStream_t kernel_stream, <devicedefinitions>, <argument_definition>);
|
3
3
|
ENDDEF */
|
4
4
|
// Start of the <algorithm_name> kernel
|
5
5
|
__global__ void bones_kernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
|
@@ -15,14 +15,12 @@ __global__ void bones_kernel_<algorithm_name>_0(<devicedefinitions>, <argument_d
|
|
15
15
|
}
|
16
16
|
|
17
17
|
// Function to start the kernel
|
18
|
-
extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
|
18
|
+
extern "C" void bones_prekernel_<algorithm_name>_0(cudaStream_t kernel_stream, <devicedefinitions>, <argument_definition>) {
|
19
19
|
int bones_block_size;
|
20
|
-
if (<parallelism> >= 64*512) { bones_block_size = 512;}
|
21
|
-
else if (<parallelism> >= 64*256) { bones_block_size = 256;}
|
22
|
-
else
|
23
|
-
else if (<parallelism> >= 64*64 ) { bones_block_size = 64; }
|
24
|
-
else { bones_block_size = 32; }
|
20
|
+
if (<parallelism> >= 64*512 ) { bones_block_size = 512; }
|
21
|
+
else if (<parallelism> >= 64*256 ) { bones_block_size = 256; }
|
22
|
+
else { bones_block_size = 128; }
|
25
23
|
dim3 bones_threads(bones_block_size);
|
26
24
|
dim3 bones_grid(DIV_CEIL(<parallelism>,bones_block_size));
|
27
|
-
bones_kernel_<algorithm_name>_0<<< bones_grid, bones_threads >>>(<names>, <argument_name>);
|
25
|
+
bones_kernel_<algorithm_name>_0<<< bones_grid, bones_threads, 0, kernel_stream >>>(<names>, <argument_name>);
|
28
26
|
}
|
@@ -19,12 +19,13 @@ N,N|chunk(D)+ ^ N,N|element+ -> N,N|element+ :defa
|
|
19
19
|
D|chunk(D)+ -> D|element+ :default :00
|
20
20
|
D|chunk(D)+ ^ D|neighbourhood(D)+ ^ D|element+ -> D|element+ :default :00
|
21
21
|
D|chunk(D)+ ^ D|element+ -> D|element+ :default :00
|
22
|
-
|
23
|
-
D|neighbourhood(D)+
|
24
|
-
D|neighbourhood(D)+ ^ D|element+ -> D|element+ :default :00
|
22
|
+
D|neighbourhood(D)+ -> D|element+ :default :40
|
23
|
+
D|neighbourhood(D)+ ^ D|element+ -> D|element+ :default :40
|
25
24
|
D|element+ -> D|chunk(D)+ :default :00
|
26
|
-
D|element+ -> D|element+ :default :
|
25
|
+
D|element+ -> D|element+ :default :40
|
27
26
|
D|element -> 1|shared :D-element-to-1-shared :02 03 04 05
|
28
27
|
D|element+ -> D|shared+ :default :08
|
29
28
|
D|element+ -> D|element+ ^ D|shared+ :default :08
|
30
|
-
D|void -> D|element+ :default :
|
29
|
+
D|void -> D|element+ :default :40
|
30
|
+
|
31
|
+
N|neighbourhood(N)+ -> N|element+ :N-neighbourhood-N-to-N-element :10
|
@@ -42,26 +42,30 @@ int main(void) {
|
|
42
42
|
for (i=0; i<NI; i++) { for (j=0; j<NL; j++) { D[i][j] = ((float) i*(j+2)) / NK; } }
|
43
43
|
|
44
44
|
// Perform the computation (E := alpha*A*B*C + beta*D)
|
45
|
-
#pragma
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
for (
|
50
|
-
tmp[i][j]
|
45
|
+
#pragma scop
|
46
|
+
{
|
47
|
+
#pragma species kernel A[0:NI-1,0:NK-1]|chunk(0:0,0:NK-1) ^ B[0:NK-1,0:NJ-1]|chunk(0:NK-1,0:0) -> tmp[0:NI-1,0:NJ-1]|element
|
48
|
+
for (i = 0; i < NI; i++) {
|
49
|
+
for (j = 0; j < NJ; j++) {
|
50
|
+
tmp[i][j] = 0;
|
51
|
+
for (k = 0; k < NK; k++) {
|
52
|
+
tmp[i][j] += alpha * A[i][k] * B[k][j];
|
53
|
+
}
|
51
54
|
}
|
52
55
|
}
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
56
|
+
#pragma species endkernel 2mm_k1
|
57
|
+
#pragma species kernel D[0:NI-1,0:NL-1]|element ^ tmp[0:NI-1,0:NJ-1]|chunk(0:0,0:NJ-1) ^ C[0:NJ-1,0:NL-1]|chunk(0:NJ-1,0:0) -> D[0:NI-1,0:NL-1]|element
|
58
|
+
for (i = 0; i < NI; i++) {
|
59
|
+
for (j = 0; j < NL; j++) {
|
60
|
+
D[i][j] *= beta;
|
61
|
+
for (k = 0; k < NJ; k++) {
|
62
|
+
D[i][j] += tmp[i][k] * C[k][j];
|
63
|
+
}
|
61
64
|
}
|
62
65
|
}
|
66
|
+
#pragma species endkernel 2mm_k2
|
63
67
|
}
|
64
|
-
#pragma
|
68
|
+
#pragma endscop
|
65
69
|
|
66
70
|
// Clean-up and exit the function
|
67
71
|
fflush(stdout);
|
@@ -0,0 +1,82 @@
|
|
1
|
+
//
|
2
|
+
// This file is part of the Bones source-to-source compiler examples. The C-code
|
3
|
+
// is largely identical in terms of functionality and variable naming to the code
|
4
|
+
// found in PolyBench/C version 3.2. For more information on PolyBench/C or Bones
|
5
|
+
// please use the contact information below.
|
6
|
+
//
|
7
|
+
// == More information on PolyBench/C
|
8
|
+
// Contact............Louis-Noel Pouchet <pouchet@cse.ohio-state.edu>
|
9
|
+
// Web address........http://polybench.sourceforge.net/
|
10
|
+
//
|
11
|
+
// == More information on Bones
|
12
|
+
// Contact............Cedric Nugteren <c.nugteren@tue.nl>
|
13
|
+
// Web address........http://parse.ele.tue.nl/bones/
|
14
|
+
//
|
15
|
+
// == File information
|
16
|
+
// Filename...........benchmark/3mm.c
|
17
|
+
// Author.............Cedric Nugteren
|
18
|
+
// Last modified on...03-April-2012
|
19
|
+
//
|
20
|
+
|
21
|
+
#include "common.h"
|
22
|
+
|
23
|
+
// This is '3mm', a 3 matrix multiply kernel
|
24
|
+
int main(void) {
|
25
|
+
int i,j,k;
|
26
|
+
|
27
|
+
// Declare arrays on the stack
|
28
|
+
float A[NI][NK];
|
29
|
+
float B[NK][NJ];
|
30
|
+
float C[NJ][NM];
|
31
|
+
float D[NM][NL];
|
32
|
+
float E[NI][NJ];
|
33
|
+
float F[NJ][NL];
|
34
|
+
float G[NI][NL];
|
35
|
+
|
36
|
+
// Set the input data
|
37
|
+
for (i=0; i<NI; i++) { for (j=0; j<NK; j++) { A[i][j] = ((float) i*j) / NI; } }
|
38
|
+
for (i=0; i<NK; i++) { for (j=0; j<NJ; j++) { B[i][j] = ((float) i*(j+1)) / NJ; } }
|
39
|
+
for (i=0; i<NL; i++) { for (j=0; j<NJ; j++) { C[i][j] = ((float) i*(j+3)) / NL; } }
|
40
|
+
for (i=0; i<NI; i++) { for (j=0; j<NL; j++) { D[i][j] = ((float) i*(j+2)) / NK; } }
|
41
|
+
|
42
|
+
// Perform the computation (G := E*F, with E := A*B and F := C*D)
|
43
|
+
#pragma scop
|
44
|
+
{
|
45
|
+
#pragma species kernel A[0:NI-1,0:NK-1]|chunk(0:0,0:NK-1) ^ B[0:NK-1,0:NJ-1]|chunk(0:NK-1,0:0) -> E[0:NI-1,0:NJ-1]|element
|
46
|
+
for (i = 0; i < NI; i++) {
|
47
|
+
for (j = 0; j < NJ; j++) {
|
48
|
+
E[i][j] = 0;
|
49
|
+
for (k = 0; k < NK; k++) {
|
50
|
+
E[i][j] += A[i][k] * B[k][j];
|
51
|
+
}
|
52
|
+
}
|
53
|
+
}
|
54
|
+
#pragma species endkernel 3mm_k1
|
55
|
+
#pragma species kernel C[0:NJ-1,0:NM-1]|chunk(0:0,0:NM-1) ^ D[0:NM-1,0:NL-1]|chunk(0:NM-1,0:0) -> F[0:NJ-1,0:NL-1]|element
|
56
|
+
for (i = 0; i < NJ; i++) {
|
57
|
+
for (j = 0; j < NL; j++) {
|
58
|
+
F[i][j] = 0;
|
59
|
+
for (k = 0; k < NM; k++) {
|
60
|
+
F[i][j] += C[i][k] * D[k][j];
|
61
|
+
}
|
62
|
+
}
|
63
|
+
}
|
64
|
+
#pragma species endkernel 3mm_k2
|
65
|
+
#pragma species kernel E[0:NI-1,0:NJ-1]|chunk(0:0,0:NJ-1) ^ F[0:NJ-1,0:NL-1]|chunk(0:NJ-1,0:0) -> G[0:NI-1,0:NL-1]|element
|
66
|
+
for (i = 0; i < NI; i++) {
|
67
|
+
for (j = 0; j < NL; j++) {
|
68
|
+
G[i][j] = 0;
|
69
|
+
for (k = 0; k < NJ; k++) {
|
70
|
+
G[i][j] += E[i][k] * F[k][j];
|
71
|
+
}
|
72
|
+
}
|
73
|
+
}
|
74
|
+
#pragma species endkernel 3mm_k3
|
75
|
+
}
|
76
|
+
#pragma endscop
|
77
|
+
|
78
|
+
// Clean-up and exit the function
|
79
|
+
fflush(stdout);
|
80
|
+
return 0;
|
81
|
+
}
|
82
|
+
|