bones-compiler 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (203)
  1. data/CHANGELOG +117 -0
  2. data/LICENSE +9 -0
  3. data/README.rdoc +126 -0
  4. data/Rakefile +107 -0
  5. data/VERSION +1 -0
  6. data/bin/bones +20 -0
  7. data/examples/applications/ffos.c +552 -0
  8. data/examples/benchmarks/2mm.c +70 -0
  9. data/examples/benchmarks/3mm.c +81 -0
  10. data/examples/benchmarks/adi.c +81 -0
  11. data/examples/benchmarks/atax.c +65 -0
  12. data/examples/benchmarks/bicg.c +67 -0
  13. data/examples/benchmarks/cholesky.c +64 -0
  14. data/examples/benchmarks/common.h +168 -0
  15. data/examples/benchmarks/correlation.c +97 -0
  16. data/examples/benchmarks/covariance.c +77 -0
  17. data/examples/benchmarks/doitgen.c +63 -0
  18. data/examples/benchmarks/durbin.c +76 -0
  19. data/examples/benchmarks/dynprog.c +67 -0
  20. data/examples/benchmarks/fdtd-2d-apml.c +114 -0
  21. data/examples/benchmarks/fdtd-2d.c +74 -0
  22. data/examples/benchmarks/floyd-warshall.c +50 -0
  23. data/examples/benchmarks/gemm.c +69 -0
  24. data/examples/benchmarks/gemver.c +89 -0
  25. data/examples/benchmarks/gesummv.c +64 -0
  26. data/examples/benchmarks/gramschmidt.c +84 -0
  27. data/examples/benchmarks/jacobi-1d-imper.c +55 -0
  28. data/examples/benchmarks/jacobi-2d-imper.c +61 -0
  29. data/examples/benchmarks/lu.c +57 -0
  30. data/examples/benchmarks/ludcmp.c +91 -0
  31. data/examples/benchmarks/mvt.c +65 -0
  32. data/examples/benchmarks/overview.txt +38 -0
  33. data/examples/benchmarks/reg_detect.c +82 -0
  34. data/examples/benchmarks/saxpy.c +45 -0
  35. data/examples/benchmarks/seidel-2d.c +51 -0
  36. data/examples/benchmarks/symm.c +74 -0
  37. data/examples/benchmarks/syr2k.c +65 -0
  38. data/examples/benchmarks/syrk.c +62 -0
  39. data/examples/benchmarks/trisolv.c +57 -0
  40. data/examples/benchmarks/trmm.c +57 -0
  41. data/examples/chunk/example1.c +54 -0
  42. data/examples/chunk/example2.c +44 -0
  43. data/examples/chunk/example3.c +59 -0
  44. data/examples/chunk/example4.c +55 -0
  45. data/examples/chunk/example5.c +52 -0
  46. data/examples/element/example1.c +46 -0
  47. data/examples/element/example10.c +50 -0
  48. data/examples/element/example11.c +47 -0
  49. data/examples/element/example12.c +56 -0
  50. data/examples/element/example2.c +46 -0
  51. data/examples/element/example3.c +58 -0
  52. data/examples/element/example4.c +49 -0
  53. data/examples/element/example5.c +56 -0
  54. data/examples/element/example6.c +46 -0
  55. data/examples/element/example7.c +54 -0
  56. data/examples/element/example8.c +45 -0
  57. data/examples/element/example9.c +48 -0
  58. data/examples/neighbourhood/example1.c +54 -0
  59. data/examples/neighbourhood/example2.c +55 -0
  60. data/examples/neighbourhood/example3.c +82 -0
  61. data/examples/neighbourhood/example4.c +52 -0
  62. data/examples/shared/example1.c +45 -0
  63. data/examples/shared/example2.c +51 -0
  64. data/examples/shared/example3.c +55 -0
  65. data/examples/shared/example4.c +52 -0
  66. data/examples/shared/example5.c +48 -0
  67. data/lib/bones.rb +266 -0
  68. data/lib/bones/algorithm.rb +541 -0
  69. data/lib/bones/engine.rb +386 -0
  70. data/lib/bones/preprocessor.rb +161 -0
  71. data/lib/bones/species.rb +196 -0
  72. data/lib/bones/structure.rb +94 -0
  73. data/lib/bones/variable.rb +169 -0
  74. data/lib/bones/variablelist.rb +72 -0
  75. data/lib/castaddon.rb +27 -0
  76. data/lib/castaddon/index.rb +40 -0
  77. data/lib/castaddon/node.rb +753 -0
  78. data/lib/castaddon/type.rb +37 -0
  79. data/skeletons/CPU-C/common/epilogue.c +0 -0
  80. data/skeletons/CPU-C/common/globals.c +17 -0
  81. data/skeletons/CPU-C/common/globals_kernel.c +1 -0
  82. data/skeletons/CPU-C/common/header.c +0 -0
  83. data/skeletons/CPU-C/common/mem_copy_D2H.c +0 -0
  84. data/skeletons/CPU-C/common/mem_copy_H2D.c +0 -0
  85. data/skeletons/CPU-C/common/mem_epilogue.c +0 -0
  86. data/skeletons/CPU-C/common/mem_prologue.c +3 -0
  87. data/skeletons/CPU-C/common/prologue.c +0 -0
  88. data/skeletons/CPU-C/common/timer_1_start.c +0 -0
  89. data/skeletons/CPU-C/common/timer_1_stop.c +0 -0
  90. data/skeletons/CPU-C/common/timer_2_start.c +20 -0
  91. data/skeletons/CPU-C/common/timer_2_stop.c +8 -0
  92. data/skeletons/CPU-C/kernel/default.host.c +3 -0
  93. data/skeletons/CPU-C/kernel/default.kernel.c +15 -0
  94. data/skeletons/CPU-C/skeletons.txt +24 -0
  95. data/skeletons/CPU-OPENCL-AMD/common/epilogue.c +6 -0
  96. data/skeletons/CPU-OPENCL-AMD/common/globals.c +155 -0
  97. data/skeletons/CPU-OPENCL-AMD/common/globals_kernel.c +4 -0
  98. data/skeletons/CPU-OPENCL-AMD/common/header.c +0 -0
  99. data/skeletons/CPU-OPENCL-AMD/common/mem_copy_D2H.c +8 -0
  100. data/skeletons/CPU-OPENCL-AMD/common/mem_copy_H2D.c +4 -0
  101. data/skeletons/CPU-OPENCL-AMD/common/mem_epilogue.c +3 -0
  102. data/skeletons/CPU-OPENCL-AMD/common/mem_prologue.c +6 -0
  103. data/skeletons/CPU-OPENCL-AMD/common/prologue.c +24 -0
  104. data/skeletons/CPU-OPENCL-AMD/common/timer_1_start.c +5 -0
  105. data/skeletons/CPU-OPENCL-AMD/common/timer_1_stop.c +9 -0
  106. data/skeletons/CPU-OPENCL-AMD/common/timer_2_start.c +16 -0
  107. data/skeletons/CPU-OPENCL-AMD/common/timer_2_stop.c +11 -0
  108. data/skeletons/CPU-OPENCL-AMD/kernel/D-element-to-1-shared.host.c +67 -0
  109. data/skeletons/CPU-OPENCL-AMD/kernel/D-element-to-1-shared.kernel.cl +72 -0
  110. data/skeletons/CPU-OPENCL-AMD/kernel/default.host.c +14 -0
  111. data/skeletons/CPU-OPENCL-AMD/kernel/default.kernel.cl +13 -0
  112. data/skeletons/CPU-OPENCL-AMD/skeletons.txt +26 -0
  113. data/skeletons/CPU-OPENCL-INTEL/common/epilogue.c +3 -0
  114. data/skeletons/CPU-OPENCL-INTEL/common/globals.c +154 -0
  115. data/skeletons/CPU-OPENCL-INTEL/common/globals_kernel.c +4 -0
  116. data/skeletons/CPU-OPENCL-INTEL/common/header.c +31 -0
  117. data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_D2H.c +5 -0
  118. data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_H2D.c +3 -0
  119. data/skeletons/CPU-OPENCL-INTEL/common/mem_epilogue.c +3 -0
  120. data/skeletons/CPU-OPENCL-INTEL/common/mem_prologue.c +4 -0
  121. data/skeletons/CPU-OPENCL-INTEL/common/prologue.c +24 -0
  122. data/skeletons/CPU-OPENCL-INTEL/common/timer_1_start.c +5 -0
  123. data/skeletons/CPU-OPENCL-INTEL/common/timer_1_stop.c +9 -0
  124. data/skeletons/CPU-OPENCL-INTEL/common/timer_2_start.c +16 -0
  125. data/skeletons/CPU-OPENCL-INTEL/common/timer_2_stop.c +11 -0
  126. data/skeletons/CPU-OPENCL-INTEL/kernel/D-element-to-1-shared.host.c +67 -0
  127. data/skeletons/CPU-OPENCL-INTEL/kernel/D-element-to-1-shared.kernel.cl +72 -0
  128. data/skeletons/CPU-OPENCL-INTEL/kernel/default.host.c +14 -0
  129. data/skeletons/CPU-OPENCL-INTEL/kernel/default.kernel.cl +13 -0
  130. data/skeletons/CPU-OPENCL-INTEL/skeletons.txt +26 -0
  131. data/skeletons/CPU-OPENMP/common/epilogue.c +0 -0
  132. data/skeletons/CPU-OPENMP/common/globals.c +37 -0
  133. data/skeletons/CPU-OPENMP/common/globals_kernel.c +6 -0
  134. data/skeletons/CPU-OPENMP/common/header.c +0 -0
  135. data/skeletons/CPU-OPENMP/common/mem_copy_D2H.c +0 -0
  136. data/skeletons/CPU-OPENMP/common/mem_copy_H2D.c +0 -0
  137. data/skeletons/CPU-OPENMP/common/mem_epilogue.c +0 -0
  138. data/skeletons/CPU-OPENMP/common/mem_prologue.c +3 -0
  139. data/skeletons/CPU-OPENMP/common/prologue.c +0 -0
  140. data/skeletons/CPU-OPENMP/common/timer_1_start.c +12 -0
  141. data/skeletons/CPU-OPENMP/common/timer_1_stop.c +0 -0
  142. data/skeletons/CPU-OPENMP/common/timer_2_start.c +18 -0
  143. data/skeletons/CPU-OPENMP/common/timer_2_stop.c +8 -0
  144. data/skeletons/CPU-OPENMP/kernel/D-element-to-1-shared.host.c +27 -0
  145. data/skeletons/CPU-OPENMP/kernel/D-element-to-1-shared.kernel.c +46 -0
  146. data/skeletons/CPU-OPENMP/kernel/default.host.c +11 -0
  147. data/skeletons/CPU-OPENMP/kernel/default.kernel.c +18 -0
  148. data/skeletons/CPU-OPENMP/skeletons.txt +26 -0
  149. data/skeletons/GPU-CUDA/common/epilogue.c +0 -0
  150. data/skeletons/GPU-CUDA/common/globals.c +31 -0
  151. data/skeletons/GPU-CUDA/common/globals_kernel.c +4 -0
  152. data/skeletons/GPU-CUDA/common/header.c +0 -0
  153. data/skeletons/GPU-CUDA/common/mem_copy_D2H.c +3 -0
  154. data/skeletons/GPU-CUDA/common/mem_copy_H2D.c +3 -0
  155. data/skeletons/GPU-CUDA/common/mem_epilogue.c +3 -0
  156. data/skeletons/GPU-CUDA/common/mem_prologue.c +5 -0
  157. data/skeletons/GPU-CUDA/common/prologue.c +6 -0
  158. data/skeletons/GPU-CUDA/common/timer_1_start.c +6 -0
  159. data/skeletons/GPU-CUDA/common/timer_1_stop.c +10 -0
  160. data/skeletons/GPU-CUDA/common/timer_2_start.c +6 -0
  161. data/skeletons/GPU-CUDA/common/timer_2_stop.c +10 -0
  162. data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.host.c +3 -0
  163. data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.kernel.cu +105 -0
  164. data/skeletons/GPU-CUDA/kernel/D-element-to-1-shared.host.c +3 -0
  165. data/skeletons/GPU-CUDA/kernel/D-element-to-1-shared.kernel.cu +119 -0
  166. data/skeletons/GPU-CUDA/kernel/D-element-to-N-shared.host.c +3 -0
  167. data/skeletons/GPU-CUDA/kernel/D-element-to-N-shared.kernel.cu +166 -0
  168. data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.host.c +3 -0
  169. data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.kernel.cu +69 -0
  170. data/skeletons/GPU-CUDA/kernel/N-neighbourhood-N-to-N-element.host.c +3 -0
  171. data/skeletons/GPU-CUDA/kernel/N-neighbourhood-N-to-N-element.kernel.cu +42 -0
  172. data/skeletons/GPU-CUDA/kernel/default.host.c +3 -0
  173. data/skeletons/GPU-CUDA/kernel/default.kernel.cu +28 -0
  174. data/skeletons/GPU-CUDA/skeletons.txt +30 -0
  175. data/skeletons/GPU-OPENCL-AMD/common/epilogue.c +3 -0
  176. data/skeletons/GPU-OPENCL-AMD/common/globals.c +155 -0
  177. data/skeletons/GPU-OPENCL-AMD/common/globals_kernel.c +4 -0
  178. data/skeletons/GPU-OPENCL-AMD/common/header.c +0 -0
  179. data/skeletons/GPU-OPENCL-AMD/common/mem_copy_D2H.c +4 -0
  180. data/skeletons/GPU-OPENCL-AMD/common/mem_copy_H2D.c +4 -0
  181. data/skeletons/GPU-OPENCL-AMD/common/mem_epilogue.c +3 -0
  182. data/skeletons/GPU-OPENCL-AMD/common/mem_prologue.c +3 -0
  183. data/skeletons/GPU-OPENCL-AMD/common/prologue.c +24 -0
  184. data/skeletons/GPU-OPENCL-AMD/common/timer_1_start.c +5 -0
  185. data/skeletons/GPU-OPENCL-AMD/common/timer_1_stop.c +9 -0
  186. data/skeletons/GPU-OPENCL-AMD/common/timer_2_start.c +4 -0
  187. data/skeletons/GPU-OPENCL-AMD/common/timer_2_stop.c +11 -0
  188. data/skeletons/GPU-OPENCL-AMD/kernel/D-element-to-1-shared.host.c +67 -0
  189. data/skeletons/GPU-OPENCL-AMD/kernel/D-element-to-1-shared.kernel.cl +72 -0
  190. data/skeletons/GPU-OPENCL-AMD/kernel/default.host.c +14 -0
  191. data/skeletons/GPU-OPENCL-AMD/kernel/default.kernel.cl +13 -0
  192. data/skeletons/GPU-OPENCL-AMD/skeletons.txt +26 -0
  193. data/skeletons/verification/header.c +2 -0
  194. data/skeletons/verification/timer_start.c +4 -0
  195. data/skeletons/verification/timer_stop.c +6 -0
  196. data/skeletons/verification/verify_results.c +23 -0
  197. data/test/bones/test_algorithm.rb +40 -0
  198. data/test/bones/test_common.rb +54 -0
  199. data/test/bones/test_preprocessor.rb +46 -0
  200. data/test/bones/test_species.rb +21 -0
  201. data/test/bones/test_variable.rb +84 -0
  202. data/test/test_helper.rb +106 -0
  203. metadata +303 -0
@@ -0,0 +1,6 @@
1
+
2
+ // Set the cache size to maximal
3
+ cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
4
+
5
+ // Stop execution directly if there is no work to do
6
+ if (<parallelism> <= 0) { return; }
@@ -0,0 +1,6 @@
1
+
2
+ // Start the timer for the measurement of the kernel and memory copy execution time
3
+ cudaThreadSynchronize();
4
+ cudaEvent_t bones_start1;
5
+ cudaEventCreate(&bones_start1);
6
+ cudaEventRecord(bones_start1,0);
@@ -0,0 +1,10 @@
1
+
2
+ // End the timer for the measurement of the kernel and memory copy execution time
3
+ cudaThreadSynchronize();
4
+ cudaEvent_t bones_stop1;
5
+ cudaEventCreate(&bones_stop1);
6
+ cudaEventRecord(bones_stop1,0);
7
+ cudaEventSynchronize(bones_stop1);
8
+ float bones_timer1 = 0;
9
+ cudaEventElapsedTime(&bones_timer1,bones_start1,bones_stop1);
10
+ printf(">>>\t\t (<algorithm_basename>): Execution time [kernel+memcpy]: %.3lf ms \n", bones_timer1);
@@ -0,0 +1,6 @@
1
+
2
+ // Start the timer for the measurement of the kernel execution time
3
+ cudaThreadSynchronize();
4
+ cudaEvent_t bones_start2;
5
+ cudaEventCreate(&bones_start2);
6
+ cudaEventRecord(bones_start2,0);
@@ -0,0 +1,10 @@
1
+
2
+ // Stop the timer for the measurement of the kernel execution time
3
+ cudaThreadSynchronize();
4
+ cudaEvent_t bones_stop2;
5
+ cudaEventCreate(&bones_stop2);
6
+ cudaEventRecord(bones_stop2,0);
7
+ cudaEventSynchronize(bones_stop2);
8
+ float bones_timer2 = 0;
9
+ cudaEventElapsedTime(&bones_timer2,bones_start2,bones_stop2);
10
+ printf(">>>\t\t (<algorithm_basename>): Execution time [kernel ]: %.3lf ms \n", bones_timer2);
@@ -0,0 +1,3 @@
1
+
2
+ // Start the CUDA function
3
+ bones_prekernel_<algorithm_name>_0(<devicenames>, <argument_name>);
@@ -0,0 +1,105 @@
1
+ /* STARTDEF
2
+ void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>);
3
+ ENDDEF */
4
+ #define SHUFFLE_X 16
5
+ #define SHUFFLE_Y 16
6
+
7
+ // Start of the <algorithm_name> kernel
8
+ __global__ void bones_kernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
9
+ const int bones_global_id = blockIdx.x*blockDim.x + threadIdx.x;
10
+ if (bones_global_id < (<parallelism>)) {
11
+
12
+ // Calculate the global ID(s) based on the thread id
13
+ <ids>
14
+
15
+ // Start the computation
16
+ <algorithm_code1>
17
+ }
18
+ }
19
+
20
+ // Start of the <algorithm_name> kernel (pre-kernel for shuffling) - for first input
21
+ __global__ void bones_kernel_<algorithm_name>_1(<in0_type><in0_devicepointer> <in0_name>, <in0_type><in0_devicepointer> shuffled_<in0_name>, <argument_definition>) {
22
+ const int bones_global_id_0 = blockIdx.x*blockDim.x + threadIdx.x;
23
+ const int bones_global_id_1 = blockIdx.y*blockDim.y + threadIdx.y;
24
+
25
+ // Set-up the local memory for shuffling
26
+ __shared__ <in0_type> buffer[SHUFFLE_X][SHUFFLE_Y];
27
+
28
+ // Swap the x and y coordinates to perform the rotation (coalesced)
29
+ if (bones_global_id_0 < ((<in0_dimensions>)/(<in0_parameters>)) && bones_global_id_1 < (<in0_parameters>)) {
30
+ buffer[threadIdx.y][threadIdx.x] = <in0_name>[bones_global_id_0 + bones_global_id_1 * ((<in0_dimensions>)/(<in0_parameters>))];
31
+ }
32
+
33
+ // Synchronize all threads in the threadblock
34
+ __syncthreads();
35
+
36
+ // We don't have to swap the x and y thread indices here, because that's already done in the local memory
37
+ const int bones_global_id_0_new = blockIdx.y*blockDim.y + threadIdx.x;
38
+ const int bones_global_id_1_new = blockIdx.x*blockDim.x + threadIdx.y;
39
+
40
+ // Store the shuffled result (coalesced)
41
+ if (bones_global_id_0_new < ((<in0_dimensions>)/(<in0_parameters>)) && bones_global_id_1_new < (<in0_parameters>)) {
42
+ shuffled_<in0_name>[bones_global_id_0_new + bones_global_id_1_new * <in0_parameters>] = buffer[threadIdx.x][threadIdx.y];
43
+ }
44
+ }
45
+
46
+ // Start of the <algorithm_name> kernel (pre-kernel for shuffling) - for second input
47
+ __global__ void bones_kernel_<algorithm_name>_2(<in1_type><in1_devicepointer> <in1_name>, <in1_type><in1_devicepointer> shuffled_<in1_name>, <argument_definition>) {
48
+ const int bones_global_id_0 = blockIdx.x*blockDim.x + threadIdx.x;
49
+ const int bones_global_id_1 = blockIdx.y*blockDim.y + threadIdx.y;
50
+
51
+ // Set-up the local memory for shuffling
52
+ __shared__ <in1_type> buffer[SHUFFLE_X][SHUFFLE_Y];
53
+
54
+ // Swap the x and y coordinates to perform the rotation (coalesced)
55
+ if (bones_global_id_0 < ((<in1_dimensions>)/(<in1_parameters>)) && bones_global_id_1 < (<in1_parameters>)) {
56
+ buffer[threadIdx.y][threadIdx.x] = <in1_name>[bones_global_id_0 + bones_global_id_1 * ((<in1_dimensions>)/(<in1_parameters>))];
57
+ }
58
+
59
+ // Synchronize all threads in the threadblock
60
+ __syncthreads();
61
+
62
+ // We don't have to swap the x and y thread indices here, because that's already done in the local memory
63
+ const int bones_global_id_0_new = blockIdx.y*blockDim.y + threadIdx.x;
64
+ const int bones_global_id_1_new = blockIdx.x*blockDim.x + threadIdx.y;
65
+
66
+ // Store the shuffled result (coalesced)
67
+ if (bones_global_id_0_new < ((<in1_dimensions>)/(<in1_parameters>)) && bones_global_id_1_new < (<in1_parameters>)) {
68
+ shuffled_<in1_name>[bones_global_id_0_new + bones_global_id_1_new * <in1_parameters>] = buffer[threadIdx.x][threadIdx.y];
69
+ }
70
+ }
71
+
72
+ // Function to start the kernel
73
+ extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
74
+ int bones_block_size;
75
+ if (<parallelism> >= 64*512) { bones_block_size = 512;}
76
+ else if (<parallelism> >= 64*256) { bones_block_size = 256;}
77
+ else if (<parallelism> >= 64*128) { bones_block_size = 128;}
78
+ else if (<parallelism> >= 64*64 ) { bones_block_size = 64; }
79
+ else { bones_block_size = 32; }
80
+
81
+ // First perform some pre-shuffling (for the first input)
82
+ <in0_type>* shuffled_<in0_name> = 0;
83
+ cudaMalloc((void**)&shuffled_<in0_name>, <in0_dimensions>*sizeof(<in0_type>));
84
+ dim3 bones_threads1(SHUFFLE_X,SHUFFLE_Y);
85
+ dim3 bones_grid1(DIV_CEIL(((<in0_dimensions>)/(<in0_parameters>)),SHUFFLE_X),DIV_CEIL(<in0_parameters>,SHUFFLE_Y));
86
+ bones_kernel_<algorithm_name>_1<<< bones_grid1, bones_threads1 >>>(<in0_name>, shuffled_<in0_name>, <argument_name>);
87
+ <in0_type>* temp_<in0_name> = <in0_name>;
88
+ <in0_name> = shuffled_<in0_name>;
89
+ cudaFree(temp_<in0_name>);
90
+
91
+ // First perform some pre-shuffling (for the second input)
92
+ <in0_type>* shuffled_<in1_name> = 0;
93
+ cudaMalloc((void**)&shuffled_<in1_name>, <in1_dimensions>*sizeof(<in1_type>));
94
+ dim3 bones_threads2(SHUFFLE_X,SHUFFLE_Y);
95
+ dim3 bones_grid2(DIV_CEIL(((<in1_dimensions>)/(<in1_parameters>)),SHUFFLE_X),DIV_CEIL(<in1_parameters>,SHUFFLE_Y));
96
+ bones_kernel_<algorithm_name>_2<<< bones_grid2, bones_threads2 >>>(<in1_name>, shuffled_<in1_name>, <argument_name>);
97
+ <in1_type>* temp_<in1_name> = <in1_name>;
98
+ <in1_name> = shuffled_<in1_name>;
99
+ cudaFree(temp_<in1_name>);
100
+
101
+ // Then run the original kernel
102
+ dim3 bones_threads0(bones_block_size);
103
+ dim3 bones_grid0(DIV_CEIL(<parallelism>,bones_block_size));
104
+ bones_kernel_<algorithm_name>_0<<< bones_grid0, bones_threads0 >>>(<names>, <argument_name>);
105
+ }
@@ -0,0 +1,3 @@
1
+
2
+ // Start the CUDA function
3
+ bones_prekernel_<algorithm_name>_0(<devicenames>, <argument_name>);
@@ -0,0 +1,119 @@
1
+ /* STARTDEF
2
+ void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>);
3
+ ENDDEF */
4
+ // Start of the <algorithm_name> kernel (main, not unrolled kernel)
5
+ __global__ void bones_kernel_<algorithm_name>_0(int bones_input_size, <in0_type><in0_devicepointer> <in0_name>, <out0_type><out0_devicepointer> <out0_name>, <argument_definition>) {
6
+ const int bones_threadblock_work = DIV_CEIL(bones_input_size,gridDim.x);
7
+ const int bones_parallel_work = BONES_MIN(blockDim.x,bones_threadblock_work);
8
+ const int bones_sequential_work = DIV_CEIL(bones_threadblock_work,bones_parallel_work);
9
+ const int bones_local_id = threadIdx.x;
10
+ const int bones_global_id = blockIdx.x*bones_parallel_work + threadIdx.x;
11
+ <ids>
12
+ int bones_iter_id = <in0_flatindex>;
13
+
14
+ // Load data into thread private memory and perform the first computation(s) sequentially
15
+ <in0_type> bones_temporary = <in0_name>[bones_iter_id];
16
+ <in0_type> bones_private_memory = <algorithm_code3>;
17
+ for(int c=1; c<bones_sequential_work; c++) {
18
+ bones_iter_id = bones_iter_id + bones_parallel_work*gridDim.x<factors>;
19
+ if (bones_iter_id <= <in0_to>) {
20
+ bones_temporary = <in0_name>[bones_iter_id];
21
+ bones_private_memory = <algorithm_code1>;
22
+ }
23
+ }
24
+
25
+ // Initialize the local memory
26
+ volatile __shared__ <in0_type> bones_local_memory[512];
27
+ bones_local_memory[bones_local_id] = bones_private_memory;
28
+ __syncthreads();
29
+
30
+ // Perform the remainder of the computations in parallel using a parallel reduction tree
31
+ int bones_offset_id;
32
+ for (int c=512; c>=2; c=c>>1) {
33
+ if ((2*bones_parallel_work > c) && (threadIdx.x < c/2)) {
34
+ bones_offset_id = threadIdx.x+c/2;
35
+ if (bones_offset_id < bones_parallel_work) {
36
+ __syncthreads();
37
+ bones_local_memory[bones_local_id] = <algorithm_code2>;
38
+ }
39
+ }
40
+ __syncthreads();
41
+ }
42
+
43
+ // Write the final result back to the global memory
44
+ if (threadIdx.x == 0) { <out0_name>[blockIdx.x] = bones_local_memory[0]; }
45
+ }
46
+
47
+ // Start of the <algorithm_name> kernel (secondary, not unrolled kernel)
48
+ __global__ void bones_kernel_<algorithm_name>_1(<in0_type><in0_devicepointer> <in0_name>, <out0_type><out0_devicepointer> <out0_name>, <argument_definition>) {
49
+ const int bones_local_id = threadIdx.x;
50
+ const int bones_global_id = threadIdx.x;
51
+
52
+ // Initialize the local memory
53
+ volatile __shared__ <in0_type> bones_local_memory[512];
54
+ bones_local_memory[bones_local_id] = <in0_name>[bones_global_id];
55
+ __syncthreads();
56
+
57
+ // Perform reduction using a parallel reduction tree
58
+ int bones_offset_id;
59
+ for (int c=128; c>=2; c=c>>1) {
60
+ if (threadIdx.x < c/2) {
61
+ bones_offset_id = threadIdx.x+c/2;
62
+ bones_local_memory[bones_local_id] = <algorithm_code2>;
63
+ __syncthreads();
64
+ }
65
+ }
66
+
67
+ // Write the final result back to the global memory
68
+ if (threadIdx.x == 0) { <out0_name>[0] = bones_local_memory[0]; }
69
+ }
70
+
71
+ // Start of the <algorithm_name> kernel (final, initial value kernel)
72
+ __global__ void bones_kernel_<algorithm_name>_2(<out0_type><out0_devicepointer> bones_initial_value, <out0_type><out0_devicepointer> <out0_name>, <argument_definition>) {
73
+ <out0_type> bones_private_memory = <out0_name>[0];
74
+ <out0_type> bones_temporary = bones_initial_value[0];
75
+ <out0_name>[0] = <algorithm_code4>;
76
+ }
77
+
78
+ // Function to start the kernel
79
+ extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
80
+
81
+ // Store the initial value
82
+ <out0_type>* bones_initial_value = 0;
83
+ cudaMalloc(&bones_initial_value, sizeof(<out0_type>));
84
+ cudaMemcpy(bones_initial_value, <out0_name>, sizeof(<out0_type>), cudaMemcpyDeviceToDevice);
85
+
86
+ // Run either one kernel or multiple kernels
87
+ if (<in0_dimensions> <= 1024) {
88
+
89
+ // Start only one kernel
90
+ const int bones_num_threads = DIV_CEIL(<in0_dimensions>,2);
91
+ dim3 bones_threads(bones_num_threads);
92
+ dim3 bones_grid(1);
93
+ bones_kernel_<algorithm_name>_0<<< bones_grid, bones_threads >>>(<in0_dimensions>,<in0_name>,<out0_name>,<argument_name>);
94
+ }
95
+ else {
96
+
97
+ // Allocate space for an intermediate array
98
+ <out0_type>* bones_device_temp = 0;
99
+ cudaMalloc(&bones_device_temp, 128*sizeof(<out0_type>));
100
+
101
+ // Start the first kernel
102
+ dim3 bones_threads1(512);
103
+ dim3 bones_grid1(128);
104
+ bones_kernel_<algorithm_name>_0<<< bones_grid1, bones_threads1 >>>(<in0_dimensions>,<in0_name>,bones_device_temp,<argument_name>);
105
+
106
+ // Start the second kernel
107
+ dim3 bones_threads2(128);
108
+ dim3 bones_grid2(1);
109
+ bones_kernel_<algorithm_name>_1<<< bones_grid2, bones_threads2 >>>(bones_device_temp,<out0_name>,<argument_name>);
110
+
111
+ cudaFree(bones_device_temp);
112
+ }
113
+
114
+ // Perform the last computation (only needed if there is an initial value)
115
+ dim3 bones_threads3(1);
116
+ dim3 bones_grid3(1);
117
+ bones_kernel_<algorithm_name>_2<<< bones_grid3, bones_threads3 >>>(bones_initial_value,<out0_name>,<argument_name>);
118
+ cudaFree(bones_initial_value);
119
+ }
@@ -0,0 +1,3 @@
1
+
2
+ // Start the CUDA function
3
+ bones_prekernel_<algorithm_name>_0(<devicenames>, <argument_name>);
@@ -0,0 +1,166 @@
1
+ /* STARTDEF
2
+ void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>);
3
+ ENDDEF */
4
+
5
+ template<int SCALE>
6
+ __global__ void bones_kernel_<algorithm_name>_0(int *<in0_name>_index, <in0_type> *<in0_name>_value, int *<out0_name>, int const votecount)
7
+ {
8
+ int nbins = <out0_dimension0_sum>;
9
+ int nbins_part = ceilf((float)nbins / gridDim.y);
10
+ int part_offset = blockIdx.y * nbins_part;
11
+
12
+ //init temp. vote line in shared memory
13
+ extern __shared__ int votespace_line[];
14
+ for(int i=threadIdx.x; i<nbins_part*SCALE; i+=1024)
15
+ votespace_line[i] = 0;
16
+ __syncthreads();
17
+
18
+ // calculate start and stop index of input for sub-vote spaces
19
+ int start_index = blockIdx.z *votecount/gridDim.z + threadIdx.x;
20
+ int stop_index = min( (blockIdx.z+1)*votecount/gridDim.z , votecount);
21
+
22
+ for(int i=start_index; i<stop_index; i+=1024)
23
+ {
24
+ //int arr_val_index = <in0_name>_index[i];
25
+ <in0_type> arr_val_value = <in0_name>_value[i];
26
+ int vote_index = (int)((arr_val_value & 0x00FF) * (nbins / 256.0f));
27
+ vote_index = SCALE*vote_index + (threadIdx.x & (SCALE-1)) - part_offset;
28
+ int vote_value = 1; // Vote value
29
+ if(vote_index<(nbins_part*SCALE) && vote_index>=0)
30
+ atomicAdd(&votespace_line[vote_index], vote_value);
31
+ }
32
+ __syncthreads();
33
+
34
+ for(int i=threadIdx.x; i<nbins_part; i+=1024)
35
+ {
36
+ int value=0;
37
+ #pragma unroll
38
+ for(int j=0; j<SCALE; j++)
39
+ value += votespace_line[SCALE*i+j];
40
+
41
+ <out0_name>[blockIdx.z*nbins*gridDim.x +
42
+ blockIdx.x*nbins +
43
+ blockIdx.y*nbins_part + i] = value;
44
+ }
45
+ }
46
+
47
+ __global__ void bones_kernel_<algorithm_name>_1(int *in, int *out, int const num_subvotespaces, int const nbins)
48
+ {
49
+ // Identify the thread
50
+ int p = blockIdx.x*blockDim.x + threadIdx.x;
51
+ if(p>nbins)
52
+ return;
53
+
54
+ // Sum the sub-votespaces
55
+ int result = 0;
56
+ #pragma unroll
57
+ for (int i=0;i<num_subvotespaces;i++) {
58
+ result += in[blockIdx.y*num_subvotespaces*nbins + i*nbins + p];
59
+ }
60
+
61
+ // Write the results to off-chip memory
62
+ out[blockIdx.y*nbins + p] = result;
63
+ }
64
+
65
+ // Function to start the kernel
66
+ extern "C" void bones_prekernel_<algorithm_name>_0(<devicedefinitions>, <argument_definition>) {
67
+ int * gpu_array_index = 0;
68
+ <in0_type> *gpu_array_value = <in0_name>;
69
+ int cpu_votecount = <in0_dimensions>;
70
+ int *gpu_votespace = (int*)<out0_name>;
71
+ int *gpu_temp = 0;
72
+
73
+ int nbins = <out0_dimension0_sum>;
74
+ int number_multiprocessors = 14;
75
+ int nbingroups = 1;
76
+
77
+ int scaling=8192/nbins;
78
+ int split_in_parts = 1;
79
+ int subvotespaces = 1;
80
+ int *gpu_out;
81
+
82
+ //calculate the scaling factor, and limit it to the values 1, 2, 4, 8, 16, 32, 64 and 128
83
+ if(scaling < 1) {
84
+ //too many bins requested, no scaling but splitting
85
+ scaling = 1;
86
+ split_in_parts = ceil(nbins / 8192.0f);
87
+ }
88
+ else if (scaling > 256) {
89
+ scaling = 256;
90
+ }
91
+ else {
92
+ int mask = 8192;
93
+ while(0 == (mask & scaling))
94
+ mask >>= 1;
95
+ scaling = mask;
96
+ }
97
+
98
+ if( (nbingroups*split_in_parts) < number_multiprocessors) {
99
+ int const maxsub = ceil((float)(<in0_dimensions>) / (float)(32*250));
100
+ cudaMalloc((void**)&gpu_temp, maxsub*nbingroups*nbins*sizeof(int));
101
+ if (gpu_temp != NULL) {
102
+ subvotespaces = number_multiprocessors / (nbingroups*split_in_parts);
103
+ gpu_out = gpu_temp;
104
+ }
105
+ else {
106
+ gpu_out = gpu_votespace;
107
+ }
108
+ }
109
+ else
110
+ {
111
+ gpu_out = gpu_votespace;
112
+ }
113
+
114
+ //scaling = 256;
115
+ //printf("%d %d %d %d %d\n", nbins, scaling, nbingroups, split_in_parts, subvotespaces);
116
+
117
+ dim3 dimensionsBlock1(1024);
118
+ dim3 dimensionsGrid1(nbingroups, split_in_parts, subvotespaces);
119
+ int const nbins_part = ceilf((float)nbins / split_in_parts);
120
+
121
+ switch(scaling) {
122
+ case 256:
123
+ bones_kernel_<algorithm_name>_0<256><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
124
+ (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
125
+ break;
126
+ case 128:
127
+ bones_kernel_<algorithm_name>_0<128><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
128
+ (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
129
+ break;
130
+ case 64:
131
+ bones_kernel_<algorithm_name>_0< 64><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
132
+ (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
133
+ break;
134
+ case 32:
135
+ bones_kernel_<algorithm_name>_0< 32><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
136
+ (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
137
+ break;
138
+ case 16:
139
+ bones_kernel_<algorithm_name>_0< 16><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
140
+ (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
141
+ break;
142
+ case 8:
143
+ bones_kernel_<algorithm_name>_0< 8><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
144
+ (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
145
+ break;
146
+ case 4:
147
+ bones_kernel_<algorithm_name>_0< 4><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
148
+ (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
149
+ break;
150
+ case 2:
151
+ bones_kernel_<algorithm_name>_0< 2><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
152
+ (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
153
+ break;
154
+ default:
155
+ bones_kernel_<algorithm_name>_0< 1><<<dimensionsGrid1, dimensionsBlock1, scaling*nbins_part*sizeof(int)>>>
156
+ (gpu_array_index, gpu_array_value, gpu_out, cpu_votecount);
157
+ break;
158
+ }
159
+
160
+ if(subvotespaces > 1) {
161
+ dim3 dimensionsBlock2(min(nbins,1024));
162
+ dim3 dimensionsGrid2(ceil((float)nbins/(float)1024), nbingroups);
163
+ bones_kernel_<algorithm_name>_1<<<dimensionsGrid2, dimensionsBlock2>>>(gpu_out, gpu_votespace, subvotespaces, nbins);
164
+ cudaFree(gpu_temp);
165
+ }
166
+ }