bones-compiler 1.1.0 → 1.3.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (211) hide show
  1. checksums.yaml +15 -0
  2. data/CHANGELOG +37 -0
  3. data/LICENSE +1 -1
  4. data/README.rdoc +95 -70
  5. data/Rakefile +78 -3
  6. data/VERSION +1 -1
  7. data/bin/adarwin +17 -0
  8. data/examples/benchmarks/PolyBench/2mm.c +104 -0
  9. data/examples/benchmarks/{3mm.c → PolyBench/3mm.c} +5 -2
  10. data/examples/benchmarks/{adi.c → PolyBench/adi.c} +6 -3
  11. data/examples/benchmarks/{atax.c → PolyBench/atax.c} +5 -2
  12. data/examples/benchmarks/{bicg.c → PolyBench/bicg.c} +5 -2
  13. data/examples/benchmarks/{cholesky.c → PolyBench/cholesky.c} +3 -0
  14. data/examples/benchmarks/{common.h → PolyBench/common.h} +2 -2
  15. data/examples/benchmarks/{correlation.c → PolyBench/correlation.c} +16 -7
  16. data/examples/benchmarks/{covariance.c → PolyBench/covariance.c} +7 -2
  17. data/examples/benchmarks/{doitgen.c → PolyBench/doitgen.c} +5 -2
  18. data/examples/benchmarks/{durbin.c → PolyBench/durbin.c} +3 -0
  19. data/examples/benchmarks/{dynprog.c → PolyBench/dynprog.c} +3 -0
  20. data/examples/benchmarks/{fdtd-2d-apml.c → PolyBench/fdtd-2d-apml.c} +3 -0
  21. data/examples/benchmarks/{fdtd-2d.c → PolyBench/fdtd-2d.c} +5 -2
  22. data/examples/benchmarks/{floyd-warshall.c → PolyBench/floyd-warshall.c} +3 -0
  23. data/examples/benchmarks/{gemm.c → PolyBench/gemm.c} +5 -2
  24. data/examples/benchmarks/{gemver.c → PolyBench/gemver.c} +5 -2
  25. data/examples/benchmarks/{gesummv.c → PolyBench/gesummv.c} +5 -2
  26. data/examples/benchmarks/{gramschmidt.c → PolyBench/gramschmidt.c} +3 -0
  27. data/examples/benchmarks/{jacobi-1d-imper.c → PolyBench/jacobi-1d-imper.c} +10 -2
  28. data/examples/benchmarks/{jacobi-2d-imper.c → PolyBench/jacobi-2d-imper.c} +8 -3
  29. data/examples/benchmarks/{lu.c → PolyBench/lu.c} +3 -0
  30. data/examples/benchmarks/{ludcmp.c → PolyBench/ludcmp.c} +3 -0
  31. data/examples/benchmarks/{mvt.c → PolyBench/mvt.c} +6 -2
  32. data/examples/benchmarks/{reg_detect.c → PolyBench/reg_detect.c} +3 -0
  33. data/examples/benchmarks/{seidel-2d.c → PolyBench/seidel-2d.c} +3 -0
  34. data/examples/benchmarks/{symm.c → PolyBench/symm.c} +3 -0
  35. data/examples/benchmarks/{syr2k.c → PolyBench/syr2k.c} +5 -2
  36. data/examples/benchmarks/{syrk.c → PolyBench/syrk.c} +7 -4
  37. data/examples/benchmarks/{trisolv.c → PolyBench/trisolv.c} +3 -0
  38. data/examples/benchmarks/{trmm.c → PolyBench/trmm.c} +3 -0
  39. data/examples/benchmarks/Rodinia/cfd.c +180 -0
  40. data/examples/benchmarks/Rodinia/hotspot.c +228 -0
  41. data/examples/benchmarks/Rodinia/kmeans.c +164 -0
  42. data/examples/benchmarks/Rodinia/srad.c +188 -0
  43. data/examples/benchmarks/other/common.h +0 -0
  44. data/examples/benchmarks/other/dct.c +58 -0
  45. data/examples/benchmarks/other/mm.c +50 -0
  46. data/examples/benchmarks/{saxpy.c → other/saxpy.c} +11 -7
  47. data/examples/chunk/{example1.c → example01.c} +0 -0
  48. data/examples/chunk/{example2.c → example02.c} +0 -0
  49. data/examples/chunk/{example3.c → example03.c} +0 -0
  50. data/examples/chunk/{example4.c → example04.c} +0 -0
  51. data/examples/chunk/{example5.c → example05.c} +0 -0
  52. data/examples/chunk/example06.c +45 -0
  53. data/examples/chunk/example07.c +49 -0
  54. data/examples/dependences/example01.c +42 -0
  55. data/examples/dependences/example02.c +40 -0
  56. data/examples/dependences/example03.c +43 -0
  57. data/examples/dependences/example04.c +44 -0
  58. data/examples/dependences/example05.c +42 -0
  59. data/examples/element/{example1.c → example01.c} +0 -0
  60. data/examples/element/{example2.c → example02.c} +2 -2
  61. data/examples/element/{example3.c → example03.c} +0 -0
  62. data/examples/element/{example4.c → example04.c} +0 -0
  63. data/examples/element/{example5.c → example05.c} +0 -0
  64. data/examples/element/{example6.c → example06.c} +0 -0
  65. data/examples/element/{example7.c → example07.c} +0 -0
  66. data/examples/element/{example8.c → example08.c} +0 -0
  67. data/examples/element/{example9.c → example09.c} +0 -0
  68. data/examples/element/example13.c +73 -0
  69. data/examples/fusion/example01.c +68 -0
  70. data/examples/fusion/example02.c +73 -0
  71. data/examples/fusion/example03.c +72 -0
  72. data/examples/fusion/example04.c +61 -0
  73. data/examples/fusion/example05.c +55 -0
  74. data/examples/neighbourhood/{example1.c → example01.c} +0 -0
  75. data/examples/neighbourhood/{example2.c → example02.c} +0 -0
  76. data/examples/neighbourhood/{example3.c → example03.c} +0 -0
  77. data/examples/neighbourhood/{example4.c → example04.c} +0 -0
  78. data/examples/neighbourhood/example05.c +44 -0
  79. data/examples/shared/{example1.c → example01.c} +0 -0
  80. data/examples/shared/{example2.c → example02.c} +0 -0
  81. data/examples/shared/{example3.c → example03.c} +0 -0
  82. data/examples/shared/{example4.c → example04.c} +0 -0
  83. data/examples/shared/{example5.c → example05.c} +0 -0
  84. data/lib/adarwin.rb +62 -0
  85. data/lib/adarwin/dependences.rb +268 -0
  86. data/lib/adarwin/engine.rb +277 -0
  87. data/lib/adarwin/fusion.rb +174 -0
  88. data/lib/adarwin/interval.rb +57 -0
  89. data/lib/adarwin/memorycopies.rb +153 -0
  90. data/lib/adarwin/nest.rb +225 -0
  91. data/lib/adarwin/preprocessor.rb +76 -0
  92. data/lib/adarwin/reference.rb +261 -0
  93. data/lib/bones.rb +4 -55
  94. data/lib/bones/algorithm.rb +77 -40
  95. data/lib/bones/copy.rb +26 -0
  96. data/lib/bones/engine.rb +147 -31
  97. data/lib/bones/preprocessor.rb +92 -12
  98. data/lib/bones/species.rb +4 -3
  99. data/lib/bones/structure.rb +14 -4
  100. data/lib/castaddon.rb +11 -6
  101. data/lib/castaddon/node_adarwin.rb +245 -0
  102. data/lib/castaddon/node_bones.rb +316 -0
  103. data/lib/castaddon/node_common.rb +289 -0
  104. data/lib/castaddon/transformations.rb +236 -0
  105. data/lib/common.rb +216 -0
  106. data/skeletons/CPU-C/common/header.c +3 -0
  107. data/skeletons/CPU-C/common/mem_global.c +0 -0
  108. data/skeletons/CPU-C/common/timer_2_start.c +11 -13
  109. data/skeletons/CPU-C/common/timer_2_stop.c +1 -1
  110. data/skeletons/CPU-C/common/timer_globals.c +29 -0
  111. data/skeletons/CPU-OPENCL-INTEL/common/globals.c +1 -1
  112. data/skeletons/CPU-OPENCL-INTEL/common/header.c +3 -0
  113. data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_D2H.c +7 -2
  114. data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_H2D.c +4 -2
  115. data/skeletons/CPU-OPENCL-INTEL/common/mem_global.c +0 -0
  116. data/skeletons/CPU-OPENCL-INTEL/common/mem_prologue.c +6 -3
  117. data/skeletons/CPU-OPENCL-INTEL/common/timer_2_stop.c +1 -1
  118. data/skeletons/CPU-OPENCL-INTEL/common/timer_globals.c +24 -0
  119. data/skeletons/CPU-OPENMP/common/globals.c +1 -0
  120. data/skeletons/CPU-OPENMP/common/header.c +3 -0
  121. data/skeletons/CPU-OPENMP/common/mem_global.c +0 -0
  122. data/skeletons/CPU-OPENMP/common/timer_1_start.c +0 -12
  123. data/skeletons/CPU-OPENMP/common/timer_2_stop.c +1 -1
  124. data/skeletons/CPU-OPENMP/common/timer_globals.c +33 -0
  125. data/skeletons/GPU-CUDA/common/globals.c +27 -3
  126. data/skeletons/GPU-CUDA/common/header.c +2 -0
  127. data/skeletons/GPU-CUDA/common/mem_async_alloc.c +6 -0
  128. data/skeletons/GPU-CUDA/common/mem_async_copyin.c +6 -0
  129. data/skeletons/GPU-CUDA/common/mem_async_copyout.c +6 -0
  130. data/skeletons/GPU-CUDA/common/mem_async_free.c +6 -0
  131. data/skeletons/GPU-CUDA/common/mem_copy_D2H.c +2 -1
  132. data/skeletons/GPU-CUDA/common/mem_copy_H2D.c +2 -1
  133. data/skeletons/GPU-CUDA/common/mem_global.c +1 -0
  134. data/skeletons/GPU-CUDA/common/mem_prologue.c +1 -2
  135. data/skeletons/GPU-CUDA/common/scheduler.c +86 -0
  136. data/skeletons/GPU-CUDA/common/timer_2_start.c +2 -4
  137. data/skeletons/GPU-CUDA/common/timer_2_stop.c +3 -5
  138. data/skeletons/GPU-CUDA/common/timer_globals.c +26 -0
  139. data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.kernel.cu +5 -7
  140. data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.kernel.cu +4 -6
  141. data/skeletons/GPU-CUDA/kernel/default.host.c +1 -1
  142. data/skeletons/GPU-CUDA/kernel/default.kernel.cu +6 -8
  143. data/skeletons/GPU-CUDA/skeletons.txt +6 -5
  144. data/{examples/benchmarks/2mm.c → test/examples/benchmarks/PolyBench/2mm_species.c} +19 -15
  145. data/test/examples/benchmarks/PolyBench/3mm_species.c +82 -0
  146. data/test/examples/benchmarks/PolyBench/adi_species.c +89 -0
  147. data/test/examples/benchmarks/PolyBench/atax_species.c +69 -0
  148. data/test/examples/benchmarks/PolyBench/bicg_species.c +71 -0
  149. data/test/examples/benchmarks/PolyBench/cholesky_species.c +68 -0
  150. data/test/examples/benchmarks/PolyBench/correlation_species.c +97 -0
  151. data/test/examples/benchmarks/PolyBench/covariance_species.c +78 -0
  152. data/test/examples/benchmarks/PolyBench/doitgen_species.c +67 -0
  153. data/test/examples/benchmarks/PolyBench/durbin_species.c +80 -0
  154. data/test/examples/benchmarks/PolyBench/dynprog_species.c +71 -0
  155. data/test/examples/benchmarks/PolyBench/fdtd-2d-apml_species.c +112 -0
  156. data/test/examples/benchmarks/PolyBench/fdtd-2d_species.c +78 -0
  157. data/test/examples/benchmarks/PolyBench/floyd-warshall_species.c +54 -0
  158. data/test/examples/benchmarks/PolyBench/gemm_species.c +73 -0
  159. data/test/examples/benchmarks/PolyBench/gemver_species.c +93 -0
  160. data/test/examples/benchmarks/PolyBench/gesummv_species.c +68 -0
  161. data/test/examples/benchmarks/PolyBench/gramschmidt_species.c +78 -0
  162. data/test/examples/benchmarks/PolyBench/jacobi-1d-imper_species.c +59 -0
  163. data/test/examples/benchmarks/PolyBench/jacobi-2d-imper_species.c +65 -0
  164. data/test/examples/benchmarks/PolyBench/lu_species.c +57 -0
  165. data/test/examples/benchmarks/PolyBench/ludcmp_species.c +89 -0
  166. data/test/examples/benchmarks/PolyBench/mvt_species.c +69 -0
  167. data/test/examples/benchmarks/PolyBench/reg_detect_species.c +86 -0
  168. data/test/examples/benchmarks/PolyBench/seidel-2d_species.c +53 -0
  169. data/test/examples/benchmarks/PolyBench/symm_species.c +74 -0
  170. data/test/examples/benchmarks/PolyBench/syr2k_species.c +69 -0
  171. data/test/examples/benchmarks/PolyBench/syrk_species.c +66 -0
  172. data/test/examples/benchmarks/PolyBench/trisolv_species.c +61 -0
  173. data/test/examples/benchmarks/PolyBench/trmm_species.c +61 -0
  174. data/test/examples/chunk/example01_species.c +58 -0
  175. data/test/examples/chunk/example02_species.c +48 -0
  176. data/test/examples/chunk/example03_species.c +63 -0
  177. data/test/examples/chunk/example04_species.c +58 -0
  178. data/test/examples/chunk/example05_species.c +56 -0
  179. data/test/examples/chunk/example06_species.c +49 -0
  180. data/test/examples/chunk/example07_species.c +53 -0
  181. data/test/examples/dependences/example01_species.c +46 -0
  182. data/test/examples/dependences/example02_species.c +44 -0
  183. data/test/examples/dependences/example03_species.c +47 -0
  184. data/test/examples/dependences/example04_species.c +48 -0
  185. data/test/examples/dependences/example05_species.c +46 -0
  186. data/test/examples/element/example01_species.c +50 -0
  187. data/test/examples/element/example02_species.c +50 -0
  188. data/test/examples/element/example03_species.c +62 -0
  189. data/test/examples/element/example04_species.c +53 -0
  190. data/test/examples/element/example05_species.c +59 -0
  191. data/test/examples/element/example06_species.c +50 -0
  192. data/test/examples/element/example07_species.c +58 -0
  193. data/test/examples/element/example08_species.c +49 -0
  194. data/test/examples/element/example09_species.c +52 -0
  195. data/test/examples/element/example10_species.c +54 -0
  196. data/test/examples/element/example11_species.c +51 -0
  197. data/test/examples/element/example12_species.c +60 -0
  198. data/test/examples/element/example13_species.c +77 -0
  199. data/test/examples/neighbourhood/example01_species.c +57 -0
  200. data/test/examples/neighbourhood/example02_species.c +56 -0
  201. data/test/examples/neighbourhood/example03_species.c +83 -0
  202. data/test/examples/neighbourhood/example04_species.c +55 -0
  203. data/test/examples/neighbourhood/example05_species.c +48 -0
  204. data/test/examples/shared/example01_species.c +49 -0
  205. data/test/examples/shared/example02_species.c +55 -0
  206. data/test/examples/shared/example03_species.c +59 -0
  207. data/test/examples/shared/example04_species.c +56 -0
  208. data/test/examples/shared/example05_species.c +52 -0
  209. metadata +193 -73
  210. data/examples/benchmarks/overview.txt +0 -38
  211. data/lib/castaddon/node.rb +0 -753
data/lib/bones/copy.rb ADDED
@@ -0,0 +1,26 @@
1
+
2
+ module Bones
3
+
4
+ # Class copyin/out
5
+ class Copy
6
+ attr_accessor :name, :domain, :deadline, :direction, :id
7
+
8
+ def initialize(name,domain,deadline,direction,id)
9
+ @name = name
10
+ @domain = domain
11
+ @deadline = deadline
12
+ @direction = direction
13
+ @id = id
14
+ end
15
+
16
+ def get_definition(array_definition,type)
17
+ array_definition = '' if type == 'free' || type == 'alloc'
18
+ 'void bones_'+type+'_'+@id+'_'+@name+'('+array_definition+');'
19
+ end
20
+
21
+ def get_function_call(type)
22
+ 'bones_'+type+'_'+@id+'_'+@name+'();'
23
+ end
24
+ end
25
+
26
+ end
data/lib/bones/engine.rb CHANGED
@@ -17,13 +17,17 @@ module Bones
17
17
  # A list of timer files to be found in the skeleton library.
18
18
  TIMER_FILES = ['timer_1_start','timer_1_stop','timer_2_start','timer_2_stop']
19
19
  # A list of files to be found in the common directory of the skeleton library (excluding timer files).
20
- COMMON_FILES = ['prologue','epilogue','mem_prologue','mem_copy_H2D','mem_copy_D2H','mem_epilogue']
20
+ COMMON_FILES = ['prologue','epilogue','mem_prologue','mem_copy_H2D','mem_copy_D2H','mem_epilogue','mem_global']
21
21
  # The name of the file containing the globals as found in the skeleton library
22
22
  COMMON_GLOBALS = 'globals'
23
23
  # The name of the file containing the header file for the original C code as found in the skeleton library
24
24
  COMMON_HEADER = 'header'
25
25
  # The name of the file containing the globals for the kernel files as found in the skeleton library
26
26
  COMMON_GLOBALS_KERNEL = 'globals_kernel'
27
+ # The name of the file containing the scheduler code
28
+ COMMON_SCHEDULER = 'scheduler'
29
+ # Global timers
30
+ GLOBAL_TIMERS = 'timer_globals'
27
31
 
28
32
  # The extension of a host file in the skeleton library. See also SKELETON_DEVICE.
29
33
  SKELETON_HOST = '.host'
@@ -54,13 +58,15 @@ module Bones
54
58
  # --help, -h: Show this message
55
59
  #
56
60
  def initialize
57
- @result = {:original_code => [],
58
- :header_code => [],
59
- :host_declarations => [],
60
- :host_code_lists => [],
61
- :algorithm_declarations => [],
62
- :algorithm_code_lists => [],
63
- :verify_code => []}
61
+ @result = {:original_code => [],
62
+ :header_code => [],
63
+ :host_declarations => [],
64
+ :host_code_lists => [],
65
+ :algorithm_declarations => [],
66
+ :algorithm_code_lists => [],
67
+ :verify_code => [],
68
+ :host_device_mem_globals => []}
69
+ @state = 0
64
70
 
65
71
  # Provides a list of possible targets (e.g. GPU-CUDA, 'CPU-OPENCL-INTEL').
66
72
  targets = []
@@ -86,6 +92,9 @@ module Bones
86
92
  opt :verify, 'Verify correctness of the generated code', :short => 'c', :default => false
87
93
  opt :only_alg_number, 'Only generate code for the x-th species (99 -> all)', :short => 'o', :type => Integer, :default => 99
88
94
  opt :merge_factor, 'Thread merge factor, default is 1 (==disabled)', :short => 'f', :type => Integer, :default => 1
95
+ opt :register_caching,'Enable register caching: 1:enabled (default), 0:disabled', :short => 'r', :type => Integer, :default => 1
96
+ opt :zero_copy ,'Enable OpenCL zero-copy: 1:enabled (default), 0:disabled', :short => 'z', :type => Integer, :default => 1
97
+ opt :skeletons ,'Enable non-default skeletons: 1:enabled (default), 0:disabled', :short => 's', :type => Integer, :default => 1
89
98
  end
90
99
  Trollop::die 'no input file supplied (use: --application)' if !@options[:application_given]
91
100
  Trollop::die 'no target supplied (use: --target)' if !@options[:target_given]
@@ -103,6 +112,12 @@ module Bones
103
112
  # Set a prefix for functions called from the original file but defined in a host file
104
113
  @prefix = (@options[:target] == 'GPU-CUDA') ? '' : ''
105
114
 
115
+ # Setting to include the scheduler (CUDA only)
116
+ @scheduler = (@options[:target] == 'GPU-CUDA') ? true : false
117
+
118
+ # Skip analyse passes for certain targets
119
+ @skiptarget = false #(@options[:target] == 'PAR4ALL') ? true : false
120
+
106
121
  # Set the location for the skeleton library
107
122
  @dir = {}
108
123
  @dir[:library] = File.join(BONES_DIR_SKELETONS,@options[:target])
@@ -125,7 +140,7 @@ module Bones
125
140
  def process
126
141
 
127
142
  # Run the preprocessor
128
- preprocessor = Bones::Preprocessor.new(@source,File.dirname(@options[:application]),@basename)
143
+ preprocessor = Bones::Preprocessor.new(@source,File.dirname(@options[:application]),@basename,@scheduler)
129
144
  preprocessor.process
130
145
  @result[:header_code] = preprocessor.header_code
131
146
  @result[:device_header] = preprocessor.device_header
@@ -137,11 +152,20 @@ module Bones
137
152
  parser.type_names << 'size_t'
138
153
  ast = parser.parse(preprocessor.target_code)
139
154
  ast.preprocess
155
+
156
+ # Add the scheduler's global code
157
+ if @scheduler
158
+ @result[:host_code_lists].push(File.read(File.join(@dir[:common_library],COMMON_SCHEDULER+@extension)))
159
+ end
140
160
 
141
161
  # Set the algorithm's skeleton and generate the global code
142
162
  one_time = true
143
163
  preprocessor.algorithms.each_with_index do |algorithm,algorithm_number|
144
164
  algorithm.species.set_skeleton(File.join(@dir[:library],SKELETON_FILE))
165
+ if @options[:skeletons] == 0
166
+ algorithm.species.skeleton_name = 'default'
167
+ algorithm.species.settings.gsub!('10','00').gsub!('20','00').gsub!('30','00')
168
+ end
145
169
  if algorithm.species.skeleton_name && one_time
146
170
  @result[:host_code_lists].push(File.read(File.join(@dir[:common_library],COMMON_GLOBALS+@extension)))
147
171
  @result[:algorithm_code_lists].push(File.read(File.join(@dir[:common_library],COMMON_GLOBALS_KERNEL+@extension)))
@@ -149,24 +173,60 @@ module Bones
149
173
  end
150
174
  end
151
175
 
152
- # Perform code generation
176
+ # Perform code generation (per-species code)
153
177
  @result[:original_code] = ast
178
+ arrays = []
154
179
  preprocessor.algorithms.each_with_index do |algorithm,algorithm_number|
155
180
  if @options[:only_alg_number] == 99 || algorithm_number == [@options[:only_alg_number],preprocessor.algorithms.length-1].min
156
181
  puts MESSAGE+'Starting code generation for algorithm "'+algorithm.name+'"'
157
182
  if algorithm.species.skeleton_name
158
183
  algorithm.merge_factor = @options[:merge_factor] if (@options[:target] == 'GPU-CUDA')
184
+ algorithm.register_caching_enabled = @options[:register_caching]
159
185
  algorithm.set_function(ast)
160
- algorithm.populate_variables(ast,preprocessor.defines)
186
+ algorithm.populate_variables(ast,preprocessor.defines) if !@skiptarget
161
187
  algorithm.populate_lists()
162
- algorithm.populate_hash()
188
+ algorithm.populate_hash() if !@skiptarget
163
189
  generate(algorithm)
164
190
  puts MESSAGE+'Code generated using the "'+algorithm.species.skeleton_name+'" skeleton'
191
+ arrays.concat(algorithm.arrays)
165
192
  else
166
193
  puts WARNING+'Skeleton "'+algorithm.species.name+'" not available'
167
194
  end
168
195
  end
169
196
  end
197
+
198
+ # Only if the scheduler is included
199
+ if @scheduler
200
+
201
+ # Perform code generation (sync statements)
202
+ @result[:host_declarations].push('void bones_synchronize(int bones_task_id);')
203
+
204
+ # Perform code generation (memory allocs)
205
+ allocs = []
206
+ preprocessor.copies.each do |copy|
207
+ if !allocs.include?(copy.name)
208
+ generate_memory('alloc',copy,arrays,0)
209
+ allocs << copy.name
210
+ end
211
+ end
212
+
213
+ # Perform code generation (memory copies)
214
+ preprocessor.copies.each_with_index do |copy,index|
215
+ #puts MESSAGE+'Generating copy code for array "'+copy.name+'"'
216
+ generate_memory('copy',copy,arrays,index)
217
+ end
218
+
219
+ # Perform code generation (memory frees)
220
+ frees = []
221
+ preprocessor.copies.each do |copy|
222
+ if !frees.include?(copy.name)
223
+ generate_memory('free',copy,arrays,0)
224
+ frees << copy.name
225
+ end
226
+ end
227
+
228
+ end
229
+
170
230
  end
171
231
 
172
232
  # This method writes the output code to files. It creates
@@ -202,7 +262,7 @@ module Bones
202
262
  end
203
263
  end
204
264
 
205
- # Populate the verification file4
265
+ # Populate the verification file
206
266
  if @options[:verify]
207
267
  File.open(File.join(directory,@options[:name]+OUTPUT_VERIFICATION+@extension),'w') do |verification|
208
268
  verification.puts @result[:header_code]
@@ -212,15 +272,22 @@ module Bones
212
272
  end
213
273
  end
214
274
 
215
- # Populate the target file
275
+ # Populate the target file (host)
216
276
  File.open(File.join(directory,@options[:name]+OUTPUT_HOST+@extension),'w') do |target|
277
+ target.puts '#include <cuda_runtime.h>'+NL if @options[:target] == 'GPU-CUDA'
278
+ target.puts "#define ZEROCOPY 0"+NL if @options[:zero_copy] == 0 && @options[:target] == 'CPU-OPENCL-INTEL'
279
+ target.puts "#define ZEROCOPY 1"+NL if @options[:zero_copy] == 1 && @options[:target] == 'CPU-OPENCL-INTEL'
217
280
  target.puts @result[:header_code]
218
- target.puts @result[:algorithm_declarations]
219
281
  target.puts
282
+ target.puts @result[:host_device_mem_globals]
283
+ target.puts
284
+ target.puts @result[:algorithm_declarations]
220
285
  target.puts @result[:host_code_lists]
286
+ target.puts
287
+ target.puts File.read(File.join(@dir[:common_library],GLOBAL_TIMERS+@extension))
221
288
  end
222
289
 
223
- # Populate the algorithm file
290
+ # Populate the algorithm file (device)
224
291
  File.open(File.join(directory,@options[:name]+OUTPUT_DEVICE+@algorithm_extension),'w') do |algorithm|
225
292
  algorithm.puts @result[:device_header]
226
293
  algorithm.puts @result[:algorithm_code_lists]
@@ -251,7 +318,7 @@ module Bones
251
318
  :device => File.read(file_name_device+@algorithm_extension)}
252
319
 
253
320
  # Perform the transformations on the algorithm's code
254
- algorithm.perform_transformations(algorithm.species.settings)
321
+ algorithm.perform_transformations(algorithm.species.settings) if !@skiptarget
255
322
 
256
323
  # Load the common skeletons from the skeleton library
257
324
  COMMON_FILES.each do |skeleton|
@@ -291,13 +358,19 @@ module Bones
291
358
  minihash = { :array => array.name,
292
359
  :type => array.type_name,
293
360
  :flatten => array.flatten,
294
- :variable_dimensions => array.size.join('*')}
361
+ :variable_dimensions => array.size.join('*'),
362
+ :state => @state.to_s}
363
+ @state += 1
295
364
 
296
365
  # Apply the mini-search-and-replace hash to create the memory allocations, memory copies (if input only), etc.
297
366
  processed[:mem_prologue] += search_and_replace(minihash,skeletons[:mem_prologue])
298
367
  processed[:mem_copy_H2D] += search_and_replace(minihash,skeletons[:mem_copy_H2D]) if array.input? || array.species.shared?
299
368
  processed[:mem_epilogue] += search_and_replace(minihash,skeletons[:mem_epilogue])
369
+
370
+ # Add the device declarations
371
+ @result[:host_device_mem_globals].push(search_and_replace(minihash,skeletons[:mem_global]))
300
372
  end
373
+
301
374
  # Iterate over all the array variables and create a mini-search-and-replace hash for each array (output arrays)
302
375
  algorithm.arrays.select(OUTPUT).each_with_index do |array, num_array|
303
376
  hash = algorithm.hash["out#{num_array}".to_sym]
@@ -305,7 +378,9 @@ module Bones
305
378
  :type => array.type_name,
306
379
  :flatten => array.flatten,
307
380
  :offset => '('+hash[:dimension0][:from]+')',
308
- :variable_dimensions => '('+hash[:dimensions]+')'}
381
+ :variable_dimensions => '('+hash[:dimensions]+')',
382
+ :state => @state.to_s}
383
+ @state += 1
309
384
 
310
385
  # Perform selective copy for arrays with 2 dimensions (uses a for-loop over the memory copies)
311
386
  if array.dimensions == 2 && @options[:target] == 'GPU-CUDA' && false
@@ -346,11 +421,17 @@ module Bones
346
421
  search_and_replace!(algorithm.hash,skeletons[:epilogue])
347
422
 
348
423
  # Construct the final host function, including the timers and memory copies
349
- host = skeletons[:prologue ] + skeletons[:timer_1_start] +
350
- processed[:mem_prologue ] + processed[:mem_copy_H2D ] +
351
- skeletons[:timer_2_start] + skeletons[:host ] + skeletons[:timer_2_stop ] +
352
- processed[:mem_copy_D2H ] + processed[:mem_epilogue ] +
353
- skeletons[:timer_1_stop ] + skeletons[:epilogue ]
424
+ if @scheduler
425
+ host = skeletons[:prologue ] +
426
+ skeletons[:timer_2_start] + skeletons[:host ] + skeletons[:timer_2_stop ] +
427
+ skeletons[:epilogue ]
428
+ else
429
+ host = skeletons[:prologue ] +
430
+ skeletons[:timer_1_start] + processed[:mem_prologue ] + processed[:mem_copy_H2D ] +
431
+ skeletons[:timer_2_start] + skeletons[:host ] + skeletons[:timer_2_stop ] +
432
+ processed[:mem_copy_D2H ] + processed[:mem_epilogue ] + skeletons[:timer_1_stop ] +
433
+ skeletons[:epilogue ]
434
+ end
354
435
 
355
436
  # Generate code to replace the original code, including verification code if specified by the option flag
356
437
  verify_skeleton = File.read(File.join(@dir[:verify_library],'verify_results.c'))
@@ -362,24 +443,59 @@ module Bones
362
443
  # Add a performance model to the original code
363
444
  #replacement_code.insert(0,algorithm.performance_model_code('model'))
364
445
 
365
- # Replace mallocs and frees in the original code with aligned memory allocations (only for CPU-OpenCL targets)
366
- if @options[:target] == 'CPU-OPENCL-INTEL'
367
- @result[:original_code].seach_and_replace_function_call(C::Variable.parse('malloc'),C::Variable.parse(VARIABLE_PREFIX+'malloc_128'))
368
- @result[:original_code].seach_and_replace_function_call(C::Variable.parse('free'),C::Variable.parse(VARIABLE_PREFIX+'free_128'))
446
+ # Replace mallocs and frees in the original code with aligned memory allocations (only for CPU-OpenCL targets with zero-copy)
447
+ if @options[:zero_copy] == 1 && @options[:target] == 'CPU-OPENCL-INTEL'
448
+ @result[:original_code].search_and_replace_function_call(C::Variable.parse('malloc'),C::Variable.parse(VARIABLE_PREFIX+'malloc_128'))
449
+ @result[:original_code].search_and_replace_function_call(C::Variable.parse('free'),C::Variable.parse(VARIABLE_PREFIX+'free_128'))
369
450
  end
370
451
 
371
452
  # Give the original main function a new name
372
- @result[:original_code].seach_and_replace_function_definition('main',VARIABLE_PREFIX+'main')
453
+ @result[:original_code].search_and_replace_function_definition('main',VARIABLE_PREFIX+'main')
373
454
 
374
455
  # Replace the original code with a function call to the newly generated code
375
- @result[:original_code].seach_and_replace_node(algorithm.code,replacement_code)
456
+ @result[:original_code].search_and_replace_node(algorithm.code,replacement_code)
376
457
 
377
458
  # The host code is generated, push the data to the output hashes
378
459
  accelerated_definition = 'void '+algorithm.name+'_accelerated('+algorithm.lists[:host_definition]+')'
379
- @result[:host_code_lists].push(@prefix+accelerated_definition+' {'+NL+host+NL+'}')
460
+ @result[:host_code_lists].push(@prefix+accelerated_definition+' {'+NL+host+NL+'}'+NL+NL)
380
461
  @result[:host_declarations].push(@prefix+accelerated_definition+';'+NL+@prefix+original_definition+';')
381
462
  end
382
463
 
464
+
465
+ def generate_memory(type,copy,arrays,index)
466
+
467
+ # Find the corresponding array
468
+ arrays.each do |array|
469
+ if array.name == copy.name && (array.direction == copy.direction || array.direction == INOUT)
470
+
471
+ # Load the skeleton from the skeleton library
472
+ type += copy.direction if type == 'copy'
473
+ skeleton = File.read(File.join(@dir[:common_library],'mem_async_'+type+@extension))
474
+
475
+ # Create the find-and-replace hash
476
+ minihash = { :array => copy.name,
477
+ :id => copy.id,
478
+ :index => index.to_s,
479
+ :direction => copy.direction,
480
+ :definition => array.definition,
481
+ :type => array.type_name,
482
+ :flatten => array.flatten,
483
+ :offset => '0',
484
+ :variable_dimensions => array.size.join('*'),
485
+ :state => copy.deadline}
486
+
487
+ # Instantiate the skeleton and add it to the final result
488
+ @result[:host_code_lists].push(search_and_replace(minihash,skeleton))
489
+
490
+ # Add a forward declaration of this function
491
+ @result[:host_declarations].push(copy.get_definition(array.definition,type))
492
+
493
+ # Done
494
+ return
495
+ end
496
+ end
497
+ end
498
+
383
499
  end
384
500
 
385
501
  end
@@ -9,7 +9,7 @@ module Bones
9
9
  # * +algorithms+ - An array of identified algorithms, each of class Bones::Algorithm.
10
10
  # * +target_code+ - The processed code containing no Bones directives nor other pre-processor directives (such as includes and defines).
11
11
  class Preprocessor < Common
12
- attr_reader :header_code, :algorithms, :target_code, :device_header, :defines
12
+ attr_reader :header_code, :algorithms, :target_code, :device_header, :defines, :scop, :copies
13
13
 
14
14
  # Denotes the start of an algorithmic species.
15
15
  IDENTIFIER = '#pragma species'
@@ -18,9 +18,22 @@ module Bones
18
18
  WHITESPACE = '\s*'
19
19
 
20
20
  # This directive denotes the start of an algorithm. It is based on the IDENTIFIER constant.
21
- PRIMITIVE_START = IDENTIFIER+' kernel'
21
+ SPECIES_START = IDENTIFIER+' kernel'
22
22
  # This directive denotes the end of an algorithm. It is based on the IDENTIFIER constant.
23
- PRIMITIVE_END = IDENTIFIER+' endkernel'
23
+ SPECIES_END = IDENTIFIER+' endkernel'
24
+
25
+ # Start of the scop
26
+ SCOP_START = '#pragma scop'
27
+ # End of the scop
28
+ SCOP_END = '#pragma endscop'
29
+
30
+ # Synchronise directive.
31
+ SYNC = IDENTIFIER+' sync'
32
+
33
+ # Copy in directive.
34
+ COPYIN = IDENTIFIER+ ' copyin'
35
+ # Copy out directive.
36
+ COPYOUT = IDENTIFIER+ ' copyout'
24
37
 
25
38
  # A regular expression that captures a prefix in an algorithm (e.g. unordered/multiple).
26
39
  REGEXP_PREFIX = /^[a-z]+ /
@@ -31,16 +44,18 @@ module Bones
31
44
  # This is the method which initializes the preprocessor.
32
45
  # Initialization requires the target source code to process,
33
46
  # which is then set as the instance variable +@source_code+.
34
- def initialize(source_code,directory,filename)
47
+ def initialize(source_code,directory,filename,scheduler)
35
48
  @source_code = source_code
36
- @target_code = ''
49
+ @target_code = []
37
50
  @header_code = ''
38
51
  @device_header = ''
39
52
  @directory = directory
40
53
  @filename = filename
41
54
  @algorithms = Array.new
55
+ @copies = Array.new
42
56
  @defines = {}
43
57
  @found_algorithms = 0
58
+ @scheduler = scheduler
44
59
  end
45
60
 
46
61
  # This is the method to perform the actual preprocessing.
@@ -51,6 +66,7 @@ module Bones
51
66
  algorithm_code = ''
52
67
  species = nil
53
68
  found = 0
69
+ alloc_index, free_index = 0, 0
54
70
 
55
71
  # Process the file line by line
56
72
  @source_code.each_line.with_index do |line,index|
@@ -71,7 +87,7 @@ module Bones
71
87
  @defines[match.first[0].to_sym] = match.first[1]
72
88
 
73
89
  # Found the start of algorithm marker
74
- elsif line =~ /^#{WHITESPACE}#{PRIMITIVE_START}/
90
+ elsif line =~ /^#{WHITESPACE}#{SPECIES_START}/
75
91
  if found == 0
76
92
  line = replace_defines(line,@defines)
77
93
  prefix, input, output = marker_to_algorithm(line)
@@ -80,28 +96,92 @@ module Bones
80
96
  @found_algorithms = @found_algorithms + 1
81
97
  end
82
98
  found = found + 1
99
+ #@target_code << "int bones_temp_species_start = '#{line.gsub(NL,'')}';"+NL
83
100
 
84
101
  # Found the end of algorithm marker
85
- elsif line =~ /^#{WHITESPACE}#{PRIMITIVE_END}/
102
+ elsif line =~ /^#{WHITESPACE}#{SPECIES_END}/
86
103
  if found == 1
87
- name = line.strip.scan(/^#{WHITESPACE}#{PRIMITIVE_END} (.+)/).join
104
+ name = line.strip.scan(/^#{WHITESPACE}#{SPECIES_END} (.+)/).join
88
105
  name = DEFAULT_NAME if name == ''
89
106
  @algorithms.push(Bones::Algorithm.new(name,@filename,index.to_s,species,algorithm_code))
90
107
  algorithm_code = ''
91
108
  end
92
109
  found = found - 1
110
+ #@target_code << "int bones_temp_species_end = '#{line.gsub(NL,'')}';"+NL
111
+
112
+ # Found a sync marker
113
+ elsif @scheduler && line =~ /^#{WHITESPACE}#{SYNC}/
114
+ sync = line.strip.scan(/^#{WHITESPACE}#{SYNC} (.+)/).join
115
+ @target_code << "bones_synchronize(#{sync});"+NL
116
+
117
+ # Found a copyin marker
118
+ elsif @scheduler && line =~ /^#{WHITESPACE}#{COPYIN}/
119
+ copies = line.strip.scan(/^#{WHITESPACE}#{COPYIN} (.+)/).join.split(WEDGE).map{ |c| c.strip }
120
+ copies.each_with_index do |copy,copynum|
121
+ name = copy.split('[').first
122
+ domain = copy.scan(/\[(.+)\]/).join.split(DIM_SEP)
123
+ deadline = copy.split('|').last
124
+ @copies.push(Bones::Copy.new(name,domain,deadline,'in',"#{index*100+copynum}"))
125
+ @target_code << "bones_copyin_#{index*100+copynum}_#{name}(#{name});"+NL
126
+ end
127
+
128
+ # Found a copyout marker
129
+ elsif @scheduler && line =~ /^#{WHITESPACE}#{COPYOUT}/
130
+ copies = line.strip.scan(/^#{WHITESPACE}#{COPYOUT} (.+)/).join.split(WEDGE).map{ |c| c.strip }
131
+ copies.each_with_index do |copy,copynum|
132
+ name = copy.split('[').first
133
+ domain = copy.scan(/\[(.+)\]/).join.split(DIM_SEP)
134
+ deadline = copy.split('|').last
135
+ @copies.push(Bones::Copy.new(name,domain,deadline,'out',"#{index*100+copynum}"))
136
+ @target_code << "bones_copyout_#{index*100+copynum}_#{name}(#{name});"+NL
137
+ end
138
+ end
139
+
140
+ # Check if it was a 'pragma scop' / 'pragma endscop' line
141
+ if line =~ /^#{WHITESPACE}#{SCOP_START}/
142
+ alloc_index = index
143
+ elsif line =~ /^#{WHITESPACE}#{SCOP_END}/
144
+ free_index = @target_code.length
93
145
  end
146
+
94
147
  else
95
148
  if found > 0
96
149
  algorithm_line = replace_defines(line,@defines)
97
- @target_code += algorithm_line
150
+ @target_code << algorithm_line
98
151
  algorithm_code += algorithm_line if line !~ /^#{WHITESPACE}#/
99
152
  else
100
- @target_code += line
153
+ @target_code << line
101
154
  end
102
155
  end
103
156
  end
104
- puts WARNING+'Begin/end kernel mismatch ('+@found_algorithms.to_s+' versus '+@algorithms.length.to_s+'), probably missing a "'+PRIMITIVE_END+'"' unless @algorithms.length == @found_algorithms
157
+ puts WARNING+'Begin/end kernel mismatch ('+@found_algorithms.to_s+' versus '+@algorithms.length.to_s+'), probably missing a "'+SPECIES_END+'"' unless @algorithms.length == @found_algorithms
158
+
159
+ # Add frees and mallocs
160
+ if @scheduler
161
+ alloc_code, free_code = '', ''
162
+ included_copies = []
163
+ copies.each do |copy|
164
+ if !included_copies.include?(copy.name)
165
+ alloc_code += copy.get_function_call('alloc')+NL
166
+ free_code += copy.get_function_call('free')+NL
167
+ included_copies << copy.name
168
+ end
169
+ end
170
+ end
171
+
172
+ # Add timers (whole scop timing) and frees/mallocs to the code
173
+ offset = @header_code.lines.count
174
+ @target_code.insert(alloc_index-offset, 'bones_timer_start();'+NL)
175
+ if @scheduler
176
+ @target_code.insert(alloc_index-offset+1, alloc_code)
177
+ @target_code.insert(free_index+2, free_code)
178
+ @target_code.insert(free_index+3, 'bones_timer_stop();'+NL)
179
+ else
180
+ @target_code.insert(free_index+2, 'bones_timer_stop();'+NL)
181
+ end
182
+
183
+ # Join the array
184
+ @target_code = @target_code.join('')
105
185
  end
106
186
 
107
187
  # This is the method to preprocess a header file. Currently,
@@ -143,7 +223,7 @@ module Bones
143
223
 
144
224
  # Method to extract the algorithm details from a marker found in code.
145
225
  def marker_to_algorithm(marker)
146
- algorithm = marker.strip.scan(/^#{WHITESPACE}#{PRIMITIVE_START} (.+)/).join
226
+ algorithm = marker.strip.scan(/^#{WHITESPACE}#{SPECIES_START} (.+)/).join
147
227
  prefix = ''
148
228
  if algorithm =~ REGEXP_PREFIX
149
229
  split = algorithm.partition(' ')