bones-compiler 1.1.0 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (211) hide show
  1. checksums.yaml +15 -0
  2. data/CHANGELOG +37 -0
  3. data/LICENSE +1 -1
  4. data/README.rdoc +95 -70
  5. data/Rakefile +78 -3
  6. data/VERSION +1 -1
  7. data/bin/adarwin +17 -0
  8. data/examples/benchmarks/PolyBench/2mm.c +104 -0
  9. data/examples/benchmarks/{3mm.c → PolyBench/3mm.c} +5 -2
  10. data/examples/benchmarks/{adi.c → PolyBench/adi.c} +6 -3
  11. data/examples/benchmarks/{atax.c → PolyBench/atax.c} +5 -2
  12. data/examples/benchmarks/{bicg.c → PolyBench/bicg.c} +5 -2
  13. data/examples/benchmarks/{cholesky.c → PolyBench/cholesky.c} +3 -0
  14. data/examples/benchmarks/{common.h → PolyBench/common.h} +2 -2
  15. data/examples/benchmarks/{correlation.c → PolyBench/correlation.c} +16 -7
  16. data/examples/benchmarks/{covariance.c → PolyBench/covariance.c} +7 -2
  17. data/examples/benchmarks/{doitgen.c → PolyBench/doitgen.c} +5 -2
  18. data/examples/benchmarks/{durbin.c → PolyBench/durbin.c} +3 -0
  19. data/examples/benchmarks/{dynprog.c → PolyBench/dynprog.c} +3 -0
  20. data/examples/benchmarks/{fdtd-2d-apml.c → PolyBench/fdtd-2d-apml.c} +3 -0
  21. data/examples/benchmarks/{fdtd-2d.c → PolyBench/fdtd-2d.c} +5 -2
  22. data/examples/benchmarks/{floyd-warshall.c → PolyBench/floyd-warshall.c} +3 -0
  23. data/examples/benchmarks/{gemm.c → PolyBench/gemm.c} +5 -2
  24. data/examples/benchmarks/{gemver.c → PolyBench/gemver.c} +5 -2
  25. data/examples/benchmarks/{gesummv.c → PolyBench/gesummv.c} +5 -2
  26. data/examples/benchmarks/{gramschmidt.c → PolyBench/gramschmidt.c} +3 -0
  27. data/examples/benchmarks/{jacobi-1d-imper.c → PolyBench/jacobi-1d-imper.c} +10 -2
  28. data/examples/benchmarks/{jacobi-2d-imper.c → PolyBench/jacobi-2d-imper.c} +8 -3
  29. data/examples/benchmarks/{lu.c → PolyBench/lu.c} +3 -0
  30. data/examples/benchmarks/{ludcmp.c → PolyBench/ludcmp.c} +3 -0
  31. data/examples/benchmarks/{mvt.c → PolyBench/mvt.c} +6 -2
  32. data/examples/benchmarks/{reg_detect.c → PolyBench/reg_detect.c} +3 -0
  33. data/examples/benchmarks/{seidel-2d.c → PolyBench/seidel-2d.c} +3 -0
  34. data/examples/benchmarks/{symm.c → PolyBench/symm.c} +3 -0
  35. data/examples/benchmarks/{syr2k.c → PolyBench/syr2k.c} +5 -2
  36. data/examples/benchmarks/{syrk.c → PolyBench/syrk.c} +7 -4
  37. data/examples/benchmarks/{trisolv.c → PolyBench/trisolv.c} +3 -0
  38. data/examples/benchmarks/{trmm.c → PolyBench/trmm.c} +3 -0
  39. data/examples/benchmarks/Rodinia/cfd.c +180 -0
  40. data/examples/benchmarks/Rodinia/hotspot.c +228 -0
  41. data/examples/benchmarks/Rodinia/kmeans.c +164 -0
  42. data/examples/benchmarks/Rodinia/srad.c +188 -0
  43. data/examples/benchmarks/other/common.h +0 -0
  44. data/examples/benchmarks/other/dct.c +58 -0
  45. data/examples/benchmarks/other/mm.c +50 -0
  46. data/examples/benchmarks/{saxpy.c → other/saxpy.c} +11 -7
  47. data/examples/chunk/{example1.c → example01.c} +0 -0
  48. data/examples/chunk/{example2.c → example02.c} +0 -0
  49. data/examples/chunk/{example3.c → example03.c} +0 -0
  50. data/examples/chunk/{example4.c → example04.c} +0 -0
  51. data/examples/chunk/{example5.c → example05.c} +0 -0
  52. data/examples/chunk/example06.c +45 -0
  53. data/examples/chunk/example07.c +49 -0
  54. data/examples/dependences/example01.c +42 -0
  55. data/examples/dependences/example02.c +40 -0
  56. data/examples/dependences/example03.c +43 -0
  57. data/examples/dependences/example04.c +44 -0
  58. data/examples/dependences/example05.c +42 -0
  59. data/examples/element/{example1.c → example01.c} +0 -0
  60. data/examples/element/{example2.c → example02.c} +2 -2
  61. data/examples/element/{example3.c → example03.c} +0 -0
  62. data/examples/element/{example4.c → example04.c} +0 -0
  63. data/examples/element/{example5.c → example05.c} +0 -0
  64. data/examples/element/{example6.c → example06.c} +0 -0
  65. data/examples/element/{example7.c → example07.c} +0 -0
  66. data/examples/element/{example8.c → example08.c} +0 -0
  67. data/examples/element/{example9.c → example09.c} +0 -0
  68. data/examples/element/example13.c +73 -0
  69. data/examples/fusion/example01.c +68 -0
  70. data/examples/fusion/example02.c +73 -0
  71. data/examples/fusion/example03.c +72 -0
  72. data/examples/fusion/example04.c +61 -0
  73. data/examples/fusion/example05.c +55 -0
  74. data/examples/neighbourhood/{example1.c → example01.c} +0 -0
  75. data/examples/neighbourhood/{example2.c → example02.c} +0 -0
  76. data/examples/neighbourhood/{example3.c → example03.c} +0 -0
  77. data/examples/neighbourhood/{example4.c → example04.c} +0 -0
  78. data/examples/neighbourhood/example05.c +44 -0
  79. data/examples/shared/{example1.c → example01.c} +0 -0
  80. data/examples/shared/{example2.c → example02.c} +0 -0
  81. data/examples/shared/{example3.c → example03.c} +0 -0
  82. data/examples/shared/{example4.c → example04.c} +0 -0
  83. data/examples/shared/{example5.c → example05.c} +0 -0
  84. data/lib/adarwin.rb +62 -0
  85. data/lib/adarwin/dependences.rb +268 -0
  86. data/lib/adarwin/engine.rb +277 -0
  87. data/lib/adarwin/fusion.rb +174 -0
  88. data/lib/adarwin/interval.rb +57 -0
  89. data/lib/adarwin/memorycopies.rb +153 -0
  90. data/lib/adarwin/nest.rb +225 -0
  91. data/lib/adarwin/preprocessor.rb +76 -0
  92. data/lib/adarwin/reference.rb +261 -0
  93. data/lib/bones.rb +4 -55
  94. data/lib/bones/algorithm.rb +77 -40
  95. data/lib/bones/copy.rb +26 -0
  96. data/lib/bones/engine.rb +147 -31
  97. data/lib/bones/preprocessor.rb +92 -12
  98. data/lib/bones/species.rb +4 -3
  99. data/lib/bones/structure.rb +14 -4
  100. data/lib/castaddon.rb +11 -6
  101. data/lib/castaddon/node_adarwin.rb +245 -0
  102. data/lib/castaddon/node_bones.rb +316 -0
  103. data/lib/castaddon/node_common.rb +289 -0
  104. data/lib/castaddon/transformations.rb +236 -0
  105. data/lib/common.rb +216 -0
  106. data/skeletons/CPU-C/common/header.c +3 -0
  107. data/skeletons/CPU-C/common/mem_global.c +0 -0
  108. data/skeletons/CPU-C/common/timer_2_start.c +11 -13
  109. data/skeletons/CPU-C/common/timer_2_stop.c +1 -1
  110. data/skeletons/CPU-C/common/timer_globals.c +29 -0
  111. data/skeletons/CPU-OPENCL-INTEL/common/globals.c +1 -1
  112. data/skeletons/CPU-OPENCL-INTEL/common/header.c +3 -0
  113. data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_D2H.c +7 -2
  114. data/skeletons/CPU-OPENCL-INTEL/common/mem_copy_H2D.c +4 -2
  115. data/skeletons/CPU-OPENCL-INTEL/common/mem_global.c +0 -0
  116. data/skeletons/CPU-OPENCL-INTEL/common/mem_prologue.c +6 -3
  117. data/skeletons/CPU-OPENCL-INTEL/common/timer_2_stop.c +1 -1
  118. data/skeletons/CPU-OPENCL-INTEL/common/timer_globals.c +24 -0
  119. data/skeletons/CPU-OPENMP/common/globals.c +1 -0
  120. data/skeletons/CPU-OPENMP/common/header.c +3 -0
  121. data/skeletons/CPU-OPENMP/common/mem_global.c +0 -0
  122. data/skeletons/CPU-OPENMP/common/timer_1_start.c +0 -12
  123. data/skeletons/CPU-OPENMP/common/timer_2_stop.c +1 -1
  124. data/skeletons/CPU-OPENMP/common/timer_globals.c +33 -0
  125. data/skeletons/GPU-CUDA/common/globals.c +27 -3
  126. data/skeletons/GPU-CUDA/common/header.c +2 -0
  127. data/skeletons/GPU-CUDA/common/mem_async_alloc.c +6 -0
  128. data/skeletons/GPU-CUDA/common/mem_async_copyin.c +6 -0
  129. data/skeletons/GPU-CUDA/common/mem_async_copyout.c +6 -0
  130. data/skeletons/GPU-CUDA/common/mem_async_free.c +6 -0
  131. data/skeletons/GPU-CUDA/common/mem_copy_D2H.c +2 -1
  132. data/skeletons/GPU-CUDA/common/mem_copy_H2D.c +2 -1
  133. data/skeletons/GPU-CUDA/common/mem_global.c +1 -0
  134. data/skeletons/GPU-CUDA/common/mem_prologue.c +1 -2
  135. data/skeletons/GPU-CUDA/common/scheduler.c +86 -0
  136. data/skeletons/GPU-CUDA/common/timer_2_start.c +2 -4
  137. data/skeletons/GPU-CUDA/common/timer_2_stop.c +3 -5
  138. data/skeletons/GPU-CUDA/common/timer_globals.c +26 -0
  139. data/skeletons/GPU-CUDA/kernel/2xN-N-chunk-1-N-to-D-element.kernel.cu +5 -7
  140. data/skeletons/GPU-CUDA/kernel/N-N-chunk-1-N-to-D-element.kernel.cu +4 -6
  141. data/skeletons/GPU-CUDA/kernel/default.host.c +1 -1
  142. data/skeletons/GPU-CUDA/kernel/default.kernel.cu +6 -8
  143. data/skeletons/GPU-CUDA/skeletons.txt +6 -5
  144. data/{examples/benchmarks/2mm.c → test/examples/benchmarks/PolyBench/2mm_species.c} +19 -15
  145. data/test/examples/benchmarks/PolyBench/3mm_species.c +82 -0
  146. data/test/examples/benchmarks/PolyBench/adi_species.c +89 -0
  147. data/test/examples/benchmarks/PolyBench/atax_species.c +69 -0
  148. data/test/examples/benchmarks/PolyBench/bicg_species.c +71 -0
  149. data/test/examples/benchmarks/PolyBench/cholesky_species.c +68 -0
  150. data/test/examples/benchmarks/PolyBench/correlation_species.c +97 -0
  151. data/test/examples/benchmarks/PolyBench/covariance_species.c +78 -0
  152. data/test/examples/benchmarks/PolyBench/doitgen_species.c +67 -0
  153. data/test/examples/benchmarks/PolyBench/durbin_species.c +80 -0
  154. data/test/examples/benchmarks/PolyBench/dynprog_species.c +71 -0
  155. data/test/examples/benchmarks/PolyBench/fdtd-2d-apml_species.c +112 -0
  156. data/test/examples/benchmarks/PolyBench/fdtd-2d_species.c +78 -0
  157. data/test/examples/benchmarks/PolyBench/floyd-warshall_species.c +54 -0
  158. data/test/examples/benchmarks/PolyBench/gemm_species.c +73 -0
  159. data/test/examples/benchmarks/PolyBench/gemver_species.c +93 -0
  160. data/test/examples/benchmarks/PolyBench/gesummv_species.c +68 -0
  161. data/test/examples/benchmarks/PolyBench/gramschmidt_species.c +78 -0
  162. data/test/examples/benchmarks/PolyBench/jacobi-1d-imper_species.c +59 -0
  163. data/test/examples/benchmarks/PolyBench/jacobi-2d-imper_species.c +65 -0
  164. data/test/examples/benchmarks/PolyBench/lu_species.c +57 -0
  165. data/test/examples/benchmarks/PolyBench/ludcmp_species.c +89 -0
  166. data/test/examples/benchmarks/PolyBench/mvt_species.c +69 -0
  167. data/test/examples/benchmarks/PolyBench/reg_detect_species.c +86 -0
  168. data/test/examples/benchmarks/PolyBench/seidel-2d_species.c +53 -0
  169. data/test/examples/benchmarks/PolyBench/symm_species.c +74 -0
  170. data/test/examples/benchmarks/PolyBench/syr2k_species.c +69 -0
  171. data/test/examples/benchmarks/PolyBench/syrk_species.c +66 -0
  172. data/test/examples/benchmarks/PolyBench/trisolv_species.c +61 -0
  173. data/test/examples/benchmarks/PolyBench/trmm_species.c +61 -0
  174. data/test/examples/chunk/example01_species.c +58 -0
  175. data/test/examples/chunk/example02_species.c +48 -0
  176. data/test/examples/chunk/example03_species.c +63 -0
  177. data/test/examples/chunk/example04_species.c +58 -0
  178. data/test/examples/chunk/example05_species.c +56 -0
  179. data/test/examples/chunk/example06_species.c +49 -0
  180. data/test/examples/chunk/example07_species.c +53 -0
  181. data/test/examples/dependences/example01_species.c +46 -0
  182. data/test/examples/dependences/example02_species.c +44 -0
  183. data/test/examples/dependences/example03_species.c +47 -0
  184. data/test/examples/dependences/example04_species.c +48 -0
  185. data/test/examples/dependences/example05_species.c +46 -0
  186. data/test/examples/element/example01_species.c +50 -0
  187. data/test/examples/element/example02_species.c +50 -0
  188. data/test/examples/element/example03_species.c +62 -0
  189. data/test/examples/element/example04_species.c +53 -0
  190. data/test/examples/element/example05_species.c +59 -0
  191. data/test/examples/element/example06_species.c +50 -0
  192. data/test/examples/element/example07_species.c +58 -0
  193. data/test/examples/element/example08_species.c +49 -0
  194. data/test/examples/element/example09_species.c +52 -0
  195. data/test/examples/element/example10_species.c +54 -0
  196. data/test/examples/element/example11_species.c +51 -0
  197. data/test/examples/element/example12_species.c +60 -0
  198. data/test/examples/element/example13_species.c +77 -0
  199. data/test/examples/neighbourhood/example01_species.c +57 -0
  200. data/test/examples/neighbourhood/example02_species.c +56 -0
  201. data/test/examples/neighbourhood/example03_species.c +83 -0
  202. data/test/examples/neighbourhood/example04_species.c +55 -0
  203. data/test/examples/neighbourhood/example05_species.c +48 -0
  204. data/test/examples/shared/example01_species.c +49 -0
  205. data/test/examples/shared/example02_species.c +55 -0
  206. data/test/examples/shared/example03_species.c +59 -0
  207. data/test/examples/shared/example04_species.c +56 -0
  208. data/test/examples/shared/example05_species.c +52 -0
  209. metadata +193 -73
  210. data/examples/benchmarks/overview.txt +0 -38
  211. data/lib/castaddon/node.rb +0 -753
data/lib/bones/copy.rb ADDED
@@ -0,0 +1,26 @@
1
+
2
+ module Bones
3
+
4
+ # Class copyin/out
5
+ class Copy
6
+ attr_accessor :name, :domain, :deadline, :direction, :id
7
+
8
+ def initialize(name,domain,deadline,direction,id)
9
+ @name = name
10
+ @domain = domain
11
+ @deadline = deadline
12
+ @direction = direction
13
+ @id = id
14
+ end
15
+
16
+ def get_definition(array_definition,type)
17
+ array_definition = '' if type == 'free' || type == 'alloc'
18
+ 'void bones_'+type+'_'+@id+'_'+@name+'('+array_definition+');'
19
+ end
20
+
21
+ def get_function_call(type)
22
+ 'bones_'+type+'_'+@id+'_'+@name+'();'
23
+ end
24
+ end
25
+
26
+ end
data/lib/bones/engine.rb CHANGED
@@ -17,13 +17,17 @@ module Bones
17
17
  # A list of timer files to be found in the skeleton library.
18
18
  TIMER_FILES = ['timer_1_start','timer_1_stop','timer_2_start','timer_2_stop']
19
19
  # A list of files to be found in the common directory of the skeleton library (excluding timer files).
20
- COMMON_FILES = ['prologue','epilogue','mem_prologue','mem_copy_H2D','mem_copy_D2H','mem_epilogue']
20
+ COMMON_FILES = ['prologue','epilogue','mem_prologue','mem_copy_H2D','mem_copy_D2H','mem_epilogue','mem_global']
21
21
  # The name of the file containing the globals as found in the skeleton library
22
22
  COMMON_GLOBALS = 'globals'
23
23
  # The name of the file containing the header file for the original C code as found in the skeleton library
24
24
  COMMON_HEADER = 'header'
25
25
  # The name of the file containing the globals for the kernel files as found in the skeleton library
26
26
  COMMON_GLOBALS_KERNEL = 'globals_kernel'
27
+ # The name of the file containing the scheduler code
28
+ COMMON_SCHEDULER = 'scheduler'
29
+ # Global timers
30
+ GLOBAL_TIMERS = 'timer_globals'
27
31
 
28
32
  # The extension of a host file in the skeleton library. See also SKELETON_DEVICE.
29
33
  SKELETON_HOST = '.host'
@@ -54,13 +58,15 @@ module Bones
54
58
  # --help, -h: Show this message
55
59
  #
56
60
  def initialize
57
- @result = {:original_code => [],
58
- :header_code => [],
59
- :host_declarations => [],
60
- :host_code_lists => [],
61
- :algorithm_declarations => [],
62
- :algorithm_code_lists => [],
63
- :verify_code => []}
61
+ @result = {:original_code => [],
62
+ :header_code => [],
63
+ :host_declarations => [],
64
+ :host_code_lists => [],
65
+ :algorithm_declarations => [],
66
+ :algorithm_code_lists => [],
67
+ :verify_code => [],
68
+ :host_device_mem_globals => []}
69
+ @state = 0
64
70
 
65
71
  # Provides a list of possible targets (e.g. GPU-CUDA, 'CPU-OPENCL-INTEL').
66
72
  targets = []
@@ -86,6 +92,9 @@ module Bones
86
92
  opt :verify, 'Verify correctness of the generated code', :short => 'c', :default => false
87
93
  opt :only_alg_number, 'Only generate code for the x-th species (99 -> all)', :short => 'o', :type => Integer, :default => 99
88
94
  opt :merge_factor, 'Thread merge factor, default is 1 (==disabled)', :short => 'f', :type => Integer, :default => 1
95
+ opt :register_caching,'Enable register caching: 1:enabled (default), 0:disabled', :short => 'r', :type => Integer, :default => 1
96
+ opt :zero_copy ,'Enable OpenCL zero-copy: 1:enabled (default), 0:disabled', :short => 'z', :type => Integer, :default => 1
97
+ opt :skeletons ,'Enable non-default skeletons: 1:enabled (default), 0:disabled', :short => 's', :type => Integer, :default => 1
89
98
  end
90
99
  Trollop::die 'no input file supplied (use: --application)' if !@options[:application_given]
91
100
  Trollop::die 'no target supplied (use: --target)' if !@options[:target_given]
@@ -103,6 +112,12 @@ module Bones
103
112
  # Set a prefix for functions called from the original file but defined in a host file
104
113
  @prefix = (@options[:target] == 'GPU-CUDA') ? '' : ''
105
114
 
115
+ # Setting to include the scheduler (CUDA only)
116
+ @scheduler = (@options[:target] == 'GPU-CUDA') ? true : false
117
+
118
+ # Skip analyse passes for certain targets
119
+ @skiptarget = false #(@options[:target] == 'PAR4ALL') ? true : false
120
+
106
121
  # Set the location for the skeleton library
107
122
  @dir = {}
108
123
  @dir[:library] = File.join(BONES_DIR_SKELETONS,@options[:target])
@@ -125,7 +140,7 @@ module Bones
125
140
  def process
126
141
 
127
142
  # Run the preprocessor
128
- preprocessor = Bones::Preprocessor.new(@source,File.dirname(@options[:application]),@basename)
143
+ preprocessor = Bones::Preprocessor.new(@source,File.dirname(@options[:application]),@basename,@scheduler)
129
144
  preprocessor.process
130
145
  @result[:header_code] = preprocessor.header_code
131
146
  @result[:device_header] = preprocessor.device_header
@@ -137,11 +152,20 @@ module Bones
137
152
  parser.type_names << 'size_t'
138
153
  ast = parser.parse(preprocessor.target_code)
139
154
  ast.preprocess
155
+
156
+ # Add the scheduler's global code
157
+ if @scheduler
158
+ @result[:host_code_lists].push(File.read(File.join(@dir[:common_library],COMMON_SCHEDULER+@extension)))
159
+ end
140
160
 
141
161
  # Set the algorithm's skeleton and generate the global code
142
162
  one_time = true
143
163
  preprocessor.algorithms.each_with_index do |algorithm,algorithm_number|
144
164
  algorithm.species.set_skeleton(File.join(@dir[:library],SKELETON_FILE))
165
+ if @options[:skeletons] == 0
166
+ algorithm.species.skeleton_name = 'default'
167
+ algorithm.species.settings.gsub!('10','00').gsub!('20','00').gsub!('30','00')
168
+ end
145
169
  if algorithm.species.skeleton_name && one_time
146
170
  @result[:host_code_lists].push(File.read(File.join(@dir[:common_library],COMMON_GLOBALS+@extension)))
147
171
  @result[:algorithm_code_lists].push(File.read(File.join(@dir[:common_library],COMMON_GLOBALS_KERNEL+@extension)))
@@ -149,24 +173,60 @@ module Bones
149
173
  end
150
174
  end
151
175
 
152
- # Perform code generation
176
+ # Perform code generation (per-species code)
153
177
  @result[:original_code] = ast
178
+ arrays = []
154
179
  preprocessor.algorithms.each_with_index do |algorithm,algorithm_number|
155
180
  if @options[:only_alg_number] == 99 || algorithm_number == [@options[:only_alg_number],preprocessor.algorithms.length-1].min
156
181
  puts MESSAGE+'Starting code generation for algorithm "'+algorithm.name+'"'
157
182
  if algorithm.species.skeleton_name
158
183
  algorithm.merge_factor = @options[:merge_factor] if (@options[:target] == 'GPU-CUDA')
184
+ algorithm.register_caching_enabled = @options[:register_caching]
159
185
  algorithm.set_function(ast)
160
- algorithm.populate_variables(ast,preprocessor.defines)
186
+ algorithm.populate_variables(ast,preprocessor.defines) if !@skiptarget
161
187
  algorithm.populate_lists()
162
- algorithm.populate_hash()
188
+ algorithm.populate_hash() if !@skiptarget
163
189
  generate(algorithm)
164
190
  puts MESSAGE+'Code generated using the "'+algorithm.species.skeleton_name+'" skeleton'
191
+ arrays.concat(algorithm.arrays)
165
192
  else
166
193
  puts WARNING+'Skeleton "'+algorithm.species.name+'" not available'
167
194
  end
168
195
  end
169
196
  end
197
+
198
+ # Only if the scheduler is included
199
+ if @scheduler
200
+
201
+ # Perform code generation (sync statements)
202
+ @result[:host_declarations].push('void bones_synchronize(int bones_task_id);')
203
+
204
+ # Perform code generation (memory allocs)
205
+ allocs = []
206
+ preprocessor.copies.each do |copy|
207
+ if !allocs.include?(copy.name)
208
+ generate_memory('alloc',copy,arrays,0)
209
+ allocs << copy.name
210
+ end
211
+ end
212
+
213
+ # Perform code generation (memory copies)
214
+ preprocessor.copies.each_with_index do |copy,index|
215
+ #puts MESSAGE+'Generating copy code for array "'+copy.name+'"'
216
+ generate_memory('copy',copy,arrays,index)
217
+ end
218
+
219
+ # Perform code generation (memory frees)
220
+ frees = []
221
+ preprocessor.copies.each do |copy|
222
+ if !frees.include?(copy.name)
223
+ generate_memory('free',copy,arrays,0)
224
+ frees << copy.name
225
+ end
226
+ end
227
+
228
+ end
229
+
170
230
  end
171
231
 
172
232
  # This method writes the output code to files. It creates
@@ -202,7 +262,7 @@ module Bones
202
262
  end
203
263
  end
204
264
 
205
- # Populate the verification file4
265
+ # Populate the verification file
206
266
  if @options[:verify]
207
267
  File.open(File.join(directory,@options[:name]+OUTPUT_VERIFICATION+@extension),'w') do |verification|
208
268
  verification.puts @result[:header_code]
@@ -212,15 +272,22 @@ module Bones
212
272
  end
213
273
  end
214
274
 
215
- # Populate the target file
275
+ # Populate the target file (host)
216
276
  File.open(File.join(directory,@options[:name]+OUTPUT_HOST+@extension),'w') do |target|
277
+ target.puts '#include <cuda_runtime.h>'+NL if @options[:target] == 'GPU-CUDA'
278
+ target.puts "#define ZEROCOPY 0"+NL if @options[:zero_copy] == 0 && @options[:target] == 'CPU-OPENCL-INTEL'
279
+ target.puts "#define ZEROCOPY 1"+NL if @options[:zero_copy] == 1 && @options[:target] == 'CPU-OPENCL-INTEL'
217
280
  target.puts @result[:header_code]
218
- target.puts @result[:algorithm_declarations]
219
281
  target.puts
282
+ target.puts @result[:host_device_mem_globals]
283
+ target.puts
284
+ target.puts @result[:algorithm_declarations]
220
285
  target.puts @result[:host_code_lists]
286
+ target.puts
287
+ target.puts File.read(File.join(@dir[:common_library],GLOBAL_TIMERS+@extension))
221
288
  end
222
289
 
223
- # Populate the algorithm file
290
+ # Populate the algorithm file (device)
224
291
  File.open(File.join(directory,@options[:name]+OUTPUT_DEVICE+@algorithm_extension),'w') do |algorithm|
225
292
  algorithm.puts @result[:device_header]
226
293
  algorithm.puts @result[:algorithm_code_lists]
@@ -251,7 +318,7 @@ module Bones
251
318
  :device => File.read(file_name_device+@algorithm_extension)}
252
319
 
253
320
  # Perform the transformations on the algorithm's code
254
- algorithm.perform_transformations(algorithm.species.settings)
321
+ algorithm.perform_transformations(algorithm.species.settings) if !@skiptarget
255
322
 
256
323
  # Load the common skeletons from the skeleton library
257
324
  COMMON_FILES.each do |skeleton|
@@ -291,13 +358,19 @@ module Bones
291
358
  minihash = { :array => array.name,
292
359
  :type => array.type_name,
293
360
  :flatten => array.flatten,
294
- :variable_dimensions => array.size.join('*')}
361
+ :variable_dimensions => array.size.join('*'),
362
+ :state => @state.to_s}
363
+ @state += 1
295
364
 
296
365
  # Apply the mini-search-and-replace hash to create the memory allocations, memory copies (if input only), etc.
297
366
  processed[:mem_prologue] += search_and_replace(minihash,skeletons[:mem_prologue])
298
367
  processed[:mem_copy_H2D] += search_and_replace(minihash,skeletons[:mem_copy_H2D]) if array.input? || array.species.shared?
299
368
  processed[:mem_epilogue] += search_and_replace(minihash,skeletons[:mem_epilogue])
369
+
370
+ # Add the device declarations
371
+ @result[:host_device_mem_globals].push(search_and_replace(minihash,skeletons[:mem_global]))
300
372
  end
373
+
301
374
  # Iterate over all the array variables and create a mini-search-and-replace hash for each array (output arrays)
302
375
  algorithm.arrays.select(OUTPUT).each_with_index do |array, num_array|
303
376
  hash = algorithm.hash["out#{num_array}".to_sym]
@@ -305,7 +378,9 @@ module Bones
305
378
  :type => array.type_name,
306
379
  :flatten => array.flatten,
307
380
  :offset => '('+hash[:dimension0][:from]+')',
308
- :variable_dimensions => '('+hash[:dimensions]+')'}
381
+ :variable_dimensions => '('+hash[:dimensions]+')',
382
+ :state => @state.to_s}
383
+ @state += 1
309
384
 
310
385
  # Perform selective copy for arrays with 2 dimensions (uses a for-loop over the memory copies)
311
386
  if array.dimensions == 2 && @options[:target] == 'GPU-CUDA' && false
@@ -346,11 +421,17 @@ module Bones
346
421
  search_and_replace!(algorithm.hash,skeletons[:epilogue])
347
422
 
348
423
  # Construct the final host function, including the timers and memory copies
349
- host = skeletons[:prologue ] + skeletons[:timer_1_start] +
350
- processed[:mem_prologue ] + processed[:mem_copy_H2D ] +
351
- skeletons[:timer_2_start] + skeletons[:host ] + skeletons[:timer_2_stop ] +
352
- processed[:mem_copy_D2H ] + processed[:mem_epilogue ] +
353
- skeletons[:timer_1_stop ] + skeletons[:epilogue ]
424
+ if @scheduler
425
+ host = skeletons[:prologue ] +
426
+ skeletons[:timer_2_start] + skeletons[:host ] + skeletons[:timer_2_stop ] +
427
+ skeletons[:epilogue ]
428
+ else
429
+ host = skeletons[:prologue ] +
430
+ skeletons[:timer_1_start] + processed[:mem_prologue ] + processed[:mem_copy_H2D ] +
431
+ skeletons[:timer_2_start] + skeletons[:host ] + skeletons[:timer_2_stop ] +
432
+ processed[:mem_copy_D2H ] + processed[:mem_epilogue ] + skeletons[:timer_1_stop ] +
433
+ skeletons[:epilogue ]
434
+ end
354
435
 
355
436
  # Generate code to replace the original code, including verification code if specified by the option flag
356
437
  verify_skeleton = File.read(File.join(@dir[:verify_library],'verify_results.c'))
@@ -362,24 +443,59 @@ module Bones
362
443
  # Add a performance model to the original code
363
444
  #replacement_code.insert(0,algorithm.performance_model_code('model'))
364
445
 
365
- # Replace mallocs and frees in the original code with aligned memory allocations (only for CPU-OpenCL targets)
366
- if @options[:target] == 'CPU-OPENCL-INTEL'
367
- @result[:original_code].seach_and_replace_function_call(C::Variable.parse('malloc'),C::Variable.parse(VARIABLE_PREFIX+'malloc_128'))
368
- @result[:original_code].seach_and_replace_function_call(C::Variable.parse('free'),C::Variable.parse(VARIABLE_PREFIX+'free_128'))
446
+ # Replace mallocs and frees in the original code with aligned memory allocations (only for CPU-OpenCL targets with zero-copy)
447
+ if @options[:zero_copy] == 1 && @options[:target] == 'CPU-OPENCL-INTEL'
448
+ @result[:original_code].search_and_replace_function_call(C::Variable.parse('malloc'),C::Variable.parse(VARIABLE_PREFIX+'malloc_128'))
449
+ @result[:original_code].search_and_replace_function_call(C::Variable.parse('free'),C::Variable.parse(VARIABLE_PREFIX+'free_128'))
369
450
  end
370
451
 
371
452
  # Give the original main function a new name
372
- @result[:original_code].seach_and_replace_function_definition('main',VARIABLE_PREFIX+'main')
453
+ @result[:original_code].search_and_replace_function_definition('main',VARIABLE_PREFIX+'main')
373
454
 
374
455
  # Replace the original code with a function call to the newly generated code
375
- @result[:original_code].seach_and_replace_node(algorithm.code,replacement_code)
456
+ @result[:original_code].search_and_replace_node(algorithm.code,replacement_code)
376
457
 
377
458
  # The host code is generated, push the data to the output hashes
378
459
  accelerated_definition = 'void '+algorithm.name+'_accelerated('+algorithm.lists[:host_definition]+')'
379
- @result[:host_code_lists].push(@prefix+accelerated_definition+' {'+NL+host+NL+'}')
460
+ @result[:host_code_lists].push(@prefix+accelerated_definition+' {'+NL+host+NL+'}'+NL+NL)
380
461
  @result[:host_declarations].push(@prefix+accelerated_definition+';'+NL+@prefix+original_definition+';')
381
462
  end
382
463
 
464
+
465
+ def generate_memory(type,copy,arrays,index)
466
+
467
+ # Find the corresponding array
468
+ arrays.each do |array|
469
+ if array.name == copy.name && (array.direction == copy.direction || array.direction == INOUT)
470
+
471
+ # Load the skeleton from the skeleton library
472
+ type += copy.direction if type == 'copy'
473
+ skeleton = File.read(File.join(@dir[:common_library],'mem_async_'+type+@extension))
474
+
475
+ # Create the find-and-replace hash
476
+ minihash = { :array => copy.name,
477
+ :id => copy.id,
478
+ :index => index.to_s,
479
+ :direction => copy.direction,
480
+ :definition => array.definition,
481
+ :type => array.type_name,
482
+ :flatten => array.flatten,
483
+ :offset => '0',
484
+ :variable_dimensions => array.size.join('*'),
485
+ :state => copy.deadline}
486
+
487
+ # Instantiate the skeleton and add it to the final result
488
+ @result[:host_code_lists].push(search_and_replace(minihash,skeleton))
489
+
490
+ # Add a forward declaration of this function
491
+ @result[:host_declarations].push(copy.get_definition(array.definition,type))
492
+
493
+ # Done
494
+ return
495
+ end
496
+ end
497
+ end
498
+
383
499
  end
384
500
 
385
501
  end
@@ -9,7 +9,7 @@ module Bones
9
9
  # * +algorithms+ - An array of identified algorithms, each of class Bones::Algorithm.
10
10
  # * +target_code+ - The processed code containing no Bones directives nor other pre-processor directives (such as includes and defines).
11
11
  class Preprocessor < Common
12
- attr_reader :header_code, :algorithms, :target_code, :device_header, :defines
12
+ attr_reader :header_code, :algorithms, :target_code, :device_header, :defines, :scop, :copies
13
13
 
14
14
  # Denotes the start of an algorithmic species.
15
15
  IDENTIFIER = '#pragma species'
@@ -18,9 +18,22 @@ module Bones
18
18
  WHITESPACE = '\s*'
19
19
 
20
20
  # This directive denotes the start of an algorithm. It is based on the IDENTIFIER constant.
21
- PRIMITIVE_START = IDENTIFIER+' kernel'
21
+ SPECIES_START = IDENTIFIER+' kernel'
22
22
  # This directive denotes the end of an algorithm. It is based on the IDENTIFIER constant.
23
- PRIMITIVE_END = IDENTIFIER+' endkernel'
23
+ SPECIES_END = IDENTIFIER+' endkernel'
24
+
25
+ # Start of the scop
26
+ SCOP_START = '#pragma scop'
27
+ # Enf of the scop
28
+ SCOP_END = '#pragma endscop'
29
+
30
+ # Synchronise directive.
31
+ SYNC = IDENTIFIER+' sync'
32
+
33
+ # Copy in directive.
34
+ COPYIN = IDENTIFIER+ ' copyin'
35
+ # Copy out directive.
36
+ COPYOUT = IDENTIFIER+ ' copyout'
24
37
 
25
38
  # A regular expression captures a prefix in an algorithm (e.g. unordered/multiple).
26
39
  REGEXP_PREFIX = /^[a-z]+ /
@@ -31,16 +44,18 @@ module Bones
31
44
  # This is the method which initializes the preprocessor.
32
45
  # Initialization requires the target source code to process,
33
46
  # which is then set as the class variable +@source_code+.
34
- def initialize(source_code,directory,filename)
47
+ def initialize(source_code,directory,filename,scheduler)
35
48
  @source_code = source_code
36
- @target_code = ''
49
+ @target_code = []
37
50
  @header_code = ''
38
51
  @device_header = ''
39
52
  @directory = directory
40
53
  @filename = filename
41
54
  @algorithms = Array.new
55
+ @copies = Array.new
42
56
  @defines = {}
43
57
  @found_algorithms = 0
58
+ @scheduler = scheduler
44
59
  end
45
60
 
46
61
  # This is the method to perform the actual preprocessing.
@@ -51,6 +66,7 @@ module Bones
51
66
  algorithm_code = ''
52
67
  species = nil
53
68
  found = 0
69
+ alloc_index, free_index = 0, 0
54
70
 
55
71
  # Process the file line by line
56
72
  @source_code.each_line.with_index do |line,index|
@@ -71,7 +87,7 @@ module Bones
71
87
  @defines[match.first[0].to_sym] = match.first[1]
72
88
 
73
89
  # Found the start of algorithm marker
74
- elsif line =~ /^#{WHITESPACE}#{PRIMITIVE_START}/
90
+ elsif line =~ /^#{WHITESPACE}#{SPECIES_START}/
75
91
  if found == 0
76
92
  line = replace_defines(line,@defines)
77
93
  prefix, input, output = marker_to_algorithm(line)
@@ -80,28 +96,92 @@ module Bones
80
96
  @found_algorithms = @found_algorithms + 1
81
97
  end
82
98
  found = found + 1
99
+ #@target_code << "int bones_temp_species_start = '#{line.gsub(NL,'')}';"+NL
83
100
 
84
101
  # Found the end of algorithm marker
85
- elsif line =~ /^#{WHITESPACE}#{PRIMITIVE_END}/
102
+ elsif line =~ /^#{WHITESPACE}#{SPECIES_END}/
86
103
  if found == 1
87
- name = line.strip.scan(/^#{WHITESPACE}#{PRIMITIVE_END} (.+)/).join
104
+ name = line.strip.scan(/^#{WHITESPACE}#{SPECIES_END} (.+)/).join
88
105
  name = DEFAULT_NAME if name == ''
89
106
  @algorithms.push(Bones::Algorithm.new(name,@filename,index.to_s,species,algorithm_code))
90
107
  algorithm_code = ''
91
108
  end
92
109
  found = found - 1
110
+ #@target_code << "int bones_temp_species_end = '#{line.gsub(NL,'')}';"+NL
111
+
112
+ # Found a sync marker
113
+ elsif @scheduler && line =~ /^#{WHITESPACE}#{SYNC}/
114
+ sync = line.strip.scan(/^#{WHITESPACE}#{SYNC} (.+)/).join
115
+ @target_code << "bones_synchronize(#{sync});"+NL
116
+
117
+ # Found a copyin marker
118
+ elsif @scheduler && line =~ /^#{WHITESPACE}#{COPYIN}/
119
+ copies = line.strip.scan(/^#{WHITESPACE}#{COPYIN} (.+)/).join.split(WEDGE).map{ |c| c.strip }
120
+ copies.each_with_index do |copy,copynum|
121
+ name = copy.split('[').first
122
+ domain = copy.scan(/\[(.+)\]/).join.split(DIM_SEP)
123
+ deadline = copy.split('|').last
124
+ @copies.push(Bones::Copy.new(name,domain,deadline,'in',"#{index*100+copynum}"))
125
+ @target_code << "bones_copyin_#{index*100+copynum}_#{name}(#{name});"+NL
126
+ end
127
+
128
+ # Found a copyout marker
129
+ elsif @scheduler && line =~ /^#{WHITESPACE}#{COPYOUT}/
130
+ copies = line.strip.scan(/^#{WHITESPACE}#{COPYOUT} (.+)/).join.split(WEDGE).map{ |c| c.strip }
131
+ copies.each_with_index do |copy,copynum|
132
+ name = copy.split('[').first
133
+ domain = copy.scan(/\[(.+)\]/).join.split(DIM_SEP)
134
+ deadline = copy.split('|').last
135
+ @copies.push(Bones::Copy.new(name,domain,deadline,'out',"#{index*100+copynum}"))
136
+ @target_code << "bones_copyout_#{index*100+copynum}_#{name}(#{name});"+NL
137
+ end
138
+ end
139
+
140
+ # Check if it was a 'pragma scop' / 'pragma endscop' line
141
+ if line =~ /^#{WHITESPACE}#{SCOP_START}/
142
+ alloc_index = index
143
+ elsif line =~ /^#{WHITESPACE}#{SCOP_END}/
144
+ free_index = @target_code.length
93
145
  end
146
+
94
147
  else
95
148
  if found > 0
96
149
  algorithm_line = replace_defines(line,@defines)
97
- @target_code += algorithm_line
150
+ @target_code << algorithm_line
98
151
  algorithm_code += algorithm_line if line !~ /^#{WHITESPACE}#/
99
152
  else
100
- @target_code += line
153
+ @target_code << line
101
154
  end
102
155
  end
103
156
  end
104
- puts WARNING+'Begin/end kernel mismatch ('+@found_algorithms.to_s+' versus '+@algorithms.length.to_s+'), probably missing a "'+PRIMITIVE_END+'"' unless @algorithms.length == @found_algorithms
157
+ puts WARNING+'Begin/end kernel mismatch ('+@found_algorithms.to_s+' versus '+@algorithms.length.to_s+'), probably missing a "'+SPECIES_END+'"' unless @algorithms.length == @found_algorithms
158
+
159
+ # Add frees and mallocs
160
+ if @scheduler
161
+ alloc_code, free_code = '', ''
162
+ included_copies = []
163
+ copies.each do |copy|
164
+ if !included_copies.include?(copy.name)
165
+ alloc_code += copy.get_function_call('alloc')+NL
166
+ free_code += copy.get_function_call('free')+NL
167
+ included_copies << copy.name
168
+ end
169
+ end
170
+ end
171
+
172
+ # Add timers (whole scop timing) and frees/mallocs to the code
173
+ offset = @header_code.lines.count
174
+ @target_code.insert(alloc_index-offset, 'bones_timer_start();'+NL)
175
+ if @scheduler
176
+ @target_code.insert(alloc_index-offset+1, alloc_code)
177
+ @target_code.insert(free_index+2, free_code)
178
+ @target_code.insert(free_index+3, 'bones_timer_stop();'+NL)
179
+ else
180
+ @target_code.insert(free_index+2, 'bones_timer_stop();'+NL)
181
+ end
182
+
183
+ # Join the array
184
+ @target_code = @target_code.join('')
105
185
  end
106
186
 
107
187
  # This is the method to preprocess a header file. Currently,
@@ -143,7 +223,7 @@ module Bones
143
223
 
144
224
  # Method to extract the algorithm details from a marker found in code.
145
225
  def marker_to_algorithm(marker)
146
- algorithm = marker.strip.scan(/^#{WHITESPACE}#{PRIMITIVE_START} (.+)/).join
226
+ algorithm = marker.strip.scan(/^#{WHITESPACE}#{SPECIES_START} (.+)/).join
147
227
  prefix = ''
148
228
  if algorithm =~ REGEXP_PREFIX
149
229
  split = algorithm.partition(' ')