ikra 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (104) hide show
  1. checksums.yaml +4 -4
  2. data/lib/ast/builder.rb +225 -77
  3. data/lib/ast/host_section_builder.rb +38 -0
  4. data/lib/ast/interpreter.rb +67 -0
  5. data/lib/ast/lexical_variables_enumerator.rb +3 -2
  6. data/lib/ast/nodes.rb +521 -31
  7. data/lib/ast/printer.rb +116 -18
  8. data/lib/ast/ssa_generator.rb +192 -0
  9. data/lib/ast/visitor.rb +235 -21
  10. data/lib/config/configuration.rb +28 -3
  11. data/lib/config/os_configuration.rb +62 -9
  12. data/lib/cpu/cpu_implementation.rb +39 -0
  13. data/lib/ikra.rb +13 -3
  14. data/lib/resources/cuda/allocate_device_memory.cpp +5 -0
  15. data/lib/resources/cuda/allocate_host_memory.cpp +1 -0
  16. data/lib/resources/cuda/allocate_memcpy_environment_to_device.cpp +11 -0
  17. data/lib/resources/cuda/ast/assignment.cpp +1 -0
  18. data/lib/resources/cuda/block_function_head.cpp +7 -1
  19. data/lib/resources/cuda/entry_point.cpp +47 -0
  20. data/lib/resources/cuda/env_builder_copy_array.cpp +8 -2
  21. data/lib/resources/cuda/free_device_memory.cpp +3 -0
  22. data/lib/resources/cuda/free_memory_for_command.cpp +24 -0
  23. data/lib/resources/cuda/header.cpp +23 -9
  24. data/lib/resources/cuda/header_structs.cpp +92 -0
  25. data/lib/resources/cuda/host_section_block_function_head.cpp +12 -0
  26. data/lib/resources/cuda/host_section_entry_point.cpp +55 -0
  27. data/lib/resources/cuda/host_section_free_device_memory.cpp +18 -0
  28. data/lib/resources/cuda/host_section_launch_parallel_section.cpp +14 -0
  29. data/lib/resources/cuda/host_section_malloc_memcpy_device_to_host.cpp +10 -0
  30. data/lib/resources/cuda/kernel.cpp +9 -2
  31. data/lib/resources/cuda/launch_kernel.cpp +5 -0
  32. data/lib/resources/cuda/memcpy_device_to_host.cpp +3 -0
  33. data/lib/resources/cuda/memcpy_device_to_host_expr.cpp +10 -0
  34. data/lib/resources/cuda/reduce_body.cpp +88 -0
  35. data/lib/resources/cuda/stencil_array_reconstruction.cpp +2 -0
  36. data/lib/resources/cuda/stencil_body.cpp +16 -0
  37. data/lib/resources/cuda/struct_definition.cpp +4 -0
  38. data/lib/ruby_core/array.rb +34 -0
  39. data/lib/ruby_core/array_command.rb +313 -0
  40. data/lib/ruby_core/core.rb +103 -0
  41. data/lib/ruby_core/interpreter.rb +16 -0
  42. data/lib/ruby_core/math.rb +32 -0
  43. data/lib/ruby_core/ruby_integration.rb +256 -0
  44. data/lib/symbolic/host_section.rb +115 -0
  45. data/lib/symbolic/input.rb +87 -0
  46. data/lib/symbolic/input_visitor.rb +68 -0
  47. data/lib/symbolic/symbolic.rb +793 -117
  48. data/lib/symbolic/visitor.rb +70 -8
  49. data/lib/translator/array_command_struct_builder.rb +163 -0
  50. data/lib/translator/ast_translator.rb +572 -0
  51. data/lib/translator/block_translator.rb +104 -48
  52. data/lib/translator/commands/array_combine_command.rb +41 -0
  53. data/lib/translator/commands/array_identity_command.rb +28 -0
  54. data/lib/translator/commands/array_index_command.rb +52 -0
  55. data/lib/translator/commands/array_reduce_command.rb +135 -0
  56. data/lib/translator/commands/array_stencil_command.rb +129 -0
  57. data/lib/translator/commands/array_zip_command.rb +30 -0
  58. data/lib/translator/commands/command_translator.rb +264 -0
  59. data/lib/translator/cuda_errors.rb +32 -0
  60. data/lib/translator/environment_builder.rb +263 -0
  61. data/lib/translator/host_section/array_host_section_command.rb +150 -0
  62. data/lib/translator/host_section/array_in_host_section_command.rb +41 -0
  63. data/lib/translator/host_section/ast_translator.rb +14 -0
  64. data/lib/translator/host_section/parallel_section_invocation_visitor.rb +20 -0
  65. data/lib/translator/host_section/program_builder.rb +89 -0
  66. data/lib/translator/input_translator.rb +226 -0
  67. data/lib/translator/kernel_builder.rb +137 -0
  68. data/lib/translator/kernel_launcher/for_loop_kernel_launcher.rb +40 -0
  69. data/lib/translator/kernel_launcher/kernel_launcher.rb +259 -0
  70. data/lib/translator/kernel_launcher/while_loop_kernel_launcher.rb +38 -0
  71. data/lib/translator/last_returns_visitor.rb +19 -10
  72. data/lib/translator/program_builder.rb +197 -0
  73. data/lib/translator/program_launcher.rb +273 -0
  74. data/lib/translator/struct_type.rb +55 -0
  75. data/lib/translator/translator.rb +34 -11
  76. data/lib/translator/variable_classifier_visitor.rb +56 -0
  77. data/lib/types/inference/ast_inference.rb +586 -0
  78. data/lib/types/inference/clear_types_visitor.rb +11 -0
  79. data/lib/types/inference/command_inference.rb +101 -0
  80. data/lib/types/inference/input_inference.rb +62 -0
  81. data/lib/types/{object_tracer.rb → inference/object_tracer.rb} +5 -6
  82. data/lib/types/inference/ruby_extension.rb +35 -0
  83. data/lib/types/inference/symbol_table.rb +131 -0
  84. data/lib/types/types.rb +14 -0
  85. data/lib/types/types/array_command_type.rb +123 -0
  86. data/lib/types/types/array_type.rb +137 -0
  87. data/lib/types/{class_type.rb → types/class_type.rb} +42 -18
  88. data/lib/types/{primitive_type.rb → types/primitive_type.rb} +20 -7
  89. data/lib/types/types/ruby_type.rb +88 -0
  90. data/lib/types/types/struct_type.rb +179 -0
  91. data/lib/types/types/union_type.rb +239 -0
  92. metadata +160 -18
  93. data/lib/ast/method_definition.rb +0 -37
  94. data/lib/ast/translator.rb +0 -264
  95. data/lib/resources/cuda/kernel_launcher.cpp +0 -28
  96. data/lib/scope.rb +0 -166
  97. data/lib/translator/command_translator.rb +0 -421
  98. data/lib/translator/local_variables_enumerator.rb +0 -35
  99. data/lib/translator/method_translator.rb +0 -24
  100. data/lib/types/array_type.rb +0 -51
  101. data/lib/types/ruby_extension.rb +0 -67
  102. data/lib/types/ruby_type.rb +0 -45
  103. data/lib/types/type_inference.rb +0 -382
  104. data/lib/types/union_type.rb +0 -155
@@ -0,0 +1,40 @@
1
+ module Ikra
2
+ module Translator
3
+ class CommandTranslator
4
+ class ForLoopKernelLauncher < KernelLauncher
5
+ def initialize(
6
+ kernel_builder:,
7
+ from_expr: "0",
8
+ to_expr:,
9
+ var_name: "i",
10
+ before_loop: "")
11
+
12
+ super(kernel_builder)
13
+ @from_expr = from_expr
14
+ @to_expr = to_expr
15
+ @var_name = var_name
16
+ @before_loop = before_loop
17
+ end
18
+
19
+ attr_reader :from_expr
20
+ attr_reader :to_expr
21
+ attr_reader :var_name
22
+ attr_reader :before_loop
23
+
24
+ def build_kernel_launcher
25
+ Log.info("Building for-loop kernel launcher")
26
+
27
+ assert_ready_to_build
28
+
29
+ result = before_loop + "\n"
30
+ result = result + "for (int #{var_name} = #{from_expr}; #{var_name} < #{to_expr}; #{var_name} ++)\n{"
31
+
32
+ result = result + super
33
+ result = result + "\n}\n"
34
+
35
+ return result
36
+ end
37
+ end
38
+ end
39
+ end
40
+ end
@@ -0,0 +1,259 @@
1
+ module Ikra
2
+ module Translator
3
+ class CommandTranslator
4
+
5
+ # Builds the launch of the kernel. This class is responsible for generating the
6
+ # invocation of the kernel.
7
+ #
8
+ # For example:
9
+ # kernel<<<..., ...>>>(env, result, d_a, ...);
10
+ class KernelLauncher
11
+ class << self
12
+ # Debug flag only: Frees all input after launching kernel. This causes an
13
+ # error if data is used twice or kept (using the `keep` flag)
14
+ attr_accessor :debug_free_previous_input_immediately
15
+ end
16
+
17
+ attr_accessor :kernel_builder
18
+
19
+ # Additional parameters that this kernel should accept (to access the result
20
+ # of previous kernels)
21
+ attr_accessor :previous_kernel_input
22
+
23
+ # Additional arguments that are passed to the kernel on launch. Entries are strings
24
+ # or blocks that `prepare_additional_args_for_launch` evaluates to strings
25
+ attr_accessor :additional_arguments
26
+
27
+ # Number of threads (elements to be processed)
28
+ attr_accessor :num_threads
29
+
30
+ # Block/grid dimensions (should be 1D)
31
+ attr_accessor :grid_dim
32
+ attr_accessor :block_dim
33
+
34
+ # Whether the launch allocates new memory beforehand or uses previous memory
35
+ attr_accessor :reuse_memory
36
+
37
+ # Pointer to the resulting array (device memory)
38
+ attr_reader :kernel_result_var_name
39
+
40
+ # IDs and types of commands whose results are kept on the GPU
41
+ attr_accessor :cached_results
42
+
43
+ # IDs and types of commands that were previously computed and shall now be used in this kernel as input
44
+ attr_reader :previously_cached_results
45
+
46
+ def initialize(kernel_builder)
47
+ @kernel_builder = kernel_builder
48
+ @additional_arguments = []
49
+ @previous_kernel_input = []
50
+ @reuse_memory = false
51
+ @kernel_result_var_name = "_kernel_result_" + CommandTranslator.next_unique_id.to_s
52
+ @cached_results = {}
53
+ @previously_cached_results = {}
54
+ end
55
+
56
+ # Some of the values stored in `@additional_arguments` might be blocks, because
57
+ # not all information was known when adding something to that list. This method
58
+ # replaces those blocks (evaluates them) with actual strings, based on the command
59
+ # that is being launched.
60
+ def prepare_additional_args_for_launch(command)
61
+ @additional_arguments = @additional_arguments.map do |arg|
62
+ if arg.is_a?(String)
63
+ arg
64
+ else
65
+ arg.call(command)
66
+ end
67
+ end
68
+ end
69
+
70
+ def kernel_builders
71
+ # The program builder accesses kernel builders via kernel launchers through
72
+ # this method, because some specialized launchers might have multiple kernel
73
+ # builders.
74
+ return [kernel_builder]
75
+ end
76
+
77
+ # Adds command whose result will be kept on GPU
78
+ def add_cached_result(result_id, type)
79
+ @cached_results[result_id] = type
80
+ end
81
+
82
+ # Adds a previously computed result which will be used in this launch as input
83
+ def use_cached_result(result_id, type)
84
+ @previously_cached_results[result_id] = type
85
+ end
86
+
87
+ def reuse_memory!(parameter_name)
88
+ @reuse_memory = true
89
+ @kernel_result_var_name = parameter_name
90
+ end
91
+
92
+ def reuse_memory?
93
+ return @reuse_memory
94
+ end
95
+
96
+ def add_previous_kernel_parameter(parameter)
97
+ kernel_builder.add_previous_kernel_parameter(parameter)
98
+ end
99
+
100
+ # Add additional arguments to the kernel function that might be needed for some computations
101
+ def add_additional_arguments(*arguments)
102
+ @additional_arguments.push(*arguments)
103
+ end
104
+
105
+ # The result type of this kernel launcher. Same as the result type of its kernel
106
+ # builder.
107
+ def result_type
108
+ return kernel_builder.result_type
109
+ end
110
+
111
+ # The size of the result array is the number of threads.
112
+ def result_size
113
+ return num_threads
114
+ end
115
+
116
+ # Configures grid size and block size. Also sets number of threads.
117
+ def configure_grid(size, block_size: 256)
118
+ if block_size == nil
119
+ block_size = 256
120
+ end
121
+
122
+ if size.is_a?(Fixnum)
123
+ # Precompute constants
124
+ @grid_dim = [size.fdiv(block_size).ceil, 1].max.to_s
125
+ @block_dim = (size >= block_size ? block_size : size).to_s
126
+ @num_threads = size
127
+ else
128
+ if !size.is_a?(String)
129
+ raise AssertionError.new("Fixnum or String expected")
130
+ end
131
+
132
+ # Source code string determines the size
133
+ @grid_dim = "max((int) ceil(((float) #{size}) / #{block_size}), 1)"
134
+ @block_dim = "(#{size} >= #{block_size} ? #{block_size} : #{size})"
135
+ @num_threads = size
136
+ end
137
+ end
138
+
139
+ def assert_ready_to_build
140
+ required_values = [:num_threads, :grid_dim, :block_dim]
141
+
142
+ for selector in required_values
143
+ if send(selector) == nil
144
+ raise AssertionError.new(
145
+ "Not ready to build (KernelBuilder): #{selector} is not set")
146
+ end
147
+ end
148
+ end
149
+
150
+
151
+ # Build the code that launches this kernel. The generated code performs the
152
+ # following steps:
153
+ #
154
+ # 1. Allocate device memory for the result.
155
+ # 2. If result should be written back: Allocate host memory for the result.
156
+ # 3. Launch the kernel (+ error checking, synchronization)
157
+ # 4. If result should be written back: Copy result back to host memory.
158
+ def build_kernel_launcher
159
+
160
+ Log.info("Building kernel launcher")
161
+
162
+ assert_ready_to_build
163
+
164
+ result = ""
165
+ if !reuse_memory
166
+ # Allocate device memory for kernel result
167
+ result = result + Translator.read_file(file_name: "allocate_device_memory.cpp", replacements: {
168
+ "name" => kernel_result_var_name,
169
+ "bytes" => "(sizeof(#{kernel_builder.result_type.to_c_type}) * #{num_threads})",
170
+ "type" => kernel_builder.result_type.to_c_type})
171
+ end
172
+
173
+ previously_cached_results.each do |result_id, type|
174
+ result = result + " #{type.to_c_type} *prev_" + result_id.to_s + " = (#{type.to_c_type} *) " + Constants::ENV_HOST_IDENTIFIER + "->prev_" + result_id.to_s + ";\n"
175
+ end
176
+
177
+ # Allocate device memory for cached results
178
+ cached_results.each do |result_id, type|
179
+ result = result + Translator.read_file(file_name: "allocate_device_memory.cpp", replacements: {
180
+ "name" => Constants::RESULT_IDENTIFIER + result_id,
181
+ "bytes" => "(#{type.c_size} * #{num_threads})",
182
+ "type" => type.to_c_type})
183
+ end
184
+
185
+ # Build arguments
186
+ a_env = Constants::ENV_DEVICE_IDENTIFIER
187
+ a_result = kernel_result_var_name
188
+
189
+ previous_kernel_args = []
190
+ for var in kernel_builder.previous_kernel_input
191
+ previous_kernel_args.push(var.name.to_s)
192
+ end
193
+
194
+ a_cached_results = cached_results.map do |result_id, type|
195
+ Constants::RESULT_IDENTIFIER + result_id
196
+ end
197
+
198
+ if reuse_memory
199
+ previous_kernel_args[0] = a_result
200
+ end
201
+
202
+ arguments = ([a_env, num_threads, a_result] + a_cached_results + previous_kernel_args + additional_arguments).join(", ")
203
+
204
+ # Launch kernel
205
+ result = result + Translator.read_file(file_name: "launch_kernel.cpp", replacements: {
206
+ "kernel_name" => kernel_builder.kernel_name,
207
+ "arguments" => arguments,
208
+ "grid_dim" => grid_dim,
209
+ "block_dim" => block_dim})
210
+
211
+ # ---- DEBUG ONLY: Free input after computation so that we can process larger
212
+ # data sets in benchmarks without running out of memory
213
+ # TODO: Implement analysis and do this automatically
214
+ if KernelLauncher.debug_free_previous_input_immediately == true
215
+ for var in kernel_builder.previous_kernel_input
216
+ result = result + Translator.read_file(file_name: "free_device_memory.cpp", replacements: {
217
+ "name" => var.name.to_s})
218
+ end
219
+ end
220
+ # ---- END DEBUG ONLY
221
+
222
+ cached_results.each do |result_id, type|
223
+ result = result + " " + Constants::ENV_HOST_IDENTIFIER + "->prev_" + result_id + " = " + Constants::RESULT_IDENTIFIER + result_id + ";\n"
224
+ end
225
+
226
+ return result
227
+ end
228
+
229
+ def build_device_memory_free
230
+ Log.info("Building kernel post-launch CUDA free")
231
+
232
+ assert_ready_to_build
233
+
234
+ if KernelLauncher.debug_free_previous_input_immediately == true
235
+ Log.warn("Debug flag set... Freeing input memory immediately and some memory not at all!")
236
+ return ""
237
+ end
238
+
239
+ return Translator.read_file(file_name: "free_device_memory.cpp", replacements: {
240
+ "name" => kernel_result_var_name})
241
+ end
242
+
243
+ # Same as above, but also removes item from the list of allocated memory chunks.
244
+ def build_device_memory_free_in_host_section
245
+ Log.info("Building kernel post-launch CUDA free (host section")
246
+
247
+ assert_ready_to_build
248
+
249
+ return Translator.read_file(file_name: "host_section_free_device_memory.cpp", replacements: {
250
+ "name" => kernel_result_var_name})
251
+ end
252
+ end
253
+ end
254
+ end
255
+ end
256
+
257
+ require_relative "../kernel_builder"
258
+ require_relative "for_loop_kernel_launcher"
259
+ require_relative "while_loop_kernel_launcher"
@@ -0,0 +1,38 @@
1
+ module Ikra
2
+ module Translator
3
+ class CommandTranslator
4
+ class WhileLoopKernelLauncher < KernelLauncher
5
+ def initialize(
6
+ kernel_builder:,
7
+ condition:,
8
+ before_loop: "",
9
+ post_iteration: "")
10
+
11
+ super(kernel_builder)
12
+ @condition = condition
13
+ @before_loop = before_loop
14
+ @post_iteration = post_iteration
15
+ end
16
+
17
+ attr_reader :condition
18
+ attr_reader :before_loop
19
+ attr_reader :post_iteration
20
+
21
+ def build_kernel_launcher
22
+ Log.info("Building for-loop kernel launcher")
23
+
24
+ assert_ready_to_build
25
+
26
+ result = ""
27
+ result = result + before_loop + "\n"
28
+ result = result + "while (#{condition}) {\n"
29
+ result = result + super
30
+ result = result + "\n" + post_iteration
31
+ result = result + "\n}\n"
32
+
33
+ return result
34
+ end
35
+ end
36
+ end
37
+ end
38
+ end
@@ -3,37 +3,42 @@ require_relative "../ast/visitor"
3
3
 
4
4
  module Ikra
5
5
  module Translator
6
+ # Visitor that replaces implicit returns with explicit ones
6
7
  class LastStatementReturnsVisitor < AST::Visitor
7
8
  def visit_root_node(node)
8
- node.child.accept(self)
9
+ node.single_child.accept(self)
9
10
  end
10
11
 
11
12
  def visit_lvar_read_node(node)
12
- node.parent.replace_child(node, AST::ReturnNode.new(value: node))
13
+ process_node(node)
13
14
  end
14
15
 
15
16
  def visit_lvar_write_node(node)
16
- node.parent.replace_child(node, AST::ReturnNode.new(value: node))
17
+ process_node(node)
17
18
  end
18
19
 
19
20
  def visit_int_node(node)
20
- node.parent.replace_child(node, AST::ReturnNode.new(value: node))
21
+ process_node(node)
21
22
  end
22
23
 
23
24
  def visit_float_node(node)
24
- node.parent.replace_child(node, AST::ReturnNode.new(value: node))
25
+ process_node(node)
25
26
  end
26
27
 
27
28
  def visit_bool_node(node)
28
- node.parent.replace_child(node, AST::ReturnNode.new(value: node))
29
+ process_node(node)
30
+ end
31
+
32
+ def visit_nil_node(node)
33
+ process_node(node)
29
34
  end
30
35
 
31
36
  def visit_for_node(node)
32
- raise "Cannot handle for loop as return value"
37
+ raise NotImplementedError.new("Cannot handle for loop as return value")
33
38
  end
34
39
 
35
40
  def visit_break_node(node)
36
- raise "Break must not be a return value"
41
+ raise AssertionError.new("Break must not be a return value")
37
42
  end
38
43
 
39
44
  def visit_if_node(node)
@@ -46,11 +51,15 @@ module Ikra
46
51
  end
47
52
 
48
53
  def visit_send_node(node)
49
- node.parent.replace_child(node, AST::ReturnNode.new(value: node))
54
+ process_node(node)
50
55
  end
51
56
 
52
57
  def visit_return_node(node)
53
- raise "Function returns already"
58
+ # Do nothing
59
+ end
60
+
61
+ def process_node(node)
62
+ node.parent.replace_child(node, AST::ReturnNode.new(value: node))
54
63
  end
55
64
  end
56
65
  end
@@ -0,0 +1,197 @@
1
+ require "tempfile"
2
+ require "set"
3
+
4
+ module Ikra
5
+ module Translator
6
+ class CommandTranslator
7
+
8
+ # Builds the entire CUDA program. A CUDA program may consist of multiple kernels, but
9
+ # has at least one kernel. The generated code performs the following steps:
10
+ #
11
+ # 1. Insert header of CUDA file.
12
+ # 2. For every kernel: Build all methods, blocks, and kernels.
13
+ # 3. Build the program entry point (including kernel launchers).
14
+ class ProgramBuilder
15
+ attr_reader :environment_builder
16
+ attr_reader :kernel_launchers
17
+ attr_reader :kernels
18
+ attr_reader :root_command
19
+
20
+ # A set of struct definitions ([Types::StructType] instances) that should be
21
+ # generated for this program.
22
+ attr_reader :structs
23
+
24
+ # A set of array command structs.
25
+ attr_reader :array_command_structs
26
+
27
+ def initialize(environment_builder:, root_command:)
28
+ @kernel_launchers = []
29
+ @kernels = Set.new([])
30
+ @environment_builder = environment_builder
31
+ @root_command = root_command
32
+
33
+ # The collection of structs is a [Set]. Struct types are unique, i.e., there
34
+ # are never two equal struct types with different object identity.
35
+ @structs = Set.new
36
+ @array_command_structs = Set.new
37
+ end
38
+
39
+ def add_array_command_struct(*structs)
40
+ for struct in structs
41
+ array_command_structs.add(struct)
42
+ end
43
+ end
44
+
45
+ def add_kernel_launcher(launcher)
46
+ @kernel_launchers.push(launcher)
47
+ end
48
+
49
+ # Generates the source code for the CUDA program, compiles it with nvcc and
50
+ # executes the program.
51
+ def execute
52
+ source = build_program
53
+
54
+ launcher = Launcher.new(
55
+ source: source,
56
+ environment_builder: environment_builder,
57
+ result_type: result_type,
58
+ root_command: root_command)
59
+
60
+ launcher.compile
61
+ return launcher.execute
62
+ end
63
+
64
+ # Build kernel invocations
65
+ def build_kernel_launchers
66
+ return kernel_launchers.map do |launcher|
67
+ launcher.build_kernel_launcher
68
+ end.join("")
69
+ end
70
+
71
+ protected
72
+
73
+ def assert_ready_to_build
74
+ if kernel_launchers.size == 0
75
+ raise AssertionError.new(
76
+ "Not ready to build (ProgramBuilder): No kernel launcher defined")
77
+ end
78
+ end
79
+
80
+ # Build header of CUDA source code file
81
+ def build_header
82
+ return Translator.read_file(file_name: "header.cpp")
83
+ end
84
+
85
+ # Build environment struct definition
86
+ def build_environment_struct
87
+ return environment_builder.build_environment_struct
88
+ end
89
+
90
+ # Generate all struct types (except for array command struct types).
91
+ def build_struct_types
92
+ return structs.map do |struct_type|
93
+ struct_type.generate_definition
94
+ end.join("\n") + "\n"
95
+ end
96
+
97
+ def build_array_command_struct_types
98
+ return array_command_structs.to_a.join("\n") + "\n"
99
+ end
100
+
101
+ def all_kernel_builders
102
+ return kernel_launchers.map do |launcher|
103
+ launcher.kernel_builders
104
+ end.flatten
105
+ end
106
+
107
+ # Build methods, blocks and kernels
108
+ def build_kernels
109
+ result = ""
110
+
111
+ for builder in all_kernel_builders
112
+ # Check whether kernel was already built before
113
+ if kernels.include?(builder)
114
+ next
115
+ else
116
+ kernels.add(builder)
117
+ end
118
+
119
+ result = result + builder.build_methods
120
+ result = result + builder.build_blocks
121
+ result = result + builder.build_kernel
122
+ end
123
+
124
+ return result
125
+ end
126
+
127
+ def host_result_expression
128
+ # Read some fields from last kernel launch configuration
129
+ result_device_ptr = kernel_launchers.last.kernel_result_var_name
130
+ result_c_type = kernel_launchers.last.result_type.to_c_type
131
+ result_size = root_command.size
132
+
133
+ if result_device_ptr == nil
134
+ raise AssertionError.new(
135
+ "Result variable name of final kernel launcher not set")
136
+ end
137
+
138
+ # Build result values: `variable_size_array_t` struct. This struct contains a
139
+ # pointer to the result array and stores the size of the result.
140
+ result_device_variable_array_t = "variable_size_array_t((void *) #{result_device_ptr}, #{result_size})"
141
+
142
+ return Translator.read_file(file_name: "memcpy_device_to_host_expr.cpp", replacements: {
143
+ "type" => result_c_type,
144
+ "device_array" => result_device_variable_array_t})
145
+ end
146
+
147
+ # Returns the result type of this program. The result type must always be a
148
+ # union type that includes a [Types::LocationAwareArrayType] object,
149
+ # because this way we can support return types where the inner type of an array
150
+ # is unknown at compile time.
151
+ def result_type
152
+ return Types::LocationAwareVariableSizeArrayType.new(
153
+ kernel_launchers.last.result_type,
154
+ location: :host).to_union_type
155
+ end
156
+
157
+ # Free device memory
158
+ def build_memory_free
159
+ result = ""
160
+
161
+ for launcher in kernel_launchers
162
+ if !launcher.reuse_memory?
163
+ result = result + launcher.build_device_memory_free
164
+ end
165
+ end
166
+
167
+ return result
168
+ end
169
+
170
+ # Build the struct type for `result_t`.
171
+ def build_header_structs
172
+ header_structs = Translator.read_file(file_name: "header_structs.cpp",
173
+ replacements: {"result_type" => result_type.to_c_type})
174
+ end
175
+
176
+ # Builds the CUDA program. Returns the source code string.
177
+ def build_program
178
+ assert_ready_to_build
179
+
180
+ result = build_header + build_struct_types + build_header_structs +
181
+ build_array_command_struct_types + build_environment_struct +
182
+ build_kernels
183
+
184
+ # Build program entry point
185
+ return result + Translator.read_file(file_name: "entry_point.cpp", replacements: {
186
+ "prepare_environment" => environment_builder.build_environment_variable,
187
+ "launch_all_kernels" => build_kernel_launchers,
188
+ "free_device_memory" => build_memory_free,
189
+ "host_env_var_name" => Constants::ENV_HOST_IDENTIFIER,
190
+ "host_result_array" => host_result_expression})
191
+ end
192
+ end
193
+ end
194
+ end
195
+ end
196
+
197
+ require_relative "program_launcher"