ikra 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. checksums.yaml +4 -4
  2. data/lib/ast/builder.rb +225 -77
  3. data/lib/ast/host_section_builder.rb +38 -0
  4. data/lib/ast/interpreter.rb +67 -0
  5. data/lib/ast/lexical_variables_enumerator.rb +3 -2
  6. data/lib/ast/nodes.rb +521 -31
  7. data/lib/ast/printer.rb +116 -18
  8. data/lib/ast/ssa_generator.rb +192 -0
  9. data/lib/ast/visitor.rb +235 -21
  10. data/lib/config/configuration.rb +28 -3
  11. data/lib/config/os_configuration.rb +62 -9
  12. data/lib/cpu/cpu_implementation.rb +39 -0
  13. data/lib/ikra.rb +13 -3
  14. data/lib/resources/cuda/allocate_device_memory.cpp +5 -0
  15. data/lib/resources/cuda/allocate_host_memory.cpp +1 -0
  16. data/lib/resources/cuda/allocate_memcpy_environment_to_device.cpp +11 -0
  17. data/lib/resources/cuda/ast/assignment.cpp +1 -0
  18. data/lib/resources/cuda/block_function_head.cpp +7 -1
  19. data/lib/resources/cuda/entry_point.cpp +47 -0
  20. data/lib/resources/cuda/env_builder_copy_array.cpp +8 -2
  21. data/lib/resources/cuda/free_device_memory.cpp +3 -0
  22. data/lib/resources/cuda/free_memory_for_command.cpp +24 -0
  23. data/lib/resources/cuda/header.cpp +23 -9
  24. data/lib/resources/cuda/header_structs.cpp +92 -0
  25. data/lib/resources/cuda/host_section_block_function_head.cpp +12 -0
  26. data/lib/resources/cuda/host_section_entry_point.cpp +55 -0
  27. data/lib/resources/cuda/host_section_free_device_memory.cpp +18 -0
  28. data/lib/resources/cuda/host_section_launch_parallel_section.cpp +14 -0
  29. data/lib/resources/cuda/host_section_malloc_memcpy_device_to_host.cpp +10 -0
  30. data/lib/resources/cuda/kernel.cpp +9 -2
  31. data/lib/resources/cuda/launch_kernel.cpp +5 -0
  32. data/lib/resources/cuda/memcpy_device_to_host.cpp +3 -0
  33. data/lib/resources/cuda/memcpy_device_to_host_expr.cpp +10 -0
  34. data/lib/resources/cuda/reduce_body.cpp +88 -0
  35. data/lib/resources/cuda/stencil_array_reconstruction.cpp +2 -0
  36. data/lib/resources/cuda/stencil_body.cpp +16 -0
  37. data/lib/resources/cuda/struct_definition.cpp +4 -0
  38. data/lib/ruby_core/array.rb +34 -0
  39. data/lib/ruby_core/array_command.rb +313 -0
  40. data/lib/ruby_core/core.rb +103 -0
  41. data/lib/ruby_core/interpreter.rb +16 -0
  42. data/lib/ruby_core/math.rb +32 -0
  43. data/lib/ruby_core/ruby_integration.rb +256 -0
  44. data/lib/symbolic/host_section.rb +115 -0
  45. data/lib/symbolic/input.rb +87 -0
  46. data/lib/symbolic/input_visitor.rb +68 -0
  47. data/lib/symbolic/symbolic.rb +793 -117
  48. data/lib/symbolic/visitor.rb +70 -8
  49. data/lib/translator/array_command_struct_builder.rb +163 -0
  50. data/lib/translator/ast_translator.rb +572 -0
  51. data/lib/translator/block_translator.rb +104 -48
  52. data/lib/translator/commands/array_combine_command.rb +41 -0
  53. data/lib/translator/commands/array_identity_command.rb +28 -0
  54. data/lib/translator/commands/array_index_command.rb +52 -0
  55. data/lib/translator/commands/array_reduce_command.rb +135 -0
  56. data/lib/translator/commands/array_stencil_command.rb +129 -0
  57. data/lib/translator/commands/array_zip_command.rb +30 -0
  58. data/lib/translator/commands/command_translator.rb +264 -0
  59. data/lib/translator/cuda_errors.rb +32 -0
  60. data/lib/translator/environment_builder.rb +263 -0
  61. data/lib/translator/host_section/array_host_section_command.rb +150 -0
  62. data/lib/translator/host_section/array_in_host_section_command.rb +41 -0
  63. data/lib/translator/host_section/ast_translator.rb +14 -0
  64. data/lib/translator/host_section/parallel_section_invocation_visitor.rb +20 -0
  65. data/lib/translator/host_section/program_builder.rb +89 -0
  66. data/lib/translator/input_translator.rb +226 -0
  67. data/lib/translator/kernel_builder.rb +137 -0
  68. data/lib/translator/kernel_launcher/for_loop_kernel_launcher.rb +40 -0
  69. data/lib/translator/kernel_launcher/kernel_launcher.rb +259 -0
  70. data/lib/translator/kernel_launcher/while_loop_kernel_launcher.rb +38 -0
  71. data/lib/translator/last_returns_visitor.rb +19 -10
  72. data/lib/translator/program_builder.rb +197 -0
  73. data/lib/translator/program_launcher.rb +273 -0
  74. data/lib/translator/struct_type.rb +55 -0
  75. data/lib/translator/translator.rb +34 -11
  76. data/lib/translator/variable_classifier_visitor.rb +56 -0
  77. data/lib/types/inference/ast_inference.rb +586 -0
  78. data/lib/types/inference/clear_types_visitor.rb +11 -0
  79. data/lib/types/inference/command_inference.rb +101 -0
  80. data/lib/types/inference/input_inference.rb +62 -0
  81. data/lib/types/{object_tracer.rb → inference/object_tracer.rb} +5 -6
  82. data/lib/types/inference/ruby_extension.rb +35 -0
  83. data/lib/types/inference/symbol_table.rb +131 -0
  84. data/lib/types/types.rb +14 -0
  85. data/lib/types/types/array_command_type.rb +123 -0
  86. data/lib/types/types/array_type.rb +137 -0
  87. data/lib/types/{class_type.rb → types/class_type.rb} +42 -18
  88. data/lib/types/{primitive_type.rb → types/primitive_type.rb} +20 -7
  89. data/lib/types/types/ruby_type.rb +88 -0
  90. data/lib/types/types/struct_type.rb +179 -0
  91. data/lib/types/types/union_type.rb +239 -0
  92. metadata +160 -18
  93. data/lib/ast/method_definition.rb +0 -37
  94. data/lib/ast/translator.rb +0 -264
  95. data/lib/resources/cuda/kernel_launcher.cpp +0 -28
  96. data/lib/scope.rb +0 -166
  97. data/lib/translator/command_translator.rb +0 -421
  98. data/lib/translator/local_variables_enumerator.rb +0 -35
  99. data/lib/translator/method_translator.rb +0 -24
  100. data/lib/types/array_type.rb +0 -51
  101. data/lib/types/ruby_extension.rb +0 -67
  102. data/lib/types/ruby_type.rb +0 -45
  103. data/lib/types/type_inference.rb +0 -382
  104. data/lib/types/union_type.rb +0 -155
@@ -0,0 +1,40 @@
1
module Ikra
    module Translator
        class CommandTranslator
            # Kernel launcher that wraps the regular kernel launch code in a C-style
            # `for` loop, so that the kernel is launched once per iteration.
            class ForLoopKernelLauncher < KernelLauncher
                # Source code fragments used to assemble the loop head
                # (`for (int var_name = from_expr; var_name < to_expr; var_name ++)`)
                # and the code emitted in front of the loop (`before_loop`).
                attr_reader :from_expr, :to_expr, :var_name, :before_loop

                # @param kernel_builder builder of the kernel that is launched in the loop
                # @param from_expr [String] initial value of the loop variable
                # @param to_expr [String] exclusive upper bound of the loop variable
                # @param var_name [String] name of the loop variable
                # @param before_loop [String] source code emitted before the loop
                def initialize(
                        kernel_builder:,
                        from_expr: "0",
                        to_expr:,
                        var_name: "i",
                        before_loop: "")

                    super(kernel_builder)

                    @before_loop = before_loop
                    @var_name = var_name
                    @from_expr = from_expr
                    @to_expr = to_expr
                end

                # Builds the launch code of the superclass and surrounds it with the
                # `for` loop. Returns the generated source code string.
                def build_kernel_launcher
                    Log.info("Building for-loop kernel launcher")

                    assert_ready_to_build

                    prologue = before_loop + "\n"
                    loop_head = "for (int #{var_name} = #{from_expr}; #{var_name} < #{to_expr}; #{var_name} ++)\n{"
                    loop_body = super

                    return prologue + loop_head + loop_body + "\n}\n"
                end
            end
        end
    end
end
@@ -0,0 +1,259 @@
1
module Ikra
    module Translator
        class CommandTranslator

            # Builds the launch of the kernel. This class is responsible for generating the
            # invocation of the kernel.
            #
            # For example:
            # kernel<<<..., ...>>>(env, result, d_a, ...);
            class KernelLauncher
                class << self
                    # Debug flag only: Frees all input after launching kernel. This causes an
                    # error if data is used twice or kept (using the `keep` flag)
                    attr_accessor :debug_free_previous_input_immediately
                end

                attr_accessor :kernel_builder

                # Initialized to an empty array.
                # NOTE(review): this attribute is not read inside this class; the launch
                # code uses `kernel_builder.previous_kernel_input` instead. Confirm whether
                # it is still needed.
                attr_accessor :previous_kernel_input

                # Additional arguments that are appended to the kernel invocation. Entries
                # are either strings or callables that are resolved by
                # `prepare_additional_args_for_launch`.
                attr_accessor :additional_arguments

                # Number of threads (elements to be processed)
                attr_accessor :num_threads

                # Block/grid dimensions (should be 1D)
                attr_accessor :grid_dim
                attr_accessor :block_dim

                # Whether the launch allocates new memory beforehand or uses previous memory
                attr_accessor :reuse_memory

                # Pointer to the resulting array (device memory)
                attr_reader :kernel_result_var_name

                # IDs and types of commands whose results are kept on the GPU
                attr_accessor :cached_results

                # IDs and types of commands that were previously computed and shall now be
                # used in this kernel as input
                attr_reader :previously_cached_results

                # @param kernel_builder builder of the kernel that this launcher invokes
                def initialize(kernel_builder)
                    @kernel_builder = kernel_builder
                    @additional_arguments = []
                    @previous_kernel_input = []
                    @reuse_memory = false
                    @kernel_result_var_name = "_kernel_result_" + CommandTranslator.next_unique_id.to_s
                    @cached_results = {}
                    @previously_cached_results = {}
                end

                # Some of the values stored in `@additional_arguments` might be blocks, because
                # not all information was known when adding something to that list. This method
                # replaces those blocks (evaluates them) with actual strings, based on the command
                # that is being launched.
                def prepare_additional_args_for_launch(command)
                    @additional_arguments = @additional_arguments.map do |arg|
                        arg.is_a?(String) ? arg : arg.call(command)
                    end
                end

                def kernel_builders
                    # The program builder accesses kernel builders via kernel launchers through
                    # this method, because some specialized launchers might have multiple kernel
                    # builders.
                    return [kernel_builder]
                end

                # Adds command whose result will be kept on GPU
                def add_cached_result(result_id, type)
                    @cached_results[result_id] = type
                end

                # Adds a previously computed result which will be used in this launch as input
                def use_cached_result(result_id, type)
                    @previously_cached_results[result_id] = type
                end

                # Marks this launch as reusing existing memory: no result memory is
                # allocated and `parameter_name` becomes the result variable.
                def reuse_memory!(parameter_name)
                    @reuse_memory = true
                    @kernel_result_var_name = parameter_name
                end

                def reuse_memory?
                    return @reuse_memory
                end

                # Delegates to the kernel builder.
                def add_previous_kernel_parameter(parameter)
                    kernel_builder.add_previous_kernel_parameter(parameter)
                end

                # Add additional arguments to the kernel function that might be needed for some computations
                def add_additional_arguments(*arguments)
                    @additional_arguments.push(*arguments)
                end

                # The result type of this kernel launcher. Same as the result type of its kernel
                # builder.
                def result_type
                    return kernel_builder.result_type
                end

                # The size of the result array is the number of threads.
                def result_size
                    return num_threads
                end

                # Configures grid size and block size. Also sets number of threads.
                #
                # @param size [Integer, String] number of elements; either a constant or a
                #   source code expression that is evaluated in the generated program
                # @param block_size [Integer, nil] threads per block; `nil` means 256
                # @raise [AssertionError] if `size` is neither an Integer nor a String
                def configure_grid(size, block_size: 256)
                    block_size = 256 if block_size.nil?

                    # `Integer` instead of `Fixnum`: `Fixnum` was deprecated in Ruby 2.4 and
                    # removed in Ruby 3.2. Every former Fixnum is an Integer.
                    if size.is_a?(Integer)
                        # Precompute constants
                        @grid_dim = [size.fdiv(block_size).ceil, 1].max.to_s
                        @block_dim = (size >= block_size ? block_size : size).to_s
                        @num_threads = size
                    else
                        if !size.is_a?(String)
                            raise AssertionError.new("Integer or String expected")
                        end

                        # Source code string determines the size
                        @grid_dim = "max((int) ceil(((float) #{size}) / #{block_size}), 1)"
                        @block_dim = "(#{size} >= #{block_size} ? #{block_size} : #{size})"
                        @num_threads = size
                    end
                end

                # Raises an [AssertionError] unless the number of threads and the grid/block
                # dimensions have been set (see `configure_grid`).
                def assert_ready_to_build
                    [:num_threads, :grid_dim, :block_dim].each do |selector|
                        if send(selector) == nil
                            raise AssertionError.new(
                                "Not ready to build (KernelLauncher): #{selector} is not set")
                        end
                    end
                end


                # Build the code that launches this kernel. The generated code performs the
                # following steps:
                #
                # 1. Unless memory is reused: Allocate device memory for the result.
                # 2. Declare pointers to previously cached results.
                # 3. Allocate device memory for results that should be cached.
                # 4. Launch the kernel (using the `launch_kernel.cpp` template).
                # 5. Store pointers to cached results in the host environment.
                #
                # Returns the generated source code string.
                def build_kernel_launcher

                    Log.info("Building kernel launcher")

                    assert_ready_to_build

                    result = ""
                    if !reuse_memory
                        # Allocate device memory for kernel result
                        result = result + Translator.read_file(file_name: "allocate_device_memory.cpp", replacements: {
                            "name" => kernel_result_var_name,
                            "bytes" => "(sizeof(#{kernel_builder.result_type.to_c_type}) * #{num_threads})",
                            "type" => kernel_builder.result_type.to_c_type})
                    end

                    previously_cached_results.each do |result_id, type|
                        result = result + " #{type.to_c_type} *prev_" + result_id.to_s + " = (#{type.to_c_type} *) " + Constants::ENV_HOST_IDENTIFIER + "->prev_" + result_id.to_s + ";\n"
                    end

                    # Allocate device memory for cached results. IDs are converted with
                    # `to_s` everywhere so that non-String IDs do not raise a TypeError.
                    cached_results.each do |result_id, type|
                        result = result + Translator.read_file(file_name: "allocate_device_memory.cpp", replacements: {
                            "name" => Constants::RESULT_IDENTIFIER + result_id.to_s,
                            "bytes" => "(#{type.c_size} * #{num_threads})",
                            "type" => type.to_c_type})
                    end

                    # Build arguments
                    a_env = Constants::ENV_DEVICE_IDENTIFIER
                    a_result = kernel_result_var_name

                    previous_kernel_args = kernel_builder.previous_kernel_input.map do |var|
                        var.name.to_s
                    end

                    a_cached_results = cached_results.map do |result_id, type|
                        Constants::RESULT_IDENTIFIER + result_id.to_s
                    end

                    if reuse_memory
                        # The first previous-kernel argument is replaced by the reused
                        # result variable.
                        previous_kernel_args[0] = a_result
                    end

                    arguments = ([a_env, num_threads, a_result] + a_cached_results + previous_kernel_args + additional_arguments).join(", ")

                    # Launch kernel
                    result = result + Translator.read_file(file_name: "launch_kernel.cpp", replacements: {
                        "kernel_name" => kernel_builder.kernel_name,
                        "arguments" => arguments,
                        "grid_dim" => grid_dim,
                        "block_dim" => block_dim})

                    # ---- DEBUG ONLY: Free input after computation so that we can process larger
                    # data sets in benchmarks without running out of memory
                    # TODO: Implement analysis and do this automatically
                    if KernelLauncher.debug_free_previous_input_immediately == true
                        kernel_builder.previous_kernel_input.each do |var|
                            result = result + Translator.read_file(file_name: "free_device_memory.cpp", replacements: {
                                "name" => var.name.to_s})
                        end
                    end
                    # ---- END DEBUG ONLY

                    cached_results.each do |result_id, type|
                        result = result + " " + Constants::ENV_HOST_IDENTIFIER + "->prev_" + result_id.to_s + " = " + Constants::RESULT_IDENTIFIER + result_id.to_s + ";\n"
                    end

                    return result
                end

                # Builds the code that frees the result memory of this kernel. Returns an
                # empty string if the debug flag for freeing input immediately is set.
                def build_device_memory_free
                    Log.info("Building kernel post-launch CUDA free")

                    assert_ready_to_build

                    if KernelLauncher.debug_free_previous_input_immediately == true
                        Log.warn("Debug flag set... Freeing input memory immediately and some memory not at all!")
                        return ""
                    end

                    return Translator.read_file(file_name: "free_device_memory.cpp", replacements: {
                        "name" => kernel_result_var_name})
                end

                # Same as above, but also removes item from the list of allocated memory chunks.
                def build_device_memory_free_in_host_section
                    Log.info("Building kernel post-launch CUDA free (host section)")

                    assert_ready_to_build

                    return Translator.read_file(file_name: "host_section_free_device_memory.cpp", replacements: {
                        "name" => kernel_result_var_name})
                end
            end
        end
    end
end
256
+
257
+ require_relative "../kernel_builder"
258
+ require_relative "for_loop_kernel_launcher"
259
+ require_relative "while_loop_kernel_launcher"
@@ -0,0 +1,38 @@
1
module Ikra
    module Translator
        class CommandTranslator
            # Kernel launcher that wraps the regular kernel launch code in a C-style
            # `while` loop, so that the kernel is launched as long as `condition` holds.
            class WhileLoopKernelLauncher < KernelLauncher
                # @param kernel_builder builder of the kernel that is launched in the loop
                # @param condition [String] source code of the loop condition
                # @param before_loop [String] source code emitted before the loop
                # @param post_iteration [String] source code emitted at the end of every
                #   iteration, after the kernel launch
                def initialize(
                        kernel_builder:,
                        condition:,
                        before_loop: "",
                        post_iteration: "")

                    super(kernel_builder)
                    @condition = condition
                    @before_loop = before_loop
                    @post_iteration = post_iteration
                end

                attr_reader :condition
                attr_reader :before_loop
                attr_reader :post_iteration

                # Builds the launch code of the superclass and surrounds it with the
                # `while` loop. Returns the generated source code string.
                def build_kernel_launcher
                    # Previously logged "for-loop" (copy-paste from ForLoopKernelLauncher)
                    Log.info("Building while-loop kernel launcher")

                    assert_ready_to_build

                    result = ""
                    result = result + before_loop + "\n"
                    result = result + "while (#{condition}) {\n"
                    result = result + super
                    result = result + "\n" + post_iteration
                    result = result + "\n}\n"

                    return result
                end
            end
        end
    end
end
@@ -3,37 +3,42 @@ require_relative "../ast/visitor"
3
3
 
4
4
  module Ikra
5
5
  module Translator
6
+ # Visitor that replaces implicit returns with explicit ones
6
7
  class LastStatementReturnsVisitor < AST::Visitor
7
8
  def visit_root_node(node)
8
- node.child.accept(self)
9
+ node.single_child.accept(self)
9
10
  end
10
11
 
11
12
  def visit_lvar_read_node(node)
12
- node.parent.replace_child(node, AST::ReturnNode.new(value: node))
13
+ process_node(node)
13
14
  end
14
15
 
15
16
  def visit_lvar_write_node(node)
16
- node.parent.replace_child(node, AST::ReturnNode.new(value: node))
17
+ process_node(node)
17
18
  end
18
19
 
19
20
  def visit_int_node(node)
20
- node.parent.replace_child(node, AST::ReturnNode.new(value: node))
21
+ process_node(node)
21
22
  end
22
23
 
23
24
  def visit_float_node(node)
24
- node.parent.replace_child(node, AST::ReturnNode.new(value: node))
25
+ process_node(node)
25
26
  end
26
27
 
27
28
  def visit_bool_node(node)
28
- node.parent.replace_child(node, AST::ReturnNode.new(value: node))
29
+ process_node(node)
30
+ end
31
+
32
+ def visit_nil_node(node)
33
+ process_node(node)
29
34
  end
30
35
 
31
36
  def visit_for_node(node)
32
- raise "Cannot handle for loop as return value"
37
+ raise NotImplementedError.new("Cannot handle for loop as return value")
33
38
  end
34
39
 
35
40
  def visit_break_node(node)
36
- raise "Break must not be a return value"
41
+ raise AssertionError.new("Break must not be a return value")
37
42
  end
38
43
 
39
44
  def visit_if_node(node)
@@ -46,11 +51,15 @@ module Ikra
46
51
  end
47
52
 
48
53
  def visit_send_node(node)
49
- node.parent.replace_child(node, AST::ReturnNode.new(value: node))
54
+ process_node(node)
50
55
  end
51
56
 
52
57
  def visit_return_node(node)
53
- raise "Function returns already"
58
+ # Do nothing
59
+ end
60
+
61
+ def process_node(node)
62
+ node.parent.replace_child(node, AST::ReturnNode.new(value: node))
54
63
  end
55
64
  end
56
65
  end
@@ -0,0 +1,197 @@
1
+ require "tempfile"
2
+ require "set"
3
+
4
module Ikra
    module Translator
        class CommandTranslator

            # Builds the entire CUDA program. A CUDA program may consist of multiple kernels, but
            # has at least one kernel. The generated code performs the following steps:
            #
            # 1. Insert header of CUDA file.
            # 2. For every kernel: Build all methods, blocks, and kernels.
            # 3. Build the program entry point (including kernel launchers).
            class ProgramBuilder
                attr_reader :environment_builder
                attr_reader :kernel_launchers
                attr_reader :kernels
                attr_reader :root_command

                # A set of struct definitions ([Types::StructType] instances) that should be
                # generated for this program.
                attr_reader :structs

                # A set of array command structs.
                attr_reader :array_command_structs

                def initialize(environment_builder:, root_command:)
                    @kernel_launchers = []
                    @kernels = Set.new
                    @environment_builder = environment_builder
                    @root_command = root_command

                    # The collection of structs is a [Set]. Struct types are unique, i.e., there
                    # are never two equal struct types with different object identity.
                    @structs = Set.new
                    @array_command_structs = Set.new
                end

                # Registers one or more array command structs. Duplicates are ignored.
                def add_array_command_struct(*structs)
                    structs.each do |struct|
                        array_command_structs.add(struct)
                    end
                end

                def add_kernel_launcher(launcher)
                    @kernel_launchers.push(launcher)
                end

                # Generates the source code for the CUDA program, compiles it with nvcc and
                # executes the program.
                def execute
                    source = build_program

                    launcher = Launcher.new(
                        source: source,
                        environment_builder: environment_builder,
                        result_type: result_type,
                        root_command: root_command)

                    launcher.compile
                    return launcher.execute
                end

                # Build kernel invocations
                def build_kernel_launchers
                    return kernel_launchers.map do |launcher|
                        launcher.build_kernel_launcher
                    end.join("")
                end

                protected

                # Raises an [AssertionError] if no kernel launcher was added.
                def assert_ready_to_build
                    if kernel_launchers.size == 0
                        raise AssertionError.new(
                            "Not ready to build (ProgramBuilder): No kernel launcher defined")
                    end
                end

                # Build header of CUDA source code file
                def build_header
                    return Translator.read_file(file_name: "header.cpp")
                end

                # Build environment struct definition
                def build_environment_struct
                    return environment_builder.build_environment_struct
                end

                # Generate all struct types (except for array command struct types).
                def build_struct_types
                    return structs.map do |struct_type|
                        struct_type.generate_definition
                    end.join("\n") + "\n"
                end

                def build_array_command_struct_types
                    return array_command_structs.to_a.join("\n") + "\n"
                end

                # Kernel builders of all kernel launchers, as a flat array.
                def all_kernel_builders
                    return kernel_launchers.map do |launcher|
                        launcher.kernel_builders
                    end.flatten
                end

                # Build methods, blocks and kernels
                def build_kernels
                    result = ""

                    all_kernel_builders.each do |builder|
                        # Skip kernels that were already built before. `Set#add?` returns
                        # nil if the builder is already contained in the set.
                        next if kernels.add?(builder).nil?

                        result = result + builder.build_methods
                        result = result + builder.build_blocks
                        result = result + builder.build_kernel
                    end

                    return result
                end

                # Builds the expression that copies the result of the last kernel launch
                # from the device to the host.
                def host_result_expression
                    # Read some fields from last kernel launch configuration
                    result_device_ptr = kernel_launchers.last.kernel_result_var_name
                    result_c_type = kernel_launchers.last.result_type.to_c_type
                    result_size = root_command.size

                    if result_device_ptr == nil
                        raise AssertionError.new(
                            "Result variable name of final kernel launcher not set")
                    end

                    # Build result values: `variable_size_array_t` struct. This struct contains a
                    # pointer to the result array and stores the size of the result.
                    result_device_variable_array_t = "variable_size_array_t((void *) #{result_device_ptr}, #{result_size})"

                    return Translator.read_file(file_name: "memcpy_device_to_host_expr.cpp", replacements: {
                        "type" => result_c_type,
                        "device_array" => result_device_variable_array_t})
                end

                # Returns the result type of this program. The result type must always be a
                # union type that includes a [Types::LocationAwareVariableSizeArrayType]
                # object, because this way we can support return types where the inner type
                # of an array is unknown at compile time.
                def result_type
                    return Types::LocationAwareVariableSizeArrayType.new(
                        kernel_launchers.last.result_type,
                        location: :host).to_union_type
                end

                # Free device memory
                def build_memory_free
                    result = ""

                    kernel_launchers.each do |launcher|
                        # Launchers that reuse memory did not allocate a result array
                        next if launcher.reuse_memory?

                        result = result + launcher.build_device_memory_free
                    end

                    return result
                end

                # Build the struct type for `result_t`.
                def build_header_structs
                    return Translator.read_file(file_name: "header_structs.cpp",
                        replacements: {"result_type" => result_type.to_c_type})
                end

                # Builds the CUDA program. Returns the source code string.
                def build_program
                    assert_ready_to_build

                    result = build_header + build_struct_types + build_header_structs +
                        build_array_command_struct_types + build_environment_struct +
                        build_kernels

                    # Build program entry point
                    return result + Translator.read_file(file_name: "entry_point.cpp", replacements: {
                        "prepare_environment" => environment_builder.build_environment_variable,
                        "launch_all_kernels" => build_kernel_launchers,
                        "free_device_memory" => build_memory_free,
                        "host_env_var_name" => Constants::ENV_HOST_IDENTIFIER,
                        "host_result_array" => host_result_expression})
                end
            end
        end
    end
end
196
+
197
+ require_relative "program_launcher"