ikra 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/ast/builder.rb +225 -77
- data/lib/ast/host_section_builder.rb +38 -0
- data/lib/ast/interpreter.rb +67 -0
- data/lib/ast/lexical_variables_enumerator.rb +3 -2
- data/lib/ast/nodes.rb +521 -31
- data/lib/ast/printer.rb +116 -18
- data/lib/ast/ssa_generator.rb +192 -0
- data/lib/ast/visitor.rb +235 -21
- data/lib/config/configuration.rb +28 -3
- data/lib/config/os_configuration.rb +62 -9
- data/lib/cpu/cpu_implementation.rb +39 -0
- data/lib/ikra.rb +13 -3
- data/lib/resources/cuda/allocate_device_memory.cpp +5 -0
- data/lib/resources/cuda/allocate_host_memory.cpp +1 -0
- data/lib/resources/cuda/allocate_memcpy_environment_to_device.cpp +11 -0
- data/lib/resources/cuda/ast/assignment.cpp +1 -0
- data/lib/resources/cuda/block_function_head.cpp +7 -1
- data/lib/resources/cuda/entry_point.cpp +47 -0
- data/lib/resources/cuda/env_builder_copy_array.cpp +8 -2
- data/lib/resources/cuda/free_device_memory.cpp +3 -0
- data/lib/resources/cuda/free_memory_for_command.cpp +24 -0
- data/lib/resources/cuda/header.cpp +23 -9
- data/lib/resources/cuda/header_structs.cpp +92 -0
- data/lib/resources/cuda/host_section_block_function_head.cpp +12 -0
- data/lib/resources/cuda/host_section_entry_point.cpp +55 -0
- data/lib/resources/cuda/host_section_free_device_memory.cpp +18 -0
- data/lib/resources/cuda/host_section_launch_parallel_section.cpp +14 -0
- data/lib/resources/cuda/host_section_malloc_memcpy_device_to_host.cpp +10 -0
- data/lib/resources/cuda/kernel.cpp +9 -2
- data/lib/resources/cuda/launch_kernel.cpp +5 -0
- data/lib/resources/cuda/memcpy_device_to_host.cpp +3 -0
- data/lib/resources/cuda/memcpy_device_to_host_expr.cpp +10 -0
- data/lib/resources/cuda/reduce_body.cpp +88 -0
- data/lib/resources/cuda/stencil_array_reconstruction.cpp +2 -0
- data/lib/resources/cuda/stencil_body.cpp +16 -0
- data/lib/resources/cuda/struct_definition.cpp +4 -0
- data/lib/ruby_core/array.rb +34 -0
- data/lib/ruby_core/array_command.rb +313 -0
- data/lib/ruby_core/core.rb +103 -0
- data/lib/ruby_core/interpreter.rb +16 -0
- data/lib/ruby_core/math.rb +32 -0
- data/lib/ruby_core/ruby_integration.rb +256 -0
- data/lib/symbolic/host_section.rb +115 -0
- data/lib/symbolic/input.rb +87 -0
- data/lib/symbolic/input_visitor.rb +68 -0
- data/lib/symbolic/symbolic.rb +793 -117
- data/lib/symbolic/visitor.rb +70 -8
- data/lib/translator/array_command_struct_builder.rb +163 -0
- data/lib/translator/ast_translator.rb +572 -0
- data/lib/translator/block_translator.rb +104 -48
- data/lib/translator/commands/array_combine_command.rb +41 -0
- data/lib/translator/commands/array_identity_command.rb +28 -0
- data/lib/translator/commands/array_index_command.rb +52 -0
- data/lib/translator/commands/array_reduce_command.rb +135 -0
- data/lib/translator/commands/array_stencil_command.rb +129 -0
- data/lib/translator/commands/array_zip_command.rb +30 -0
- data/lib/translator/commands/command_translator.rb +264 -0
- data/lib/translator/cuda_errors.rb +32 -0
- data/lib/translator/environment_builder.rb +263 -0
- data/lib/translator/host_section/array_host_section_command.rb +150 -0
- data/lib/translator/host_section/array_in_host_section_command.rb +41 -0
- data/lib/translator/host_section/ast_translator.rb +14 -0
- data/lib/translator/host_section/parallel_section_invocation_visitor.rb +20 -0
- data/lib/translator/host_section/program_builder.rb +89 -0
- data/lib/translator/input_translator.rb +226 -0
- data/lib/translator/kernel_builder.rb +137 -0
- data/lib/translator/kernel_launcher/for_loop_kernel_launcher.rb +40 -0
- data/lib/translator/kernel_launcher/kernel_launcher.rb +259 -0
- data/lib/translator/kernel_launcher/while_loop_kernel_launcher.rb +38 -0
- data/lib/translator/last_returns_visitor.rb +19 -10
- data/lib/translator/program_builder.rb +197 -0
- data/lib/translator/program_launcher.rb +273 -0
- data/lib/translator/struct_type.rb +55 -0
- data/lib/translator/translator.rb +34 -11
- data/lib/translator/variable_classifier_visitor.rb +56 -0
- data/lib/types/inference/ast_inference.rb +586 -0
- data/lib/types/inference/clear_types_visitor.rb +11 -0
- data/lib/types/inference/command_inference.rb +101 -0
- data/lib/types/inference/input_inference.rb +62 -0
- data/lib/types/{object_tracer.rb → inference/object_tracer.rb} +5 -6
- data/lib/types/inference/ruby_extension.rb +35 -0
- data/lib/types/inference/symbol_table.rb +131 -0
- data/lib/types/types.rb +14 -0
- data/lib/types/types/array_command_type.rb +123 -0
- data/lib/types/types/array_type.rb +137 -0
- data/lib/types/{class_type.rb → types/class_type.rb} +42 -18
- data/lib/types/{primitive_type.rb → types/primitive_type.rb} +20 -7
- data/lib/types/types/ruby_type.rb +88 -0
- data/lib/types/types/struct_type.rb +179 -0
- data/lib/types/types/union_type.rb +239 -0
- metadata +160 -18
- data/lib/ast/method_definition.rb +0 -37
- data/lib/ast/translator.rb +0 -264
- data/lib/resources/cuda/kernel_launcher.cpp +0 -28
- data/lib/scope.rb +0 -166
- data/lib/translator/command_translator.rb +0 -421
- data/lib/translator/local_variables_enumerator.rb +0 -35
- data/lib/translator/method_translator.rb +0 -24
- data/lib/types/array_type.rb +0 -51
- data/lib/types/ruby_extension.rb +0 -67
- data/lib/types/ruby_type.rb +0 -45
- data/lib/types/type_inference.rb +0 -382
- data/lib/types/union_type.rb +0 -155
@@ -0,0 +1,40 @@
|
|
1
|
+
module Ikra
|
2
|
+
module Translator
|
3
|
+
class CommandTranslator
|
4
|
+
# Kernel launcher that wraps the launch code generated by [KernelLauncher] in a
# counting `for` loop in the generated source, so that the kernel launch is
# repeated once per loop iteration.
class ForLoopKernelLauncher < KernelLauncher
    # Source code expression for the initial value of the loop variable.
    attr_reader :from_expr

    # Source code expression for the exclusive upper bound of the loop variable.
    attr_reader :to_expr

    # Name of the loop variable in the generated code.
    attr_reader :var_name

    # Source code that is emitted once, in front of the loop.
    attr_reader :before_loop

    def initialize(
            kernel_builder:,
            from_expr: "0",
            to_expr:,
            var_name: "i",
            before_loop: "")

        super(kernel_builder)

        @before_loop = before_loop
        @var_name = var_name
        @from_expr = from_expr
        @to_expr = to_expr
    end

    # Builds the launch code: `before_loop`, followed by a `for` loop whose body is
    # the launch code generated by the superclass.
    def build_kernel_launcher
        Log.info("Building for-loop kernel launcher")

        assert_ready_to_build

        prologue = before_loop + "\n"
        loop_head = "for (int #{var_name} = #{from_expr}; #{var_name} < #{to_expr}; #{var_name} ++)\n{"
        loop_body = super

        return prologue + loop_head + loop_body + "\n}\n"
    end
end
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
@@ -0,0 +1,259 @@
|
|
1
|
+
module Ikra
|
2
|
+
module Translator
|
3
|
+
class CommandTranslator
|
4
|
+
|
5
|
+
# Builds the launch of the kernel. This class is responsible for generating the
# invocation of the kernel.
#
# For example:
# kernel<<<..., ...>>>(env, result, d_a, ...);
class KernelLauncher
    class << self
        # Debug flag only: Frees all input after launching kernel. This causes an
        # error if data is used twice or kept (using the `keep` flag)
        attr_accessor :debug_free_previous_input_immediately
    end

    # The kernel builder whose kernel is invoked by the generated launch code.
    attr_accessor :kernel_builder

    # Additional parameters that this kernel should accept (to access the result
    # of previous kernels)
    attr_accessor :previous_kernel_input

    # Additional arguments that are appended to the kernel invocation. Entries are
    # either strings or callables; callables are replaced by their result in
    # `prepare_additional_args_for_launch`.
    attr_accessor :additional_arguments

    # Number of threads (elements to be processed)
    attr_accessor :num_threads

    # Block/grid dimensions (should be 1D)
    attr_accessor :grid_dim
    attr_accessor :block_dim

    # Whether the launch allocates new memory beforehand or uses previous memory
    attr_accessor :reuse_memory

    # Pointer to the resulting array (device memory)
    attr_reader :kernel_result_var_name

    # IDs and types of commands whose results are kept on the GPU
    attr_accessor :cached_results

    # IDs and types of commands that were previously computed and shall now be
    # used in this kernel as input
    attr_reader :previously_cached_results

    def initialize(kernel_builder)
        @kernel_builder = kernel_builder
        @additional_arguments = []
        @previous_kernel_input = []
        @reuse_memory = false
        @kernel_result_var_name = "_kernel_result_" + CommandTranslator.next_unique_id.to_s
        @cached_results = {}
        @previously_cached_results = {}
    end

    # Some of the values stored in `@additional_arguments` might be blocks, because
    # not all information was known when adding something to that list. This method
    # replaces those blocks (evaluates them) with actual strings, based on the command
    # that is being launched.
    def prepare_additional_args_for_launch(command)
        @additional_arguments = @additional_arguments.map do |arg|
            arg.is_a?(String) ? arg : arg.call(command)
        end
    end

    # The program builder accesses kernel builders via kernel launchers through
    # this method, because some specialized launchers might have multiple kernel
    # builders.
    def kernel_builders
        return [kernel_builder]
    end

    # Adds command whose result will be kept on GPU
    def add_cached_result(result_id, type)
        @cached_results[result_id] = type
    end

    # Adds a previously computed result which will be used in this launch as input
    def use_cached_result(result_id, type)
        @previously_cached_results[result_id] = type
    end

    # Marks this launcher as reusing existing memory: no result memory is allocated
    # in the launch code, and `parameter_name` becomes the result variable name.
    def reuse_memory!(parameter_name)
        @reuse_memory = true
        @kernel_result_var_name = parameter_name
    end

    def reuse_memory?
        return @reuse_memory
    end

    # Forwards the parameter to the kernel builder.
    def add_previous_kernel_parameter(parameter)
        kernel_builder.add_previous_kernel_parameter(parameter)
    end

    # Add additional arguments to the kernel function that might be needed for some computations
    def add_additional_arguments(*arguments)
        @additional_arguments.push(*arguments)
    end

    # The result type of this kernel launcher. Same as the result type of its kernel
    # builder.
    def result_type
        return kernel_builder.result_type
    end

    # The size of the result array is the number of threads.
    def result_size
        return num_threads
    end

    # Configures grid size and block size. Also sets number of threads.
    #
    # `size` is either an Integer (dimensions are precomputed and stored as
    # strings) or a String containing a source code expression (dimensions are
    # emitted as source code expressions). A `block_size` of `nil` falls back to
    # 256. Raises an AssertionError for any other kind of `size`.
    def configure_grid(size, block_size: 256)
        block_size = 256 if block_size.nil?

        # `Integer` instead of `Fixnum`: `Fixnum` was merged into `Integer` and is no
        # longer defined in current Ruby versions.
        if size.is_a?(Integer)
            # Precompute constants
            @grid_dim = [size.fdiv(block_size).ceil, 1].max.to_s
            @block_dim = (size >= block_size ? block_size : size).to_s
        elsif size.is_a?(String)
            # Source code string determines the size
            @grid_dim = "max((int) ceil(((float) #{size}) / #{block_size}), 1)"
            @block_dim = "(#{size} >= #{block_size} ? #{block_size} : #{size})"
        else
            raise AssertionError.new("Integer or String expected")
        end

        @num_threads = size
    end

    # Raises an AssertionError unless the number of threads and the grid and block
    # dimensions are set.
    def assert_ready_to_build
        [:num_threads, :grid_dim, :block_dim].each do |selector|
            if send(selector).nil?
                raise AssertionError.new(
                    "Not ready to build (KernelLauncher): #{selector} is not set")
            end
        end
    end

    # Build the code that launches this kernel. The generated code performs the
    # following steps:
    #
    # 1. Allocate device memory for the result (unless memory is reused).
    # 2. Declare pointers to previously cached results, read from the host
    #    environment.
    # 3. Allocate device memory for results that should be cached.
    # 4. Launch the kernel.
    # 5. Debug flag only: free the input of previous kernels.
    # 6. Store pointers to the newly cached results in the host environment.
    def build_kernel_launcher
        Log.info("Building kernel launcher")

        assert_ready_to_build

        result = ""

        unless reuse_memory
            # Allocate device memory for kernel result
            result += Translator.read_file(file_name: "allocate_device_memory.cpp", replacements: {
                "name" => kernel_result_var_name,
                "bytes" => "(sizeof(#{kernel_builder.result_type.to_c_type}) * #{num_threads})",
                "type" => kernel_builder.result_type.to_c_type})
        end

        previously_cached_results.each do |result_id, type|
            result += " #{type.to_c_type} *prev_#{result_id} = (#{type.to_c_type} *) " +
                "#{Constants::ENV_HOST_IDENTIFIER}->prev_#{result_id};\n"
        end

        # Allocate device memory for cached results. IDs are interpolated so that
        # non-String IDs work here as they do for previously cached results.
        cached_results.each do |result_id, type|
            result += Translator.read_file(file_name: "allocate_device_memory.cpp", replacements: {
                "name" => "#{Constants::RESULT_IDENTIFIER}#{result_id}",
                "bytes" => "(#{type.c_size} * #{num_threads})",
                "type" => type.to_c_type})
        end

        # Build arguments
        a_env = Constants::ENV_DEVICE_IDENTIFIER
        a_result = kernel_result_var_name

        previous_kernel_args = []
        kernel_builder.previous_kernel_input.each do |var|
            previous_kernel_args.push(var.name.to_s)
        end

        a_cached_results = cached_results.map do |result_id, _type|
            "#{Constants::RESULT_IDENTIFIER}#{result_id}"
        end

        if reuse_memory
            # The reused memory takes the place of the first previous kernel argument
            previous_kernel_args[0] = a_result
        end

        arguments = ([a_env, num_threads, a_result] + a_cached_results + previous_kernel_args + additional_arguments).join(", ")

        # Launch kernel
        result += Translator.read_file(file_name: "launch_kernel.cpp", replacements: {
            "kernel_name" => kernel_builder.kernel_name,
            "arguments" => arguments,
            "grid_dim" => grid_dim,
            "block_dim" => block_dim})

        # ---- DEBUG ONLY: Free input after computation so that we can process larger
        # data sets in benchmarks without running out of memory
        # TODO: Implement analysis and do this automatically
        if KernelLauncher.debug_free_previous_input_immediately == true
            kernel_builder.previous_kernel_input.each do |var|
                result += Translator.read_file(file_name: "free_device_memory.cpp", replacements: {
                    "name" => var.name.to_s})
            end
        end
        # ---- END DEBUG ONLY

        # Remember the pointers to cached results in the host environment
        cached_results.each do |result_id, _type|
            result += " #{Constants::ENV_HOST_IDENTIFIER}->prev_#{result_id} = " +
                "#{Constants::RESULT_IDENTIFIER}#{result_id};\n"
        end

        return result
    end

    # Builds the code that frees the device memory of the kernel result. Generates
    # no code if the debug flag for freeing input immediately is set.
    def build_device_memory_free
        Log.info("Building kernel post-launch CUDA free")

        assert_ready_to_build

        if KernelLauncher.debug_free_previous_input_immediately == true
            Log.warn("Debug flag set... Freeing input memory immediately and some memory not at all!")
            return ""
        end

        return Translator.read_file(file_name: "free_device_memory.cpp", replacements: {
            "name" => kernel_result_var_name})
    end

    # Same as above, but also removes item from the list of allocated memory chunks.
    def build_device_memory_free_in_host_section
        Log.info("Building kernel post-launch CUDA free (host section)")

        assert_ready_to_build

        return Translator.read_file(file_name: "host_section_free_device_memory.cpp", replacements: {
            "name" => kernel_result_var_name})
    end
end
|
253
|
+
end
|
254
|
+
end
|
255
|
+
end
|
256
|
+
|
257
|
+
require_relative "../kernel_builder"
|
258
|
+
require_relative "for_loop_kernel_launcher"
|
259
|
+
require_relative "while_loop_kernel_launcher"
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module Ikra
|
2
|
+
module Translator
|
3
|
+
class CommandTranslator
|
4
|
+
# Kernel launcher that wraps the launch code generated by [KernelLauncher] in a
# `while` loop in the generated source, so that the kernel launch is repeated as
# long as `condition` holds.
class WhileLoopKernelLauncher < KernelLauncher
    def initialize(
            kernel_builder:,
            condition:,
            before_loop: "",
            post_iteration: "")

        super(kernel_builder)
        @condition = condition
        @before_loop = before_loop
        @post_iteration = post_iteration
    end

    # Source code expression that is used as the loop condition.
    attr_reader :condition

    # Source code that is emitted once, in front of the loop.
    attr_reader :before_loop

    # Source code that is emitted at the end of the loop body, after the kernel
    # launch.
    attr_reader :post_iteration

    # Builds the launch code: `before_loop`, followed by a `while` loop whose body
    # is the launch code generated by the superclass and `post_iteration`.
    def build_kernel_launcher
        Log.info("Building while-loop kernel launcher")

        assert_ready_to_build

        result = before_loop + "\n"
        result = result + "while (#{condition}) {\n"
        result = result + super
        result = result + "\n" + post_iteration
        result = result + "\n}\n"

        return result
    end
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
@@ -3,37 +3,42 @@ require_relative "../ast/visitor"
|
|
3
3
|
|
4
4
|
module Ikra
|
5
5
|
module Translator
|
6
|
+
# Visitor that replaces implicit returns with explicit ones
|
6
7
|
class LastStatementReturnsVisitor < AST::Visitor
|
7
8
|
def visit_root_node(node)
|
8
|
-
node.
|
9
|
+
node.single_child.accept(self)
|
9
10
|
end
|
10
11
|
|
11
12
|
def visit_lvar_read_node(node)
|
12
|
-
|
13
|
+
process_node(node)
|
13
14
|
end
|
14
15
|
|
15
16
|
def visit_lvar_write_node(node)
|
16
|
-
|
17
|
+
process_node(node)
|
17
18
|
end
|
18
19
|
|
19
20
|
def visit_int_node(node)
|
20
|
-
|
21
|
+
process_node(node)
|
21
22
|
end
|
22
23
|
|
23
24
|
def visit_float_node(node)
|
24
|
-
|
25
|
+
process_node(node)
|
25
26
|
end
|
26
27
|
|
27
28
|
def visit_bool_node(node)
|
28
|
-
|
29
|
+
process_node(node)
|
30
|
+
end
|
31
|
+
|
32
|
+
def visit_nil_node(node)
|
33
|
+
process_node(node)
|
29
34
|
end
|
30
35
|
|
31
36
|
def visit_for_node(node)
|
32
|
-
raise "Cannot handle for loop as return value"
|
37
|
+
raise NotImplementedError.new("Cannot handle for loop as return value")
|
33
38
|
end
|
34
39
|
|
35
40
|
def visit_break_node(node)
|
36
|
-
raise "Break must not be a return value"
|
41
|
+
raise AssertionError.new("Break must not be a return value")
|
37
42
|
end
|
38
43
|
|
39
44
|
def visit_if_node(node)
|
@@ -46,11 +51,15 @@ module Ikra
|
|
46
51
|
end
|
47
52
|
|
48
53
|
def visit_send_node(node)
|
49
|
-
|
54
|
+
process_node(node)
|
50
55
|
end
|
51
56
|
|
52
57
|
def visit_return_node(node)
|
53
|
-
|
58
|
+
# Do nothing
|
59
|
+
end
|
60
|
+
|
61
|
+
def process_node(node)
|
62
|
+
node.parent.replace_child(node, AST::ReturnNode.new(value: node))
|
54
63
|
end
|
55
64
|
end
|
56
65
|
end
|
@@ -0,0 +1,197 @@
|
|
1
|
+
require "tempfile"
|
2
|
+
require "set"
|
3
|
+
|
4
|
+
module Ikra
|
5
|
+
module Translator
|
6
|
+
class CommandTranslator
|
7
|
+
|
8
|
+
# Builds the entire CUDA program. A CUDA program may consist of multiple kernels, but
# has at least one kernel. The generated code performs the following steps:
#
# 1. Insert header of CUDA file.
# 2. For every kernel: Build all methods, blocks, and kernels.
# 3. Build the program entry point (including kernel launchers).
class ProgramBuilder
    attr_reader :environment_builder
    attr_reader :kernel_launchers
    attr_reader :kernels
    attr_reader :root_command

    # An array of structs definitions ([Types::StructType] instances) that should be
    # generated for this program.
    attr_reader :structs

    # An array of array command structs.
    attr_reader :array_command_structs

    def initialize(environment_builder:, root_command:)
        @kernel_launchers = []
        @kernels = Set.new([])
        @environment_builder = environment_builder
        @root_command = root_command

        # The collection of structs is a [Set]. Struct types are unique, i.e., there
        # are never two equal struct types with different object identity.
        @structs = Set.new
        @array_command_structs = Set.new
    end

    # Registers array command structs. Adding the same struct twice has no effect
    # because the structs are stored in a [Set].
    def add_array_command_struct(*structs)
        structs.each do |struct|
            array_command_structs.add(struct)
        end
    end

    # Appends a kernel launcher. Launch code is generated in insertion order.
    def add_kernel_launcher(launcher)
        @kernel_launchers.push(launcher)
    end

    # Generates the source code for the CUDA program, compiles it with nvcc and
    # executes the program.
    def execute
        source = build_program

        launcher = Launcher.new(
            source: source,
            environment_builder: environment_builder,
            result_type: result_type,
            root_command: root_command)

        launcher.compile
        return launcher.execute
    end

    # Build kernel invocations
    def build_kernel_launchers
        return kernel_launchers.map do |launcher|
            launcher.build_kernel_launcher
        end.join("")
    end

    protected

    # Raises an AssertionError if no kernel launcher was added.
    def assert_ready_to_build
        if kernel_launchers.size == 0
            raise AssertionError.new(
                "Not ready to build (ProgramBuilder): No kernel launcher defined")
        end
    end

    # Build header of CUDA source code file
    def build_header
        return Translator.read_file(file_name: "header.cpp")
    end

    # Build environment struct definition
    def build_environment_struct
        return environment_builder.build_environment_struct
    end

    # Generate all struct types (except for array command struct types).
    def build_struct_types
        return structs.map do |struct_type|
            struct_type.generate_definition
        end.join("\n") + "\n"
    end

    # Generate all array command struct types, one per line.
    def build_array_command_struct_types
        return array_command_structs.to_a.join("\n") + "\n"
    end

    # All kernel builders of all kernel launchers, in launcher order.
    def all_kernel_builders
        return kernel_launchers.map do |launcher|
            launcher.kernel_builders
        end.flatten
    end

    # Build methods, blocks and kernels
    def build_kernels
        result = ""

        all_kernel_builders.each do |builder|
            # Skip kernels that were already built before. `add?` returns nil if
            # the builder is already contained in the set.
            next unless kernels.add?(builder)

            result = result + builder.build_methods
            result = result + builder.build_blocks
            result = result + builder.build_kernel
        end

        return result
    end

    # Builds the expression that copies the result of the last kernel launch from
    # device memory to host memory.
    def host_result_expression
        # Read some fields from last kernel launch configuration
        result_device_ptr = kernel_launchers.last.kernel_result_var_name
        result_c_type = kernel_launchers.last.result_type.to_c_type
        result_size = root_command.size

        if result_device_ptr == nil
            raise AssertionError.new(
                "Result variable name of final kernel launcher not set")
        end

        # Build result values: `variable_size_array_t` struct. This struct contains a
        # pointer to the result array and stores the size of the result.
        result_device_variable_array_t = "variable_size_array_t((void *) #{result_device_ptr}, #{result_size})"

        return Translator.read_file(file_name: "memcpy_device_to_host_expr.cpp", replacements: {
            "type" => result_c_type,
            "device_array" => result_device_variable_array_t})
    end

    # Returns the result type of this program. The result type must always be a
    # union type that includes a [Types::LocationAwareVariableSizeArrayType] object,
    # because this way we can support return types where the inner type of an array
    # is unknown at compile time.
    def result_type
        return Types::LocationAwareVariableSizeArrayType.new(
            kernel_launchers.last.result_type,
            location: :host).to_union_type
    end

    # Free device memory
    def build_memory_free
        result = ""

        kernel_launchers.each do |launcher|
            # Launchers that reuse memory did not allocate their result memory
            next if launcher.reuse_memory?

            result = result + launcher.build_device_memory_free
        end

        return result
    end

    # Build the struct type for `result_t`.
    def build_header_structs
        return Translator.read_file(file_name: "header_structs.cpp",
            replacements: {"result_type" => result_type.to_c_type})
    end

    # Builds the CUDA program. Returns the source code string.
    def build_program
        assert_ready_to_build

        result = build_header + build_struct_types + build_header_structs +
            build_array_command_struct_types + build_environment_struct +
            build_kernels

        # Build program entry point
        return result + Translator.read_file(file_name: "entry_point.cpp", replacements: {
            "prepare_environment" => environment_builder.build_environment_variable,
            "launch_all_kernels" => build_kernel_launchers,
            "free_device_memory" => build_memory_free,
            "host_env_var_name" => Constants::ENV_HOST_IDENTIFIER,
            "host_result_array" => host_result_expression})
    end
end
|
193
|
+
end
|
194
|
+
end
|
195
|
+
end
|
196
|
+
|
197
|
+
require_relative "program_launcher"
|