ikra 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/ast/builder.rb +225 -77
- data/lib/ast/host_section_builder.rb +38 -0
- data/lib/ast/interpreter.rb +67 -0
- data/lib/ast/lexical_variables_enumerator.rb +3 -2
- data/lib/ast/nodes.rb +521 -31
- data/lib/ast/printer.rb +116 -18
- data/lib/ast/ssa_generator.rb +192 -0
- data/lib/ast/visitor.rb +235 -21
- data/lib/config/configuration.rb +28 -3
- data/lib/config/os_configuration.rb +62 -9
- data/lib/cpu/cpu_implementation.rb +39 -0
- data/lib/ikra.rb +13 -3
- data/lib/resources/cuda/allocate_device_memory.cpp +5 -0
- data/lib/resources/cuda/allocate_host_memory.cpp +1 -0
- data/lib/resources/cuda/allocate_memcpy_environment_to_device.cpp +11 -0
- data/lib/resources/cuda/ast/assignment.cpp +1 -0
- data/lib/resources/cuda/block_function_head.cpp +7 -1
- data/lib/resources/cuda/entry_point.cpp +47 -0
- data/lib/resources/cuda/env_builder_copy_array.cpp +8 -2
- data/lib/resources/cuda/free_device_memory.cpp +3 -0
- data/lib/resources/cuda/free_memory_for_command.cpp +24 -0
- data/lib/resources/cuda/header.cpp +23 -9
- data/lib/resources/cuda/header_structs.cpp +92 -0
- data/lib/resources/cuda/host_section_block_function_head.cpp +12 -0
- data/lib/resources/cuda/host_section_entry_point.cpp +55 -0
- data/lib/resources/cuda/host_section_free_device_memory.cpp +18 -0
- data/lib/resources/cuda/host_section_launch_parallel_section.cpp +14 -0
- data/lib/resources/cuda/host_section_malloc_memcpy_device_to_host.cpp +10 -0
- data/lib/resources/cuda/kernel.cpp +9 -2
- data/lib/resources/cuda/launch_kernel.cpp +5 -0
- data/lib/resources/cuda/memcpy_device_to_host.cpp +3 -0
- data/lib/resources/cuda/memcpy_device_to_host_expr.cpp +10 -0
- data/lib/resources/cuda/reduce_body.cpp +88 -0
- data/lib/resources/cuda/stencil_array_reconstruction.cpp +2 -0
- data/lib/resources/cuda/stencil_body.cpp +16 -0
- data/lib/resources/cuda/struct_definition.cpp +4 -0
- data/lib/ruby_core/array.rb +34 -0
- data/lib/ruby_core/array_command.rb +313 -0
- data/lib/ruby_core/core.rb +103 -0
- data/lib/ruby_core/interpreter.rb +16 -0
- data/lib/ruby_core/math.rb +32 -0
- data/lib/ruby_core/ruby_integration.rb +256 -0
- data/lib/symbolic/host_section.rb +115 -0
- data/lib/symbolic/input.rb +87 -0
- data/lib/symbolic/input_visitor.rb +68 -0
- data/lib/symbolic/symbolic.rb +793 -117
- data/lib/symbolic/visitor.rb +70 -8
- data/lib/translator/array_command_struct_builder.rb +163 -0
- data/lib/translator/ast_translator.rb +572 -0
- data/lib/translator/block_translator.rb +104 -48
- data/lib/translator/commands/array_combine_command.rb +41 -0
- data/lib/translator/commands/array_identity_command.rb +28 -0
- data/lib/translator/commands/array_index_command.rb +52 -0
- data/lib/translator/commands/array_reduce_command.rb +135 -0
- data/lib/translator/commands/array_stencil_command.rb +129 -0
- data/lib/translator/commands/array_zip_command.rb +30 -0
- data/lib/translator/commands/command_translator.rb +264 -0
- data/lib/translator/cuda_errors.rb +32 -0
- data/lib/translator/environment_builder.rb +263 -0
- data/lib/translator/host_section/array_host_section_command.rb +150 -0
- data/lib/translator/host_section/array_in_host_section_command.rb +41 -0
- data/lib/translator/host_section/ast_translator.rb +14 -0
- data/lib/translator/host_section/parallel_section_invocation_visitor.rb +20 -0
- data/lib/translator/host_section/program_builder.rb +89 -0
- data/lib/translator/input_translator.rb +226 -0
- data/lib/translator/kernel_builder.rb +137 -0
- data/lib/translator/kernel_launcher/for_loop_kernel_launcher.rb +40 -0
- data/lib/translator/kernel_launcher/kernel_launcher.rb +259 -0
- data/lib/translator/kernel_launcher/while_loop_kernel_launcher.rb +38 -0
- data/lib/translator/last_returns_visitor.rb +19 -10
- data/lib/translator/program_builder.rb +197 -0
- data/lib/translator/program_launcher.rb +273 -0
- data/lib/translator/struct_type.rb +55 -0
- data/lib/translator/translator.rb +34 -11
- data/lib/translator/variable_classifier_visitor.rb +56 -0
- data/lib/types/inference/ast_inference.rb +586 -0
- data/lib/types/inference/clear_types_visitor.rb +11 -0
- data/lib/types/inference/command_inference.rb +101 -0
- data/lib/types/inference/input_inference.rb +62 -0
- data/lib/types/{object_tracer.rb → inference/object_tracer.rb} +5 -6
- data/lib/types/inference/ruby_extension.rb +35 -0
- data/lib/types/inference/symbol_table.rb +131 -0
- data/lib/types/types.rb +14 -0
- data/lib/types/types/array_command_type.rb +123 -0
- data/lib/types/types/array_type.rb +137 -0
- data/lib/types/{class_type.rb → types/class_type.rb} +42 -18
- data/lib/types/{primitive_type.rb → types/primitive_type.rb} +20 -7
- data/lib/types/types/ruby_type.rb +88 -0
- data/lib/types/types/struct_type.rb +179 -0
- data/lib/types/types/union_type.rb +239 -0
- metadata +160 -18
- data/lib/ast/method_definition.rb +0 -37
- data/lib/ast/translator.rb +0 -264
- data/lib/resources/cuda/kernel_launcher.cpp +0 -28
- data/lib/scope.rb +0 -166
- data/lib/translator/command_translator.rb +0 -421
- data/lib/translator/local_variables_enumerator.rb +0 -35
- data/lib/translator/method_translator.rb +0 -24
- data/lib/types/array_type.rb +0 -51
- data/lib/types/ruby_extension.rb +0 -67
- data/lib/types/ruby_type.rb +0 -45
- data/lib/types/type_inference.rb +0 -382
- data/lib/types/union_type.rb +0 -155
@@ -0,0 +1,40 @@
|
|
1
|
+
module Ikra
|
2
|
+
module Translator
|
3
|
+
class CommandTranslator
|
4
|
+
# Kernel launcher that wraps the launch code produced by its superclass in a
# C++ `for` loop. The emitted source has the following shape:
#
#     <before_loop>
#     for (int <var_name> = <from_expr>; <var_name> < <to_expr>; <var_name> ++)
#     {<launch code generated by the superclass>
#     }
class ForLoopKernelLauncher < KernelLauncher
    # Source code fragments (strings) that are spliced into the generated loop.
    attr_reader :from_expr, :to_expr, :var_name, :before_loop

    # kernel_builder: forwarded unchanged to the superclass constructor.
    # from_expr:      initial value of the loop variable (defaults to "0").
    # to_expr:        exclusive upper bound of the loop variable.
    # var_name:       name of the loop variable (defaults to "i").
    # before_loop:    source code emitted before the loop (defaults to "").
    def initialize(
            kernel_builder:,
            from_expr: "0",
            to_expr:,
            var_name: "i",
            before_loop: "")

        super(kernel_builder)

        @before_loop = before_loop
        @var_name = var_name
        @from_expr = from_expr
        @to_expr = to_expr
    end

    # Returns the source code string of the loop. The loop body is whatever the
    # superclass implementation of this method generates.
    def build_kernel_launcher
        Log.info("Building for-loop kernel launcher")

        assert_ready_to_build

        # Operands are evaluated left to right, so the superclass launch code
        # is generated after the loop head has been assembled.
        return before_loop + "\n" +
            "for (int #{var_name} = #{from_expr}; #{var_name} < #{to_expr}; #{var_name} ++)\n{" +
            super +
            "\n}\n"
    end
end
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
@@ -0,0 +1,259 @@
|
|
1
|
+
module Ikra
|
2
|
+
module Translator
|
3
|
+
class CommandTranslator
|
4
|
+
|
5
|
+
# Builds the launch of the kernel. This class is responsible for generating the
# invocation of the kernel.
#
# For example:
# kernel<<<..., ...>>>(env, result, d_a, ...);
class KernelLauncher
    class << self
        # Debug flag only: Frees all input after launching kernel. This causes an
        # error if data is used twice or kept (using the `keep` flag)
        attr_accessor :debug_free_previous_input_immediately
    end

    attr_accessor :kernel_builder

    # Additional parameters that this kernel should accept (to access the result
    # of previous kernels)
    attr_accessor :previous_kernel_input

    # Additional arguments that are appended to the kernel invocation. Entries are
    # either source code strings or objects responding to `call`, which are
    # replaced by strings in `prepare_additional_args_for_launch`.
    attr_accessor :additional_arguments

    # Number of threads (elements to be processed)
    attr_accessor :num_threads

    # Block/grid dimensions (should be 1D)
    attr_accessor :grid_dim
    attr_accessor :block_dim

    # Whether the launch allocates new memory beforehand or uses previous memory
    attr_accessor :reuse_memory

    # Pointer to the resulting array (device memory)
    attr_reader :kernel_result_var_name

    # IDs and types of commands whose results are kept on the GPU
    attr_accessor :cached_results

    # IDs and types of commands that were previously computed and shall now be
    # used in this kernel as input
    attr_reader :previously_cached_results

    def initialize(kernel_builder)
        @kernel_builder = kernel_builder
        @additional_arguments = []
        @previous_kernel_input = []
        @reuse_memory = false
        @kernel_result_var_name = "_kernel_result_" + CommandTranslator.next_unique_id.to_s
        @cached_results = {}
        @previously_cached_results = {}
    end

    # Some of the values stored in `@additional_arguments` might be blocks, because
    # not all information was known when adding something to that list. This method
    # replaces those blocks (evaluates them) with actual strings, based on the command
    # that is being launched.
    def prepare_additional_args_for_launch(command)
        @additional_arguments = @additional_arguments.map do |arg|
            arg.is_a?(String) ? arg : arg.call(command)
        end
    end

    # The program builder accesses kernel builders via kernel launchers through
    # this method, because some specialized launchers might have multiple kernel
    # builders.
    def kernel_builders
        return [kernel_builder]
    end

    # Adds command whose result will be kept on GPU
    def add_cached_result(result_id, type)
        @cached_results[result_id] = type
    end

    # Adds a previously computed result which will be used in this launch as input
    def use_cached_result(result_id, type)
        @previously_cached_results[result_id] = type
    end

    # Marks this launcher as reusing existing memory: no result memory is allocated
    # in `build_kernel_launcher` and `parameter_name` becomes the result variable.
    def reuse_memory!(parameter_name)
        @reuse_memory = true
        @kernel_result_var_name = parameter_name
    end

    def reuse_memory?
        return @reuse_memory
    end

    # Delegates to the kernel builder.
    def add_previous_kernel_parameter(parameter)
        kernel_builder.add_previous_kernel_parameter(parameter)
    end

    # Add additional arguments to the kernel function that might be needed for some computations
    def add_additional_arguments(*arguments)
        @additional_arguments.push(*arguments)
    end

    # The result type of this kernel launcher. Same as the result type of its kernel
    # builder.
    def result_type
        return kernel_builder.result_type
    end

    # The size of the result array is the number of threads.
    def result_size
        return num_threads
    end

    # Configures grid size and block size. Also sets number of threads.
    #
    # size:       either an Integer (dimensions are precomputed here) or a String
    #             containing a source code expression (dimensions are emitted as
    #             source code expressions).
    # block_size: maximum block size; `nil` is treated like the default of 256.
    #
    # Raises AssertionError if `size` is neither an Integer nor a String.
    def configure_grid(size, block_size: 256)
        block_size = 256 if block_size.nil?

        # `Integer` instead of `Fixnum`: the `Fixnum` constant was deprecated in
        # Ruby 2.4 and removed in Ruby 3.2, where referencing it raises NameError.
        case size
        when Integer
            # Precompute constants
            @grid_dim = [size.fdiv(block_size).ceil, 1].max.to_s
            @block_dim = (size >= block_size ? block_size : size).to_s
        when String
            # Source code string determines the size
            @grid_dim = "max((int) ceil(((float) #{size}) / #{block_size}), 1)"
            @block_dim = "(#{size} >= #{block_size} ? #{block_size} : #{size})"
        else
            raise AssertionError.new("Integer or String expected")
        end

        @num_threads = size
    end

    # Raises AssertionError unless number of threads, grid dimension and block
    # dimension have been set (see `configure_grid`).
    def assert_ready_to_build
        [:num_threads, :grid_dim, :block_dim].each do |selector|
            if send(selector).nil?
                raise AssertionError.new(
                    "Not ready to build (KernelLauncher): #{selector} is not set")
            end
        end
    end

    # Build the code that launches this kernel. The returned source code string
    # is assembled from the following parts, in this order:
    #
    # 1. Unless memory is reused: device memory allocation for the kernel result
    #    (template `allocate_device_memory.cpp`).
    # 2. One pointer declaration per previously cached result, read from the host
    #    environment.
    # 3. Device memory allocation for every result that should be cached.
    # 4. The kernel launch itself (template `launch_kernel.cpp`).
    # 5. Debug only: freeing of the kernel builder's previous kernel input.
    # 6. One assignment per cached result that stores its pointer in the host
    #    environment.
    def build_kernel_launcher
        Log.info("Building kernel launcher")

        assert_ready_to_build

        result = ""

        unless reuse_memory
            # Allocate device memory for kernel result
            result_c_type = kernel_builder.result_type.to_c_type
            result = result + Translator.read_file(file_name: "allocate_device_memory.cpp", replacements: {
                "name" => kernel_result_var_name,
                "bytes" => "(sizeof(#{result_c_type}) * #{num_threads})",
                "type" => result_c_type})
        end

        previously_cached_results.each do |result_id, type|
            result = result + " #{type.to_c_type} *prev_#{result_id} = (#{type.to_c_type} *) #{Constants::ENV_HOST_IDENTIFIER}->prev_#{result_id};\n"
        end

        # Allocate device memory for cached results
        cached_results.each do |result_id, type|
            result = result + Translator.read_file(file_name: "allocate_device_memory.cpp", replacements: {
                "name" => cached_result_var_name(result_id),
                "bytes" => "(#{type.c_size} * #{num_threads})",
                "type" => type.to_c_type})
        end

        # Build arguments
        previous_kernel_args = kernel_builder.previous_kernel_input.map do |var|
            var.name.to_s
        end

        if reuse_memory
            # The reused memory takes the place of the first previous kernel argument
            previous_kernel_args[0] = kernel_result_var_name
        end

        cached_result_args = cached_results.keys.map do |result_id|
            cached_result_var_name(result_id)
        end

        arguments = ([Constants::ENV_DEVICE_IDENTIFIER, num_threads, kernel_result_var_name] +
            cached_result_args + previous_kernel_args + additional_arguments).join(", ")

        # Launch kernel
        result = result + Translator.read_file(file_name: "launch_kernel.cpp", replacements: {
            "kernel_name" => kernel_builder.kernel_name,
            "arguments" => arguments,
            "grid_dim" => grid_dim,
            "block_dim" => block_dim})

        # ---- DEBUG ONLY: Free input after computation so that we can process larger
        # data sets in benchmarks without running out of memory
        # TODO: Implement analysis and do this automatically
        if KernelLauncher.debug_free_previous_input_immediately == true
            kernel_builder.previous_kernel_input.each do |var|
                result = result + Translator.read_file(file_name: "free_device_memory.cpp", replacements: {
                    "name" => var.name.to_s})
            end
        end
        # ---- END DEBUG ONLY

        cached_results.each_key do |result_id|
            result = result + " #{Constants::ENV_HOST_IDENTIFIER}->prev_#{result_id} = #{cached_result_var_name(result_id)};\n"
        end

        return result
    end

    # Builds the code that frees the device memory of the kernel result. Returns
    # an empty string if the debug flag `debug_free_previous_input_immediately`
    # is set.
    def build_device_memory_free
        Log.info("Building kernel post-launch CUDA free")

        assert_ready_to_build

        if KernelLauncher.debug_free_previous_input_immediately == true
            Log.warn("Debug flag set... Freeing input memory immediately and some memory not at all!")
            return ""
        end

        return Translator.read_file(file_name: "free_device_memory.cpp", replacements: {
            "name" => kernel_result_var_name})
    end

    # Same as above, but also removes item from the list of allocated memory chunks.
    def build_device_memory_free_in_host_section
        Log.info("Building kernel post-launch CUDA free (host section)")

        assert_ready_to_build

        return Translator.read_file(file_name: "host_section_free_device_memory.cpp", replacements: {
            "name" => kernel_result_var_name})
    end

    private

    # Name of the device variable that holds the cached result with the given ID.
    # Uses interpolation so that non-String IDs (e.g., integers) are accepted.
    def cached_result_var_name(result_id)
        return "#{Constants::RESULT_IDENTIFIER}#{result_id}"
    end
end
|
253
|
+
end
|
254
|
+
end
|
255
|
+
end
|
256
|
+
|
257
|
+
require_relative "../kernel_builder"
|
258
|
+
require_relative "for_loop_kernel_launcher"
|
259
|
+
require_relative "while_loop_kernel_launcher"
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module Ikra
|
2
|
+
module Translator
|
3
|
+
class CommandTranslator
|
4
|
+
# Kernel launcher that wraps the launch code produced by its superclass in a
# C++ `while` loop. The emitted source has the following shape:
#
#     <before_loop>
#     while (<condition>) {
#     <launch code generated by the superclass>
#     <post_iteration>
#     }
class WhileLoopKernelLauncher < KernelLauncher
    # kernel_builder: forwarded unchanged to the superclass constructor.
    # condition:      source code of the loop condition.
    # before_loop:    source code emitted before the loop (defaults to "").
    # post_iteration: source code emitted at the end of the loop body, after the
    #                 launch code (defaults to "").
    def initialize(
            kernel_builder:,
            condition:,
            before_loop: "",
            post_iteration: "")

        super(kernel_builder)
        @condition = condition
        @before_loop = before_loop
        @post_iteration = post_iteration
    end

    attr_reader :condition
    attr_reader :before_loop
    attr_reader :post_iteration

    # Returns the source code string of the loop. The launch code inside the loop
    # body is whatever the superclass implementation of this method generates.
    def build_kernel_launcher
        # The message previously said "for-loop", copied from ForLoopKernelLauncher.
        Log.info("Building while-loop kernel launcher")

        assert_ready_to_build

        return before_loop + "\n" +
            "while (#{condition}) {\n" +
            super +
            "\n" + post_iteration +
            "\n}\n"
    end
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
@@ -3,37 +3,42 @@ require_relative "../ast/visitor"
|
|
3
3
|
|
4
4
|
module Ikra
|
5
5
|
module Translator
|
6
|
+
# Visitor that replaces implicit returns with explicit ones
|
6
7
|
class LastStatementReturnsVisitor < AST::Visitor
|
7
8
|
def visit_root_node(node)
|
8
|
-
node.
|
9
|
+
node.single_child.accept(self)
|
9
10
|
end
|
10
11
|
|
11
12
|
def visit_lvar_read_node(node)
|
12
|
-
|
13
|
+
process_node(node)
|
13
14
|
end
|
14
15
|
|
15
16
|
def visit_lvar_write_node(node)
|
16
|
-
|
17
|
+
process_node(node)
|
17
18
|
end
|
18
19
|
|
19
20
|
def visit_int_node(node)
|
20
|
-
|
21
|
+
process_node(node)
|
21
22
|
end
|
22
23
|
|
23
24
|
def visit_float_node(node)
|
24
|
-
|
25
|
+
process_node(node)
|
25
26
|
end
|
26
27
|
|
27
28
|
def visit_bool_node(node)
|
28
|
-
|
29
|
+
process_node(node)
|
30
|
+
end
|
31
|
+
|
32
|
+
def visit_nil_node(node)
|
33
|
+
process_node(node)
|
29
34
|
end
|
30
35
|
|
31
36
|
def visit_for_node(node)
|
32
|
-
raise "Cannot handle for loop as return value"
|
37
|
+
raise NotImplementedError.new("Cannot handle for loop as return value")
|
33
38
|
end
|
34
39
|
|
35
40
|
def visit_break_node(node)
|
36
|
-
raise "Break must not be a return value"
|
41
|
+
raise AssertionError.new("Break must not be a return value")
|
37
42
|
end
|
38
43
|
|
39
44
|
def visit_if_node(node)
|
@@ -46,11 +51,15 @@ module Ikra
|
|
46
51
|
end
|
47
52
|
|
48
53
|
def visit_send_node(node)
|
49
|
-
|
54
|
+
process_node(node)
|
50
55
|
end
|
51
56
|
|
52
57
|
def visit_return_node(node)
|
53
|
-
|
58
|
+
# Do nothing
|
59
|
+
end
|
60
|
+
|
61
|
+
def process_node(node)
|
62
|
+
node.parent.replace_child(node, AST::ReturnNode.new(value: node))
|
54
63
|
end
|
55
64
|
end
|
56
65
|
end
|
@@ -0,0 +1,197 @@
|
|
1
|
+
require "tempfile"
|
2
|
+
require "set"
|
3
|
+
|
4
|
+
module Ikra
|
5
|
+
module Translator
|
6
|
+
class CommandTranslator
|
7
|
+
|
8
|
+
# Builds the entire CUDA program. A CUDA program may consist of multiple kernels, but
# has at least one kernel. The generated code performs the following steps:
#
# 1. Insert header of CUDA file.
# 2. For every kernel: Build all methods, blocks, and kernels.
# 3. Build the program entry point (including kernel launchers).
class ProgramBuilder
    attr_reader :environment_builder
    attr_reader :kernel_launchers

    # Set of kernel builders whose code was already generated by `build_kernels`.
    attr_reader :kernels
    attr_reader :root_command

    # A set of struct definitions ([Types::StructType] instances) that should be
    # generated for this program.
    attr_reader :structs

    # A set of array command structs.
    attr_reader :array_command_structs

    def initialize(environment_builder:, root_command:)
        @kernel_launchers = []
        @kernels = Set.new
        @environment_builder = environment_builder
        @root_command = root_command

        # The collection of structs is a [Set]. Struct types are unique, i.e., there
        # are never two equal struct types with different object identity.
        @structs = Set.new
        @array_command_structs = Set.new
    end

    def add_array_command_struct(*structs)
        structs.each do |struct|
            array_command_structs.add(struct)
        end
    end

    def add_kernel_launcher(launcher)
        @kernel_launchers.push(launcher)
    end

    # Generates the source code for the CUDA program, then hands it to a
    # `Launcher`, which compiles and executes the program. Returns the value
    # returned by `Launcher#execute`.
    def execute
        source = build_program

        launcher = Launcher.new(
            source: source,
            environment_builder: environment_builder,
            result_type: result_type,
            root_command: root_command)

        launcher.compile
        return launcher.execute
    end

    # Build kernel invocations
    def build_kernel_launchers
        return kernel_launchers.map do |launcher|
            launcher.build_kernel_launcher
        end.join("")
    end

    protected

    # Raises AssertionError if no kernel launcher was added.
    def assert_ready_to_build
        if kernel_launchers.empty?
            raise AssertionError.new(
                "Not ready to build (ProgramBuilder): No kernel launcher defined")
        end
    end

    # Build header of CUDA source code file
    def build_header
        return Translator.read_file(file_name: "header.cpp")
    end

    # Build environment struct definition
    def build_environment_struct
        return environment_builder.build_environment_struct
    end

    # Generate all struct types (except for array command struct types).
    def build_struct_types
        return structs.map do |struct_type|
            struct_type.generate_definition
        end.join("\n") + "\n"
    end

    def build_array_command_struct_types
        return array_command_structs.to_a.join("\n") + "\n"
    end

    # All kernel builders of all kernel launchers, in launcher order.
    def all_kernel_builders
        return kernel_launchers.map do |launcher|
            launcher.kernel_builders
        end.flatten
    end

    # Build methods, blocks and kernels
    def build_kernels
        result = ""

        all_kernel_builders.each do |builder|
            # Check whether kernel was already built before. `Set#add?` returns
            # nil if the builder is already contained in the set.
            # NOTE(review): `kernels` lives as long as this object, so a second
            # call generates no code for builders seen in an earlier call --
            # confirm that this is intended before calling `build_program` twice.
            next unless kernels.add?(builder)

            result = result + builder.build_methods
            result = result + builder.build_blocks
            result = result + builder.build_kernel
        end

        return result
    end

    # Builds the expression that copies the result of the last kernel launcher
    # from the device to the host (template `memcpy_device_to_host_expr.cpp`).
    # The size of the result is taken from the root command.
    def host_result_expression
        # Read some fields from last kernel launch configuration
        last_launcher = kernel_launchers.last
        result_device_ptr = last_launcher.kernel_result_var_name
        result_c_type = last_launcher.result_type.to_c_type
        result_size = root_command.size

        if result_device_ptr.nil?
            raise AssertionError.new(
                "Result variable name of final kernel launcher not set")
        end

        # Build result values: `variable_size_array_t` struct. This struct contains a
        # pointer to the result array and stores the size of the result.
        result_device_variable_array_t = "variable_size_array_t((void *) #{result_device_ptr}, #{result_size})"

        return Translator.read_file(file_name: "memcpy_device_to_host_expr.cpp", replacements: {
            "type" => result_c_type,
            "device_array" => result_device_variable_array_t})
    end

    # Returns the result type of this program: the union type obtained from a
    # [Types::LocationAwareVariableSizeArrayType] (location: host) that wraps the
    # result type of the last kernel launcher. According to the original authors,
    # a union type is used because this way return types are supported where the
    # inner type of an array is unknown at compile time.
    def result_type
        return Types::LocationAwareVariableSizeArrayType.new(
            kernel_launchers.last.result_type,
            location: :host).to_union_type
    end

    # Free device memory of all kernel launchers that do not reuse memory
    def build_memory_free
        return kernel_launchers.reject do |launcher|
            launcher.reuse_memory?
        end.map do |launcher|
            launcher.build_device_memory_free
        end.join("")
    end

    # Build the struct type for `result_t`.
    def build_header_structs
        return Translator.read_file(file_name: "header_structs.cpp",
            replacements: {"result_type" => result_type.to_c_type})
    end

    # Builds the CUDA program. Returns the source code string.
    def build_program
        assert_ready_to_build

        result = build_header + build_struct_types + build_header_structs +
            build_array_command_struct_types + build_environment_struct +
            build_kernels

        # Build program entry point
        return result + Translator.read_file(file_name: "entry_point.cpp", replacements: {
            "prepare_environment" => environment_builder.build_environment_variable,
            "launch_all_kernels" => build_kernel_launchers,
            "free_device_memory" => build_memory_free,
            "host_env_var_name" => Constants::ENV_HOST_IDENTIFIER,
            "host_result_array" => host_result_expression})
    end
end
|
193
|
+
end
|
194
|
+
end
|
195
|
+
end
|
196
|
+
|
197
|
+
require_relative "program_launcher"
|