ikra 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/ast/builder.rb +225 -77
- data/lib/ast/host_section_builder.rb +38 -0
- data/lib/ast/interpreter.rb +67 -0
- data/lib/ast/lexical_variables_enumerator.rb +3 -2
- data/lib/ast/nodes.rb +521 -31
- data/lib/ast/printer.rb +116 -18
- data/lib/ast/ssa_generator.rb +192 -0
- data/lib/ast/visitor.rb +235 -21
- data/lib/config/configuration.rb +28 -3
- data/lib/config/os_configuration.rb +62 -9
- data/lib/cpu/cpu_implementation.rb +39 -0
- data/lib/ikra.rb +13 -3
- data/lib/resources/cuda/allocate_device_memory.cpp +5 -0
- data/lib/resources/cuda/allocate_host_memory.cpp +1 -0
- data/lib/resources/cuda/allocate_memcpy_environment_to_device.cpp +11 -0
- data/lib/resources/cuda/ast/assignment.cpp +1 -0
- data/lib/resources/cuda/block_function_head.cpp +7 -1
- data/lib/resources/cuda/entry_point.cpp +47 -0
- data/lib/resources/cuda/env_builder_copy_array.cpp +8 -2
- data/lib/resources/cuda/free_device_memory.cpp +3 -0
- data/lib/resources/cuda/free_memory_for_command.cpp +24 -0
- data/lib/resources/cuda/header.cpp +23 -9
- data/lib/resources/cuda/header_structs.cpp +92 -0
- data/lib/resources/cuda/host_section_block_function_head.cpp +12 -0
- data/lib/resources/cuda/host_section_entry_point.cpp +55 -0
- data/lib/resources/cuda/host_section_free_device_memory.cpp +18 -0
- data/lib/resources/cuda/host_section_launch_parallel_section.cpp +14 -0
- data/lib/resources/cuda/host_section_malloc_memcpy_device_to_host.cpp +10 -0
- data/lib/resources/cuda/kernel.cpp +9 -2
- data/lib/resources/cuda/launch_kernel.cpp +5 -0
- data/lib/resources/cuda/memcpy_device_to_host.cpp +3 -0
- data/lib/resources/cuda/memcpy_device_to_host_expr.cpp +10 -0
- data/lib/resources/cuda/reduce_body.cpp +88 -0
- data/lib/resources/cuda/stencil_array_reconstruction.cpp +2 -0
- data/lib/resources/cuda/stencil_body.cpp +16 -0
- data/lib/resources/cuda/struct_definition.cpp +4 -0
- data/lib/ruby_core/array.rb +34 -0
- data/lib/ruby_core/array_command.rb +313 -0
- data/lib/ruby_core/core.rb +103 -0
- data/lib/ruby_core/interpreter.rb +16 -0
- data/lib/ruby_core/math.rb +32 -0
- data/lib/ruby_core/ruby_integration.rb +256 -0
- data/lib/symbolic/host_section.rb +115 -0
- data/lib/symbolic/input.rb +87 -0
- data/lib/symbolic/input_visitor.rb +68 -0
- data/lib/symbolic/symbolic.rb +793 -117
- data/lib/symbolic/visitor.rb +70 -8
- data/lib/translator/array_command_struct_builder.rb +163 -0
- data/lib/translator/ast_translator.rb +572 -0
- data/lib/translator/block_translator.rb +104 -48
- data/lib/translator/commands/array_combine_command.rb +41 -0
- data/lib/translator/commands/array_identity_command.rb +28 -0
- data/lib/translator/commands/array_index_command.rb +52 -0
- data/lib/translator/commands/array_reduce_command.rb +135 -0
- data/lib/translator/commands/array_stencil_command.rb +129 -0
- data/lib/translator/commands/array_zip_command.rb +30 -0
- data/lib/translator/commands/command_translator.rb +264 -0
- data/lib/translator/cuda_errors.rb +32 -0
- data/lib/translator/environment_builder.rb +263 -0
- data/lib/translator/host_section/array_host_section_command.rb +150 -0
- data/lib/translator/host_section/array_in_host_section_command.rb +41 -0
- data/lib/translator/host_section/ast_translator.rb +14 -0
- data/lib/translator/host_section/parallel_section_invocation_visitor.rb +20 -0
- data/lib/translator/host_section/program_builder.rb +89 -0
- data/lib/translator/input_translator.rb +226 -0
- data/lib/translator/kernel_builder.rb +137 -0
- data/lib/translator/kernel_launcher/for_loop_kernel_launcher.rb +40 -0
- data/lib/translator/kernel_launcher/kernel_launcher.rb +259 -0
- data/lib/translator/kernel_launcher/while_loop_kernel_launcher.rb +38 -0
- data/lib/translator/last_returns_visitor.rb +19 -10
- data/lib/translator/program_builder.rb +197 -0
- data/lib/translator/program_launcher.rb +273 -0
- data/lib/translator/struct_type.rb +55 -0
- data/lib/translator/translator.rb +34 -11
- data/lib/translator/variable_classifier_visitor.rb +56 -0
- data/lib/types/inference/ast_inference.rb +586 -0
- data/lib/types/inference/clear_types_visitor.rb +11 -0
- data/lib/types/inference/command_inference.rb +101 -0
- data/lib/types/inference/input_inference.rb +62 -0
- data/lib/types/{object_tracer.rb → inference/object_tracer.rb} +5 -6
- data/lib/types/inference/ruby_extension.rb +35 -0
- data/lib/types/inference/symbol_table.rb +131 -0
- data/lib/types/types.rb +14 -0
- data/lib/types/types/array_command_type.rb +123 -0
- data/lib/types/types/array_type.rb +137 -0
- data/lib/types/{class_type.rb → types/class_type.rb} +42 -18
- data/lib/types/{primitive_type.rb → types/primitive_type.rb} +20 -7
- data/lib/types/types/ruby_type.rb +88 -0
- data/lib/types/types/struct_type.rb +179 -0
- data/lib/types/types/union_type.rb +239 -0
- metadata +160 -18
- data/lib/ast/method_definition.rb +0 -37
- data/lib/ast/translator.rb +0 -264
- data/lib/resources/cuda/kernel_launcher.cpp +0 -28
- data/lib/scope.rb +0 -166
- data/lib/translator/command_translator.rb +0 -421
- data/lib/translator/local_variables_enumerator.rb +0 -35
- data/lib/translator/method_translator.rb +0 -24
- data/lib/types/array_type.rb +0 -51
- data/lib/types/ruby_extension.rb +0 -67
- data/lib/types/ruby_type.rb +0 -45
- data/lib/types/type_inference.rb +0 -382
- data/lib/types/union_type.rb +0 -155
@@ -0,0 +1,12 @@
|
|
1
|
+
// Error check used inside the generated host-section function below.
//
// Assigns the status produced by `expr` to `result_var->last_error`. If that status is
// non-zero, the failure is printed, the device is reset and the enclosing function returns
// the `error_return_value` member of its result type.
//
// `expr` is expanded exactly once. It is usually a CUDA call with side effects (cudaFree,
// cudaMemcpy, ...), so printing the status must reuse the stored value instead of expanding
// -- and thereby executing -- the call a second time.
#undef checkErrorReturn
#define checkErrorReturn(result_var, expr) \
if (result_var->last_error = (expr)) \
{\
    cudaError_t error = cudaGetLastError();\
    printf("!!! Cuda Failure %s:%d (%i): '%s'\n", __FILE__, __LINE__, (int) result_var->last_error, cudaGetErrorString(error));\
    cudaDeviceReset();\
    return /*{result_type}*/::error_return_value;\
}

/*{result_type}*/ /*{name}*/(/*{parameters}*/)
/*{body}*/
|
@@ -0,0 +1,55 @@
|
|
1
|
+
// Error check used inside `launch_kernel`.
//
// Assigns the status produced by `expr` to `result_var->last_error`. If that status is
// non-zero, the failure is printed, the device is reset and the enclosing function returns
// `result_var` so that the caller can inspect `last_error`.
//
// `expr` is expanded exactly once. It is usually a CUDA call with side effects (cudaFree,
// cudaMemcpy, ...), so printing the status must reuse the stored value instead of expanding
// -- and thereby executing -- the call a second time.
#undef checkErrorReturn
#define checkErrorReturn(result_var, expr) \
if (result_var->last_error = (expr)) \
{\
    cudaError_t error = cudaGetLastError();\
    printf("!!! Cuda Failure %s:%d (%i): '%s'\n", __FILE__, __LINE__, (int) result_var->last_error, cudaGetErrorString(error));\
    cudaDeviceReset();\
    return result_var;\
}
|
10
|
+
|
11
|
+
// Entry point of a generated host-section program, exported with C linkage.
//
// Allocates the global `program_result`, selects device 0, runs the generated program text
// and finally releases every device pointer that is still registered in
// `program_result->device_allocations`. The returned result carries the result pointer and
// `last_error`; on any CUDA failure the function returns early with `last_error` set.
extern "C" EXPORT result_t *launch_kernel(environment_t */*{host_env_var_name}*/)
{
    // CUDA Initialization
    program_result = new result_t();
    program_result->device_allocations = new vector<void*>();

    timeStartMeasure();

    cudaError_t cudaStatus = cudaSetDevice(0);

    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?\n");
        program_result->last_error = -1;
        return program_result;
    }

    // NOTE(review): freeing a null pointer is presumably here to force CUDA context creation
    // so that its cost is attributed to `setup_cuda` -- confirm.
    checkErrorReturn(program_result, cudaFree(0));

    timeReportMeasure(program_result, setup_cuda);


    /* Prepare environment */
/*{prepare_environment}*/


    /* Copy back memory and set pointer of result */
    program_result->result = /*{host_result_array}*/;

    /* Free device memory */
    timeStartMeasure();

    // The list is not modified while iterating; a failing cudaFree leaves via the macro.
    for (void *device_ptr : *program_result->device_allocations)
    {
        checkErrorReturn(program_result, cudaFree(device_ptr));
    }

    delete program_result->device_allocations;

    timeReportMeasure(program_result, free_memory);

    return program_result;
}
|
@@ -0,0 +1,18 @@
|
|
1
|
+
timeStartMeasure();

// Don't free memory if it is the result. There is already a similar check in
// program_builder (free all except for last). However, this check is not sufficient in
// case the same array is reused!
if (/*{name}*/ != cmd->result) {
    checkErrorReturn(program_result, cudaFree(/*{name}*/));

    // Remove from list of allocations (erase-remove), so that the pointer is not freed
    // again when the remaining allocations are released.
    auto &allocations = *program_result->device_allocations;
    allocations.erase(
        std::remove(allocations.begin(), allocations.end(), /*{name}*/),
        allocations.end());
}

timeReportMeasure(program_result, free_memory);
|
@@ -0,0 +1,14 @@
|
|
1
|
+
({
    // /*{debug_information}*/

    // `cmd` is referenced by the generated size expression and by the memory-free code
    // that is substituted below, so the name must stay as it is.
    /*{array_command_type}*/ cmd = /*{array_command}*/;

    // Launch the kernels only once per command: a non-null result is reused as is.
    if (!cmd->result) {
        /*{kernel_invocation}*/
        cmd->result = /*{kernel_result}*/;

        /*{free_memory}*/
    }

    // Value of this statement expression: the result pointer together with its size.
    variable_size_array_t((void *) cmd->result, /*{result_size}*/);
})
|
@@ -0,0 +1,10 @@
|
|
1
|
+
|
2
|
+
{
    // Host buffer that receives the program result.
    /*{type}*/ * tmp_result = static_cast</*{type}*/ *>(malloc(/*{bytes}*/));

    timeStartMeasure();
    checkErrorReturn(program_result, cudaMemcpy(tmp_result, program_result->result, /*{bytes}*/, cudaMemcpyDeviceToHost));
    timeReportMeasure(program_result, transfer_memory);

    // From here on, the result pointer refers to the host copy.
    program_result->result = tmp_result;
}
|
@@ -1,8 +1,15 @@
|
|
1
1
|
|
2
2
|
|
3
|
-
__global__ void
|
3
|
+
// Generated kernel: one thread per result element. `_tid_` is the flat global thread index;
// threads beyond the number of elements do nothing. The substituted execution code may
// reassign `_tid_` or return early before the result is stored.
__global__ void /*{kernel_name}*/(/*{parameters}*/)
{
    int _tid_ = blockIdx.x * blockDim.x + threadIdx.x;

    if (_tid_ < /*{num_threads}*/)
    {
/*{execution}*/

        _result_[_tid_] = /*{block_invocation}*/;
    }
}
|
7
14
|
|
8
15
|
|
@@ -0,0 +1,10 @@
|
|
1
|
+
({
    // Statement expression: copies a device array into freshly malloc'ed host memory and
    // evaluates to a variable_size_array_t that points to the host copy.
    variable_size_array_t device_array = /*{device_array}*/;
    size_t num_bytes = sizeof(/*{type}*/) * device_array.size;
    /*{type}*/ * tmp_result = (/*{type}*/ *) malloc(num_bytes);

    timeStartMeasure();
    checkErrorReturn(program_result, cudaMemcpy(tmp_result, device_array.content, num_bytes, cudaMemcpyDeviceToHost));
    timeReportMeasure(program_result, transfer_memory);

    variable_size_array_t((void *) tmp_result, device_array.size);
})
|
@@ -0,0 +1,88 @@
|
|
1
|
+
// Body of one reduction step. Every block reduces up to 2*block_size elements of the
// previous result into a single value. `_tid_` is defined by the surrounding kernel
// template.
int thread_idx = threadIdx.x;

// Single result of this block
/*{type}*/ /*{temp_result}*/;

int num_args = 2 * /*{block_size}*/;
if (blockIdx.x == gridDim.x - 1)
{
    // Processing the last block, which might be odd (number of elements to reduce).
    // Other blocks cannot be "odd", because every block reduces 2*block_size many elements.

    // Number of elements to reduce in the last block
    num_args = ((2 * /*{num_threads}*/ - 1) % (2 * /*{block_size}*/)) + (/*{odd}*/ ? 0 : 1);
}

// `num_args` depends only on the block index, so all threads of a block take the same
// branch below and the barriers in the last branch are reached by the whole block.
if (num_args == 1)
{
    /*{temp_result}*/ = /*{previous_result}*/[_tid_];
}
else if (num_args == 2)
{
    /*{temp_result}*/ = /*{block_name}*/(/*{arguments}*/, /*{previous_result}*/[_tid_], /*{previous_result}*/[_tid_ + /*{num_threads}*/]);
}
else
{
    // Allocate block_size many slots to contain the result of up to block_size many reductions, i.e.,
    // this array contains the reduction of (up to) 2*block_size many elements.
    __shared__ /*{type}*/ sdata[/*{block_size}*/];

    /*{odd}*/ = num_args % 2 == 1;

    // --- FIRST REDUCTION --- Load from global memory
    // Number of elements after the first reduction
    num_args = num_args / 2 + num_args % 2;

    if (thread_idx == num_args - 1 && /*{odd}*/)
    {
        // This is the last thread, and it should reduce only one element.
        sdata[thread_idx] = /*{previous_result}*/[_tid_];
    }
    else
    {
        sdata[thread_idx] = /*{block_name}*/(/*{arguments}*/, /*{previous_result}*/[_tid_], /*{previous_result}*/[_tid_ + /*{num_threads}*/]);
    }

    __syncthreads();


    // --- SUBSEQUENT REDUCTION --- Read from shared memory only
    /*{odd}*/ = num_args % 2 == 1;

    for (
        num_args = num_args / 2 + num_args % 2;             // Number of elements after this reduction
        num_args > 1;                                       // ... as long as there's at least 3 elements left
        num_args = num_args / 2 + num_args % 2) {

        // A thread has work to do if it owns one of the remaining slots. The thread owning
        // the last slot keeps its element untouched if the previous count was odd.
        if (thread_idx < num_args && (thread_idx != num_args - 1 || !/*{odd}*/))
        {
            sdata[thread_idx] = /*{block_name}*/(/*{arguments}*/, sdata[thread_idx], sdata[thread_idx + num_args]);
        }

        __syncthreads();

        /*{odd}*/ = num_args % 2 == 1;
    }

    if (thread_idx == 0)
    {
        // The first thread combines the two remaining elements into the block result
        /*{temp_result}*/ = /*{block_name}*/(/*{arguments}*/, sdata[0], sdata[1]);
    }
}

// Write result to different position
_tid_ = blockIdx.x;

if (thread_idx != 0) {
    // Only one thread should report the result
    return;
}
|
@@ -0,0 +1,16 @@
|
|
1
|
+
// Holds the value computed for this element, either by the stencil or by the fallback.
/*{result_type}*/ /*{temp_var}*/;

// Indices for all dimensions
/*{compute_indices}*/

if (/*{out_of_bounds_check}*/)
{
    // All value indices within bounds: run the stencil computation
    /*{execution}*/
    /*{temp_var}*/ = /*{stencil_computation}*/;
}
else
{
    // At least one index is out of bounds: use the fallback value instead
    /*{temp_var}*/ = /*{out_of_bounds_fallback}*/;
}
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module Ikra
    module RubyIntegration
        # Selects array types that are not location-aware.
        ALL_ARRAY_TYPES = proc { |type|
            type.is_a?(Types::ArrayType) && !type.is_a?(Types::LocationAwareArrayType)
        }

        # Selects location-aware array types.
        LOCATION_AWARE_ARRAY_TYPE = proc { |type|
            # TODO: Maybe there should be an automated transfer to host side here if necessary?
            type.is_a?(Types::LocationAwareArrayType)
        }

        # Generates the element access for a location-aware array: the `content` field is
        # cast to a pointer of the inner C type and indexed with the translated first
        # argument.
        LOCATION_AWARE_ARRAY_ACCESS = proc { |receiver, method_name, args, translator, result_type|
            array_code = receiver.accept(translator.expression_translator)
            element_c_type = receiver.get_type.singleton_type.inner_type.to_c_type
            index_code = args[0].accept(translator.expression_translator)

            "((#{element_c_type} *) #{array_code}.content)[#{index_code}]"
        }

        # The result type of an element access is the inner type of the receiver.
        INNER_TYPE = proc { |rcvr| rcvr.inner_type }

        implement ALL_ARRAY_TYPES, :[], INNER_TYPE, 1, "#0[#I1]"

        implement(LOCATION_AWARE_ARRAY_TYPE, :[], INNER_TYPE, 1, LOCATION_AWARE_ARRAY_ACCESS)
    end
end
|
@@ -0,0 +1,313 @@
|
|
1
|
+
require_relative "../types/types/array_type.rb"
|
2
|
+
require_relative "../ast/interpreter.rb"
|
3
|
+
|
4
|
+
module Ikra
|
5
|
+
module RubyIntegration
|
6
|
+
|
7
|
+
# This visitor traverses the tree of symbolically executed parallel operations. It raises
|
8
|
+
# an exception, if an array command was generated by symbolic execution/interpretation of
|
9
|
+
# `send_node`.
|
10
|
+
class SymbolicCycleFinder < Symbolic::Visitor
    # Visits `command` and everything the visitor reaches from it. Raises
    # [CycleDetectedError] if one of the visited array commands was generated by
    # `send_node`.
    def self.raise_on_cycle(command, send_node)
        command.accept(new(send_node))
    end

    def initialize(send_node)
        @send_node = send_node
    end

    def visit_array_command(node)
        raise CycleDetectedError.new(node: node) if node.generator_node == @send_node

        # No cycle found yet, check dependent computations
        super
    end
end
|
29
|
+
|
30
|
+
# Raised when an array command that was generated by the currently processed send node is
# found among the commands it depends on.
class CycleDetectedError < RuntimeError
    # The array command at which the cycle was detected.
    attr_reader :node

    # @param node the array command whose generator node matched
    def initialize(node:)
        @node = node

        # Give the exception a descriptive message; without the call to `super` the
        # message would only be the class name.
        super("Cycle detected: array command was generated by the send node that is being processed")
    end
end
|
35
|
+
|
36
|
+
# Selects receivers whose type is an array command.
ALL_ARRAY_COMMAND_TYPES = proc { |type| type.is_a?(Symbolic::ArrayCommand) }
|
39
|
+
|
40
|
+
# Computes the type of `pmap` by symbolically executing it on the receiver command.
PMAP_TYPE = proc do |rcvr_type, *args_types, send_node:|
    # TODO: Handle keyword arguments

    # Ensure that there is no cycle here. "Cycle" means that the same AST send node
    # was used earlier (i.e., in one of `rcvr_type`'s inputs/dependent computations).
    # In that case we have to abort type inference here, because it would not terminate.
    SymbolicCycleFinder.raise_on_cycle(rcvr_type, send_node)

    more_kw_args =
        if send_node.arguments.size == 1
            kwargs_node = send_node.arguments.first

            unless kwargs_node.is_a?(AST::HashNode)
                raise ArgumentError.new("If an argument is given, it must be a Hash of kwargs.")
            end

            # Pass kwargs separately
            AST::Interpreter.interpret(kwargs_node)
        else
            {}
        end

    mapped_command = rcvr_type.pmap(
        ast: send_node.block_argument,
        generator_node: send_node,
        # TODO: Fix binding
        command_binding: send_node.find_behavior_node.binding,
        **more_kw_args)

    mapped_command.to_union_type
end
|
66
|
+
|
67
|
+
# Computes the type of `pzip`: one zipped command per singleton type of the first
# argument, merged into a single union type.
PZIP_TYPE = proc do |rcvr_type, *args_types, send_node:|
    # TODO: Support multiple arguments for `pzip`
    zipped_types = args_types[0].map do |sing_type|
        if sing_type.is_union_type?
            raise AssertionError.new("Singleton type expected")
        end

        rcvr_type.pzip(sing_type, generator_node: send_node).to_union_type
    end

    zipped_types.inject(Types::UnionType.new) { |acc, type| acc.expand_return_type(type) }
end
|
78
|
+
|
79
|
+
# Computes the type of `pstencil`. The arguments of the send node are interpreted to Ruby
# values and handed to the symbolic `pstencil` of the receiver command.
PSTENCIL_TYPE = proc do |rcvr_type, *args_types, send_node:|
    # TODO: Handle keyword arguments
    ruby_args = send_node.arguments.map { |node| AST::Interpreter.interpret(node) }

    more_kw_args = {}

    if args_types.size == 3
        unless ruby_args.last.is_a?(Hash)
            raise ArgumentError.new("If 3 arguments are given, the last one must be a Hash of kwargs.")
        end

        # Pass kwargs separately
        more_kw_args = ruby_args.pop
    end

    SymbolicCycleFinder.raise_on_cycle(rcvr_type, send_node)

    stencil_command = rcvr_type.pstencil(
        *ruby_args,
        ast: send_node.block_argument,
        generator_node: send_node,
        # TODO: Fix binding
        command_binding: send_node.find_behavior_node.binding,
        **more_kw_args)

    stencil_command.to_union_type
end
|
106
|
+
|
107
|
+
# Computes the type of `preduce` by symbolically executing it on the receiver command.
PREDUCE_TYPE = proc do |rcvr_type, *args_types, send_node:|
    # TODO: Handle keyword arguments

    SymbolicCycleFinder.raise_on_cycle(rcvr_type, send_node)

    reduced_command = rcvr_type.preduce(
        ast: send_node.block_argument,
        generator_node: send_node)

    reduced_command.to_union_type
end
|
114
|
+
|
115
|
+
# Generates the host-section code that launches all kernels of the receiver's array
# command and evaluates to the device array holding the result.
LAUNCH_KERNEL = proc do |receiver, method_name, arguments, translator, result_type|
    # The result type is the symbolically executed result of applying this
    # parallel section. The result type is an ArrayCommand.
    array_command = receiver.get_type.singleton_type

    # Translate command
    command_translator = translator.command_translator
    command_translator.push_kernel_launcher
    result = array_command.accept(command_translator)
    kernel_launcher = command_translator.pop_kernel_launcher(result)

    # Prepare kernel launchers for launch of `array_command`
    command_translator.program_builder.prepare_additional_args_for_launch(array_command)

    # Generate launch code for all kernels
    launch_code = command_translator.program_builder.build_kernel_launchers

    # Always return a device pointer. Only at the very end, we transfer data to the host.
    result_expr = kernel_launcher.kernel_result_var_name

    if Translator::ArrayCommandStructBuilder::RequireRuntimeSizeChecker.require_size_function?(array_command)

        # Size is not statically known, take information from receiver.
        # TODO: Code depends on template. `cmd` is defined in template.
        result_size = "cmd->size()"
    else
        # Size is known statically
        result_size = array_command.size.to_s
    end

    # Debug information
    if array_command.generator_node != nil
        debug_information = array_command.to_s + ": " + array_command.generator_node.to_s
    else
        debug_information = array_command.to_s
    end

    # The template places the debug information in a `//` line comment. Collapse line
    # breaks so that multi-line descriptions cannot spill into the generated code.
    debug_information = debug_information.gsub(/[\r\n]+/, " ")

    result = Translator.read_file(file_name: "host_section_launch_parallel_section.cpp", replacements: {
        "debug_information" => debug_information,
        "array_command" => receiver.accept(translator.expression_translator),
        "array_command_type" => array_command.to_c_type,
        "result_size" => result_size,
        "kernel_invocation" => launch_code,
        "kernel_result" => result_expr,
        "free_memory" => command_translator.program_builder.build_memory_free_except_last})

    # Clear kernel launchers. Otherwise, we might launch them again in a later, unrelated
    # LAUNCH_KERNEL branch. This is because we reuse the same [ProgramBuilder] for an
    # entire host section.
    command_translator.program_builder.clear_kernel_launchers

    # Build all array command structs for this command
    command_translator.program_builder.add_array_command_struct(
        *Translator::ArrayCommandStructBuilder.build_all_structs(array_command))

    result
end
|
172
|
+
|
173
|
+
# Type of a launched array command: a fixed-size array located on the device with the
# command's result type and dimensions.
ARRAY_COMMAND_TO_ARRAY_TYPE = proc do |rcvr_type, *args_types, send_node:|
    device_array_type = Types::LocationAwareFixedSizeArrayType.new(
        rcvr_type.result_type,
        rcvr_type.dimensions,
        location: :device)

    device_array_type.to_union_type
end
|
179
|
+
|
180
|
+
# Generates a constructor call that creates the array command struct for a parallel
# operation. No kernel is launched here.
SYMBOLICALLY_EXECUTE_KERNEL = proc do |receiver, method_name, arguments, translator, result_type|
    unless result_type.is_singleton?
        raise AssertionError.new("Singleton type expected")
    end

    # Build arguments to constructor. First one (result field) is NULL, followed by the
    # translated inputs (receiver, then arguments to parallel section).
    constructor_args = ["NULL", receiver.accept(translator.expression_translator)]

    arguments.each do |arg|
        arg_type = arg.get_type

        # Only ArrayCommands should show up as arguments
        if arg_type.is_singleton? && arg_type.singleton_type.is_a?(Symbolic::ArrayCommand)
            constructor_args.push(arg.accept(translator.expression_translator))
        end
    end

    # This is a hack because the type is a pointer type: the last two characters of the
    # C type are cut off to obtain the name of the struct.
    struct_name = result_type.singleton_type.to_c_type[0...-2]

    "new #{struct_name}(#{constructor_args.join(", ")})"
end
|
205
|
+
|
206
|
+
# Selects receivers whose type is a location-aware array type.
ALL_LOCATION_AWARE_ARRAY_TYPES = proc { |type| type.is_a?(Types::LocationAwareArrayType) }
|
209
|
+
|
210
|
+
# Type of an array after transfer to the host: same inner type and dimensions as the
# receiver, located on the host.
LOCATION_AWARE_ARRAY_TO_HOST_ARRAY_TYPE = proc do |rcvr_type, *args_types|
    # TODO: Should also be able to handle variable variant
    host_array_type = Types::LocationAwareFixedSizeArrayType.new(
        rcvr_type.inner_type,
        rcvr_type.dimensions,
        location: :host)

    host_array_type.to_union_type
end
|
217
|
+
|
218
|
+
# Calling `__call__` on an array does not do anything, so the type stays the same.
LOCATION_AWARE_ARRAY_CALL_TYPE = proc { |rcvr_type, *args_types| rcvr_type.to_union_type }
|
222
|
+
|
223
|
+
# Generates code that makes the receiver array available on the host. Arrays that are
# already located on the host are passed through; all other arrays are copied with the
# device-to-host memcpy template.
COPY_ARRAY_TO_HOST = proc do |receiver, method_name, args, translator, result_type|
    array_type = receiver.get_type.singleton_type

    if array_type.location != :host
        element_c_type = array_type.inner_type.to_c_type

        Translator.read_file(file_name: "memcpy_device_to_host_expr.cpp", replacements: {
            "type" => element_c_type,
            "device_array" => receiver.accept(translator.expression_translator)})
    else
        receiver.accept(translator.expression_translator)
    end
end
|
234
|
+
|
235
|
+
# Type of `to_command`: the command that the receiver array type converts to.
ARRAY_TYPE_TO_COMMAND_TYPE = proc { |rcvr_type, *args_types, send_node:|
    rcvr_type.to_command.to_union_type
}
|
238
|
+
|
239
|
+
# Generates code from the `free_memory_for_command.cpp` template for the receiver command.
FREE_MEMORY_FOR_ARRAY_COMMAND = proc do |receiver, method_name, args, translator, result_type|
    template_values = {
        "type" => receiver.get_type.to_c_type,
        "receiver" => receiver.accept(translator.expression_translator)}

    Translator.read_file(file_name: "free_memory_for_command.cpp", replacements: template_values)
end
|
245
|
+
|
246
|
+
# Registrations. Arguments of `implement`, in order: receiver type selector, method name,
# return type (or a proc computing it), number of arguments, code generator.

# Manually free memory
# TODO: Implement escape analysis and try to reuse memory
implement(ALL_ARRAY_COMMAND_TYPES, :free_memory, BOOL, 0, FREE_MEMORY_FOR_ARRAY_COMMAND)

# Implement all parallel operations
implement(ALL_ARRAY_COMMAND_TYPES, :pmap, PMAP_TYPE, 0..1, SYMBOLICALLY_EXECUTE_KERNEL)

implement(ALL_LOCATION_AWARE_ARRAY_TYPES, :to_command, ARRAY_TYPE_TO_COMMAND_TYPE, 0, SYMBOLICALLY_EXECUTE_KERNEL)

implement(ALL_ARRAY_COMMAND_TYPES, :pzip, PZIP_TYPE, 1, SYMBOLICALLY_EXECUTE_KERNEL, expect_singleton_args: true)

# pstencil takes 2..3 arguments: neighborhood and default value, maybe hash
implement(ALL_ARRAY_COMMAND_TYPES, :pstencil, PSTENCIL_TYPE, 2..3, SYMBOLICALLY_EXECUTE_KERNEL)

implement(ALL_ARRAY_COMMAND_TYPES, :preduce, PREDUCE_TYPE, 0, SYMBOLICALLY_EXECUTE_KERNEL)

# `__call__` on a command launches its kernels; on an array it is the identity
implement(ALL_ARRAY_COMMAND_TYPES, :__call__, ARRAY_COMMAND_TO_ARRAY_TYPE, 0, LAUNCH_KERNEL)

implement(ALL_LOCATION_AWARE_ARRAY_TYPES, :__to_host_array__, LOCATION_AWARE_ARRAY_TO_HOST_ARRAY_TYPE, 0, COPY_ARRAY_TO_HOST)

implement(ALL_LOCATION_AWARE_ARRAY_TYPES, :__call__, LOCATION_AWARE_ARRAY_CALL_TYPE, 0, "#0")
|
312
|
+
end
|
313
|
+
end
|