ikra 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (104) hide show
  1. checksums.yaml +4 -4
  2. data/lib/ast/builder.rb +225 -77
  3. data/lib/ast/host_section_builder.rb +38 -0
  4. data/lib/ast/interpreter.rb +67 -0
  5. data/lib/ast/lexical_variables_enumerator.rb +3 -2
  6. data/lib/ast/nodes.rb +521 -31
  7. data/lib/ast/printer.rb +116 -18
  8. data/lib/ast/ssa_generator.rb +192 -0
  9. data/lib/ast/visitor.rb +235 -21
  10. data/lib/config/configuration.rb +28 -3
  11. data/lib/config/os_configuration.rb +62 -9
  12. data/lib/cpu/cpu_implementation.rb +39 -0
  13. data/lib/ikra.rb +13 -3
  14. data/lib/resources/cuda/allocate_device_memory.cpp +5 -0
  15. data/lib/resources/cuda/allocate_host_memory.cpp +1 -0
  16. data/lib/resources/cuda/allocate_memcpy_environment_to_device.cpp +11 -0
  17. data/lib/resources/cuda/ast/assignment.cpp +1 -0
  18. data/lib/resources/cuda/block_function_head.cpp +7 -1
  19. data/lib/resources/cuda/entry_point.cpp +47 -0
  20. data/lib/resources/cuda/env_builder_copy_array.cpp +8 -2
  21. data/lib/resources/cuda/free_device_memory.cpp +3 -0
  22. data/lib/resources/cuda/free_memory_for_command.cpp +24 -0
  23. data/lib/resources/cuda/header.cpp +23 -9
  24. data/lib/resources/cuda/header_structs.cpp +92 -0
  25. data/lib/resources/cuda/host_section_block_function_head.cpp +12 -0
  26. data/lib/resources/cuda/host_section_entry_point.cpp +55 -0
  27. data/lib/resources/cuda/host_section_free_device_memory.cpp +18 -0
  28. data/lib/resources/cuda/host_section_launch_parallel_section.cpp +14 -0
  29. data/lib/resources/cuda/host_section_malloc_memcpy_device_to_host.cpp +10 -0
  30. data/lib/resources/cuda/kernel.cpp +9 -2
  31. data/lib/resources/cuda/launch_kernel.cpp +5 -0
  32. data/lib/resources/cuda/memcpy_device_to_host.cpp +3 -0
  33. data/lib/resources/cuda/memcpy_device_to_host_expr.cpp +10 -0
  34. data/lib/resources/cuda/reduce_body.cpp +88 -0
  35. data/lib/resources/cuda/stencil_array_reconstruction.cpp +2 -0
  36. data/lib/resources/cuda/stencil_body.cpp +16 -0
  37. data/lib/resources/cuda/struct_definition.cpp +4 -0
  38. data/lib/ruby_core/array.rb +34 -0
  39. data/lib/ruby_core/array_command.rb +313 -0
  40. data/lib/ruby_core/core.rb +103 -0
  41. data/lib/ruby_core/interpreter.rb +16 -0
  42. data/lib/ruby_core/math.rb +32 -0
  43. data/lib/ruby_core/ruby_integration.rb +256 -0
  44. data/lib/symbolic/host_section.rb +115 -0
  45. data/lib/symbolic/input.rb +87 -0
  46. data/lib/symbolic/input_visitor.rb +68 -0
  47. data/lib/symbolic/symbolic.rb +793 -117
  48. data/lib/symbolic/visitor.rb +70 -8
  49. data/lib/translator/array_command_struct_builder.rb +163 -0
  50. data/lib/translator/ast_translator.rb +572 -0
  51. data/lib/translator/block_translator.rb +104 -48
  52. data/lib/translator/commands/array_combine_command.rb +41 -0
  53. data/lib/translator/commands/array_identity_command.rb +28 -0
  54. data/lib/translator/commands/array_index_command.rb +52 -0
  55. data/lib/translator/commands/array_reduce_command.rb +135 -0
  56. data/lib/translator/commands/array_stencil_command.rb +129 -0
  57. data/lib/translator/commands/array_zip_command.rb +30 -0
  58. data/lib/translator/commands/command_translator.rb +264 -0
  59. data/lib/translator/cuda_errors.rb +32 -0
  60. data/lib/translator/environment_builder.rb +263 -0
  61. data/lib/translator/host_section/array_host_section_command.rb +150 -0
  62. data/lib/translator/host_section/array_in_host_section_command.rb +41 -0
  63. data/lib/translator/host_section/ast_translator.rb +14 -0
  64. data/lib/translator/host_section/parallel_section_invocation_visitor.rb +20 -0
  65. data/lib/translator/host_section/program_builder.rb +89 -0
  66. data/lib/translator/input_translator.rb +226 -0
  67. data/lib/translator/kernel_builder.rb +137 -0
  68. data/lib/translator/kernel_launcher/for_loop_kernel_launcher.rb +40 -0
  69. data/lib/translator/kernel_launcher/kernel_launcher.rb +259 -0
  70. data/lib/translator/kernel_launcher/while_loop_kernel_launcher.rb +38 -0
  71. data/lib/translator/last_returns_visitor.rb +19 -10
  72. data/lib/translator/program_builder.rb +197 -0
  73. data/lib/translator/program_launcher.rb +273 -0
  74. data/lib/translator/struct_type.rb +55 -0
  75. data/lib/translator/translator.rb +34 -11
  76. data/lib/translator/variable_classifier_visitor.rb +56 -0
  77. data/lib/types/inference/ast_inference.rb +586 -0
  78. data/lib/types/inference/clear_types_visitor.rb +11 -0
  79. data/lib/types/inference/command_inference.rb +101 -0
  80. data/lib/types/inference/input_inference.rb +62 -0
  81. data/lib/types/{object_tracer.rb → inference/object_tracer.rb} +5 -6
  82. data/lib/types/inference/ruby_extension.rb +35 -0
  83. data/lib/types/inference/symbol_table.rb +131 -0
  84. data/lib/types/types.rb +14 -0
  85. data/lib/types/types/array_command_type.rb +123 -0
  86. data/lib/types/types/array_type.rb +137 -0
  87. data/lib/types/{class_type.rb → types/class_type.rb} +42 -18
  88. data/lib/types/{primitive_type.rb → types/primitive_type.rb} +20 -7
  89. data/lib/types/types/ruby_type.rb +88 -0
  90. data/lib/types/types/struct_type.rb +179 -0
  91. data/lib/types/types/union_type.rb +239 -0
  92. metadata +160 -18
  93. data/lib/ast/method_definition.rb +0 -37
  94. data/lib/ast/translator.rb +0 -264
  95. data/lib/resources/cuda/kernel_launcher.cpp +0 -28
  96. data/lib/scope.rb +0 -166
  97. data/lib/translator/command_translator.rb +0 -421
  98. data/lib/translator/local_variables_enumerator.rb +0 -35
  99. data/lib/translator/method_translator.rb +0 -24
  100. data/lib/types/array_type.rb +0 -51
  101. data/lib/types/ruby_extension.rb +0 -67
  102. data/lib/types/ruby_type.rb +0 -45
  103. data/lib/types/type_inference.rb +0 -382
  104. data/lib/types/union_type.rb +0 -155
@@ -0,0 +1,12 @@
1
+ #undef checkErrorReturn
2
+ #define checkErrorReturn(result_var, expr) \
3
+ if (result_var->last_error = expr) \
4
+ {\
5
+ cudaError_t error = cudaGetLastError();\
6
+ printf("!!! Cuda Failure %s:%d (%i): '%s'\n", __FILE__, __LINE__, expr, cudaGetErrorString(error));\
7
+ cudaDeviceReset();\
8
+ return /*{result_type}*/::error_return_value;\
9
+ }
10
+
11
+ /*{result_type}*/ /*{name}*/(/*{parameters}*/)
12
+ /*{body}*/
@@ -0,0 +1,55 @@
1
+ #undef checkErrorReturn
2
+ #define checkErrorReturn(result_var, expr) \
3
+ if (result_var->last_error = expr) \
4
+ {\
5
+ cudaError_t error = cudaGetLastError();\
6
+ printf("!!! Cuda Failure %s:%d (%i): '%s'\n", __FILE__, __LINE__, expr, cudaGetErrorString(error));\
7
+ cudaDeviceReset();\
8
+ return result_var;\
9
+ }
10
+
11
+ extern "C" EXPORT result_t *launch_kernel(environment_t */*{host_env_var_name}*/)
12
+ {
13
+ // CUDA Initialization
14
+ program_result = new result_t();
15
+ program_result->device_allocations = new vector<void*>();
16
+
17
+ timeStartMeasure();
18
+
19
+ cudaError_t cudaStatus = cudaSetDevice(0);
20
+
21
+ if (cudaStatus != cudaSuccess) {
22
+ fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?\n");
23
+ program_result->last_error = -1;
24
+ return program_result;
25
+ }
26
+
27
+ checkErrorReturn(program_result, cudaFree(0));
28
+
29
+ timeReportMeasure(program_result, setup_cuda);
30
+
31
+
32
+ /* Prepare environment */
33
+ /*{prepare_environment}*/
34
+
35
+
36
+ /* Copy back memory and set pointer of result */
37
+ program_result->result = /*{host_result_array}*/;
38
+
39
+ /* Free device memory */
40
+ timeStartMeasure();
41
+
42
+ for (
43
+ auto device_ptr = program_result->device_allocations->begin();
44
+ device_ptr < program_result->device_allocations->end();
45
+ device_ptr++)
46
+ {
47
+ checkErrorReturn(program_result, cudaFree(*device_ptr));
48
+ }
49
+
50
+ delete program_result->device_allocations;
51
+
52
+ timeReportMeasure(program_result, free_memory);
53
+
54
+ return program_result;
55
+ }
@@ -0,0 +1,18 @@
1
+ timeStartMeasure();
2
+
3
+ if (/*{name}*/ != cmd->result) {
4
+ // Don't free memory if it is the result. There is already a similar check in
5
+ // program_builder (free all except for last). However, this check is not sufficient in
6
+ // case the same array is reused!
7
+
8
+ checkErrorReturn(program_result, cudaFree(/*{name}*/));
9
+ // Remove from list of allocations
10
+ program_result->device_allocations->erase(
11
+ std::remove(
12
+ program_result->device_allocations->begin(),
13
+ program_result->device_allocations->end(),
14
+ /*{name}*/),
15
+ program_result->device_allocations->end());
16
+ }
17
+
18
+ timeReportMeasure(program_result, free_memory);
@@ -0,0 +1,14 @@
1
+ ({
2
+ // /*{debug_information}*/
3
+
4
+ /*{array_command_type}*/ cmd = /*{array_command}*/;
5
+
6
+ if (cmd->result == 0) {
7
+ /*{kernel_invocation}*/
8
+ cmd->result = /*{kernel_result}*/;
9
+
10
+ /*{free_memory}*/
11
+ }
12
+
13
+ variable_size_array_t((void *) cmd->result, /*{result_size}*/);
14
+ })
@@ -0,0 +1,10 @@
1
+
2
+ {
3
+ /*{type}*/ * tmp_result = (/*{type}*/ *) malloc(/*{bytes}*/);
4
+
5
+ timeStartMeasure();
6
+ checkErrorReturn(program_result, cudaMemcpy(tmp_result, program_result->result, /*{bytes}*/, cudaMemcpyDeviceToHost));
7
+ timeReportMeasure(program_result, transfer_memory);
8
+
9
+ program_result->result = tmp_result;
10
+ }
@@ -1,8 +1,15 @@
1
1
 
2
2
 
3
- __global__ void kernel(environment_t */*{env_identifier}*/, /*{result_type}*/ *_result_)
3
+ __global__ void /*{kernel_name}*/(/*{parameters}*/)
4
4
  {
5
- _result_[threadIdx.x + blockIdx.x * blockDim.x] = /*{block_invocation}*/;
5
+ int _tid_ = threadIdx.x + blockIdx.x * blockDim.x;
6
+
7
+ if (_tid_ < /*{num_threads}*/)
8
+ {
9
+ /*{execution}*/
10
+
11
+ _result_[_tid_] = /*{block_invocation}*/;
12
+ }
6
13
  }
7
14
 
8
15
 
@@ -0,0 +1,5 @@
1
+ timeStartMeasure();
2
+ /*{kernel_name}*/<<</*{grid_dim}*/, /*{block_dim}*/>>>(/*{arguments}*/);
3
+ checkErrorReturn(program_result, cudaPeekAtLastError());
4
+ checkErrorReturn(program_result, cudaThreadSynchronize());
5
+ timeReportMeasure(program_result, kernel);
@@ -0,0 +1,3 @@
1
+ timeStartMeasure();
2
+ checkErrorReturn(program_result, cudaMemcpy(/*{host_name}*/, /*{device_name}*/, /*{bytes}*/, cudaMemcpyDeviceToHost));
3
+ timeReportMeasure(program_result, transfer_memory);
@@ -0,0 +1,10 @@
1
+ ({
2
+ variable_size_array_t device_array = /*{device_array}*/;
3
+ /*{type}*/ * tmp_result = (/*{type}*/ *) malloc(sizeof(/*{type}*/) * device_array.size);
4
+
5
+ timeStartMeasure();
6
+ checkErrorReturn(program_result, cudaMemcpy(tmp_result, device_array.content, sizeof(/*{type}*/) * device_array.size, cudaMemcpyDeviceToHost));
7
+ timeReportMeasure(program_result, transfer_memory);
8
+
9
+ variable_size_array_t((void *) tmp_result, device_array.size);
10
+ })
@@ -0,0 +1,88 @@
1
+ int thread_idx = threadIdx.x;
2
+
3
+ // Single result of this block
4
+ /*{type}*/ /*{temp_result}*/;
5
+
6
+ int num_args = 2 * /*{block_size}*/;
7
+ if (blockIdx.x == gridDim.x - 1)
8
+ {
9
+ // Processing the last block, which might be odd (number of elements to reduce).
10
+ // Other blocks cannot be "odd", because every block reduces 2*block_size many elements.
11
+
12
+ // Number of elements to reduce in the last block
13
+ num_args = ((2 * /*{num_threads}*/ - 1) % (2 * /*{block_size}*/)) + (/*{odd}*/ ? 0 : 1);
14
+ }
15
+
16
+ if (num_args == 1)
17
+ {
18
+ /*{temp_result}*/ = /*{previous_result}*/[_tid_];
19
+ }
20
+ else if (num_args == 2)
21
+ {
22
+ /*{temp_result}*/ = /*{block_name}*/(/*{arguments}*/, /*{previous_result}*/[_tid_], /*{previous_result}*/[_tid_ + /*{num_threads}*/]);
23
+ }
24
+ else
25
+ {
26
+ // Allocate block_size many slots to contain the result of up to block_size many reductions, i.e.,
27
+ // this array contains the reduction of (up to) 2*block_size many elements.
28
+ __shared__ /*{type}*/ sdata[/*{block_size}*/];
29
+
30
+ /*{odd}*/ = num_args % 2 == 1;
31
+
32
+ // --- FIRST REDUCTION --- Load from global memory
33
+ // Number of elements after the first reduction
34
+ num_args = num_args / 2 + num_args % 2;
35
+
36
+ if (thread_idx == num_args - 1 && /*{odd}*/)
37
+ {
38
+ // This is the last thread, and it should reduce only one element.
39
+ sdata[thread_idx] = /*{previous_result}*/[_tid_];
40
+ }
41
+ else
42
+ {
43
+ sdata[thread_idx] = /*{block_name}*/(/*{arguments}*/, /*{previous_result}*/[_tid_], /*{previous_result}*/[_tid_ + /*{num_threads}*/]);
44
+ }
45
+
46
+ __syncthreads();
47
+
48
+
49
+ // --- SUBSEQUENT REDUCTION --- Read from shared memory only
50
+ /*{odd}*/ = num_args % 2 == 1;
51
+
52
+ for (
53
+ num_args = num_args / 2 + num_args % 2; // Number of elements after this reduction
54
+ num_args > 1; // ... as long as there's at least 3 elements left
55
+ num_args = num_args / 2 + num_args % 2) {
56
+
57
+ if (thread_idx < num_args) {
58
+ // This thread has work to do...
59
+
60
+ if (thread_idx != num_args - 1 || !/*{odd}*/)
61
+ {
62
+ sdata[thread_idx] = /*{block_name}*/(/*{arguments}*/, sdata[thread_idx], sdata[thread_idx + num_args]);
63
+ }
64
+ else
65
+ {
66
+ // This is the last element and it is odd, do nothing
67
+ }
68
+ }
69
+
70
+ __syncthreads();
71
+
72
+ /*{odd}*/ = num_args % 2 == 1;
73
+ }
74
+
75
+ if (thread_idx == 0)
76
+ {
77
+ // Last thread returns result
78
+ /*{temp_result}*/ = /*{block_name}*/(/*{arguments}*/, sdata[0], sdata[1]);
79
+ }
80
+ }
81
+
82
+ // Write result to different position
83
+ _tid_ = blockIdx.x;
84
+
85
+ if (thread_idx != 0) {
86
+ // Only one thread should report the result
87
+ return;
88
+ }
@@ -0,0 +1,2 @@
1
+ // (Re)construct array from separately passed parameters
2
+ /*{type}*/ /*{name}*/[] = /*{initializer}*/;
@@ -0,0 +1,16 @@
1
+ /*{result_type}*/ /*{temp_var}*/;
2
+
3
+ // Indices for all dimensions
4
+ /*{compute_indices}*/
5
+
6
+ if (/*{out_of_bounds_check}*/)
7
+ {
8
+ // All value indices within bounds
9
+ /*{execution}*/
10
+ /*{temp_var}*/ = /*{stencil_computation}*/;
11
+ }
12
+ else
13
+ {
14
+ // At least one index is out of bounds
15
+ /*{temp_var}*/ = /*{out_of_bounds_fallback}*/;
16
+ }
@@ -0,0 +1,4 @@
1
+ struct /*{name}*/
2
+ {
3
+ /*{fields}*/
4
+ };
@@ -0,0 +1,34 @@
1
module Ikra
    module RubyIntegration
        # Matches every array type that is not location-aware.
        ALL_ARRAY_TYPES = proc do |type|
            type.is_a?(Types::ArrayType) && !type.is_a?(Types::LocationAwareArrayType)
        end

        # Matches only location-aware array types.
        LOCATION_AWARE_ARRAY_TYPE = proc do |type|
            # TODO: Maybe there should be an automated transfer to host side here if necessary?
            type.is_a?(Types::LocationAwareArrayType)
        end

        # Emits a C expression that casts the array's untyped content pointer to
        # the inner type and indexes into it.
        LOCATION_AWARE_ARRAY_ACCESS = proc do |receiver, method_name, args, translator, result_type|
            recv = receiver.accept(translator.expression_translator)
            inner_type = receiver.get_type.singleton_type.inner_type.to_c_type
            index = args.first.accept(translator.expression_translator)

            "((#{inner_type} *) #{recv}.content)[#{index}]"
        end

        # Result type of an element access: the receiver's inner type.
        INNER_TYPE = proc do |rcvr|
            rcvr.inner_type
        end

        implement ALL_ARRAY_TYPES, :[], INNER_TYPE, 1, "#0[#I1]"

        implement(
            LOCATION_AWARE_ARRAY_TYPE,
            :[],
            INNER_TYPE,
            1,
            LOCATION_AWARE_ARRAY_ACCESS)
    end
end
@@ -0,0 +1,313 @@
1
require_relative "../types/types/array_type.rb"
require_relative "../ast/interpreter.rb"

module Ikra
    module RubyIntegration

        # This visitor traverses the tree of symbolically executed parallel operations. It raises
        # an exception, if an array command was generated by symbolic execution/interpretation of
        # `send_node`.
        class SymbolicCycleFinder < Symbolic::Visitor
            # Raises [CycleDetectedError] if `send_node` already generated one of the
            # commands that `command` (transitively) depends on.
            def self.raise_on_cycle(command, send_node)
                visitor = self.new(send_node)
                command.accept(visitor)
            end

            def initialize(send_node)
                @send_node = send_node
            end

            def visit_array_command(node)
                if node.generator_node == @send_node
                    raise CycleDetectedError.new(node: node)
                else
                    # No cycle found yet, check dependent computations
                    super
                end
            end
        end

        # Raised when type inference would not terminate because a parallel section
        # (transitively) depends on itself.
        # FIX: previously the node was stored but unreadable and the error carried
        # no message; expose the node and give RuntimeError a useful message.
        class CycleDetectedError < RuntimeError
            attr_reader :node

            def initialize(node:)
                super("Cycle detected at array command: #{node}")
                @node = node
            end
        end

        ALL_ARRAY_COMMAND_TYPES = proc do |type|
            type.is_a?(Symbolic::ArrayCommand)
        end

        # Type inference for `pmap`: symbolically execute the map and return the
        # resulting command wrapped in a union type.
        PMAP_TYPE = proc do |rcvr_type, *args_types, send_node:|
            # TODO: Handle keyword arguments

            # Ensure that there is no cycle here. "Cycle" means that the same AST send node
            # was used earlier (i.e., in one of `rcvr_type`'s inputs/dependent computations).
            # In that case we have to abort type inference here, because it would not terminate.
            SymbolicCycleFinder.raise_on_cycle(rcvr_type, send_node)

            more_kw_args = {}

            if send_node.arguments.size == 1
                if !send_node.arguments.first.is_a?(AST::HashNode)
                    raise ArgumentError.new("If an argument is given, it must be a Hash of kwargs.")
                end

                # Pass kwargs separately
                more_kw_args = AST::Interpreter.interpret(send_node.arguments.first)
            end

            rcvr_type.pmap(
                ast: send_node.block_argument,
                generator_node: send_node,
                # TODO: Fix binding
                command_binding: send_node.find_behavior_node.binding,
                **more_kw_args).to_union_type
        end

        # Type inference for `pzip`: zip the receiver with each singleton argument
        # type and merge the results into one union type.
        PZIP_TYPE = proc do |rcvr_type, *args_types, send_node:|
            # TODO: Support multiple arguments for `pzip`
            types = args_types[0].map do |sing_type|
                raise AssertionError.new("Singleton type expected") if sing_type.is_union_type?
                rcvr_type.pzip(sing_type, generator_node: send_node).to_union_type
            end

            types.reduce(Types::UnionType.new) do |acc, type|
                acc.expand_return_type(type)
            end
        end

        # Type inference for `pstencil`: interpret the Ruby-level arguments
        # (neighborhood, default value, optional kwargs hash) and symbolically
        # execute the stencil.
        PSTENCIL_TYPE = proc do |rcvr_type, *args_types, send_node:|
            # TODO: Handle keyword arguments
            ruby_args = send_node.arguments.map do |node|
                AST::Interpreter.interpret(node)
            end

            more_kw_args = {}

            if args_types.size == 3
                if !ruby_args.last.is_a?(Hash)
                    raise ArgumentError.new("If 3 arguments are given, the last one must be a Hash of kwargs.")
                end

                # Pass kwargs separately
                more_kw_args = ruby_args.pop
            end

            SymbolicCycleFinder.raise_on_cycle(rcvr_type, send_node)

            rcvr_type.pstencil(
                *ruby_args,
                ast: send_node.block_argument,
                generator_node: send_node,
                # TODO: Fix binding
                command_binding: send_node.find_behavior_node.binding,
                **more_kw_args).to_union_type
        end

        # Type inference for `preduce`.
        PREDUCE_TYPE = proc do |rcvr_type, *args_types, send_node:|
            # TODO: Handle keyword arguments

            SymbolicCycleFinder.raise_on_cycle(rcvr_type, send_node)

            rcvr_type.preduce(ast: send_node.block_argument, generator_node: send_node).to_union_type
        end

        # Code generation for `__call__` on an array command: translate the
        # command into kernel launchers and emit the host-section launch snippet.
        LAUNCH_KERNEL = proc do |receiver, method_name, arguments, translator, result_type|
            # The result type is the symbolically executed result of applying this
            # parallel section. The result type is an ArrayCommand.
            array_command = receiver.get_type.singleton_type

            # Translate command
            command_translator = translator.command_translator
            command_translator.push_kernel_launcher
            result = array_command.accept(command_translator)
            kernel_launcher = command_translator.pop_kernel_launcher(result)

            # Prepare kernel launchers for launch of `array_command`
            command_translator.program_builder.prepare_additional_args_for_launch(array_command)

            # Generate launch code for all kernels
            launch_code = command_translator.program_builder.build_kernel_launchers

            # Always return a device pointer. Only at the very end, we transfer data to the host.
            result_expr = kernel_launcher.kernel_result_var_name

            if Translator::ArrayCommandStructBuilder::RequireRuntimeSizeChecker.require_size_function?(array_command)
                # Size is not statically known, take information from receiver.
                # TODO: Code depends on template. `cmd` is defined in template.
                result_size = "cmd->size()"
            else
                # Size is known statically
                result_size = array_command.size.to_s
            end

            # Debug information
            if array_command.generator_node != nil
                debug_information = array_command.to_s + ": " + array_command.generator_node.to_s
            else
                debug_information = array_command.to_s
            end

            result = Translator.read_file(file_name: "host_section_launch_parallel_section.cpp", replacements: {
                "debug_information" => debug_information,
                "array_command" => receiver.accept(translator.expression_translator),
                "array_command_type" => array_command.to_c_type,
                "result_size" => result_size,
                "kernel_invocation" => launch_code,
                "kernel_result" => result_expr,
                "free_memory" => command_translator.program_builder.build_memory_free_except_last})

            # Clear kernel launchers. Otherwise, we might launch them again in a later, unrelated
            # LAUNCH_KERNEL branch. This is because we reuse the same [ProgramBuilder] for an
            # entire host section.
            command_translator.program_builder.clear_kernel_launchers

            # Build all array command structs for this command
            command_translator.program_builder.add_array_command_struct(
                *Translator::ArrayCommandStructBuilder.build_all_structs(array_command))

            result
        end

        ARRAY_COMMAND_TO_ARRAY_TYPE = proc do |rcvr_type, *args_types, send_node:|
            Types::LocationAwareFixedSizeArrayType.new(
                rcvr_type.result_type,
                rcvr_type.dimensions,
                location: :device).to_union_type
        end

        # Code generation that builds a `new <command struct>(...)` expression
        # for a symbolically executed parallel operation.
        SYMBOLICALLY_EXECUTE_KERNEL = proc do |receiver, method_name, arguments, translator, result_type|
            if !result_type.is_singleton?
                raise AssertionError.new("Singleton type expected")
            end

            # Build arguments to constructor. First one (result field) is NULL.
            constructor_args = ["NULL"]

            # Translate all inputs (receiver, then arguments to parallel section)
            constructor_args.push(receiver.accept(translator.expression_translator))

            # FIX: use `each` instead of `for` (`for` leaks the loop variable
            # into the surrounding scope).
            arguments.each do |arg|
                if arg.get_type.is_singleton? &&
                        arg.get_type.singleton_type.is_a?(Symbolic::ArrayCommand)
                    # Only ArrayCommands should show up as arguments
                    constructor_args.push(arg.accept(translator.expression_translator))
                end
            end

            all_args = constructor_args.join(", ")

            # This is a hack because the type is a pointer type
            "new #{result_type.singleton_type.to_c_type[0...-2]}(#{all_args})"
        end

        ALL_LOCATION_AWARE_ARRAY_TYPES = proc do |type|
            type.is_a?(Types::LocationAwareArrayType)
        end

        LOCATION_AWARE_ARRAY_TO_HOST_ARRAY_TYPE = proc do |rcvr_type, *args_types|
            # TODO: Should also be able to handle variable variant
            Types::LocationAwareFixedSizeArrayType.new(
                rcvr_type.inner_type,
                rcvr_type.dimensions,
                location: :host).to_union_type
        end

        LOCATION_AWARE_ARRAY_CALL_TYPE = proc do |rcvr_type, *args_types|
            # Calling `__call__` on an array does not do anything
            rcvr_type.to_union_type
        end

        # Code generation: transfer an array to the host unless it is already there.
        COPY_ARRAY_TO_HOST = proc do |receiver, method_name, args, translator, result_type|
            if receiver.get_type.singleton_type.location == :host
                receiver.accept(translator.expression_translator)
            else
                c_type = receiver.get_type.singleton_type.inner_type.to_c_type

                Translator.read_file(file_name: "memcpy_device_to_host_expr.cpp", replacements: {
                    "type" => c_type,
                    "device_array" => receiver.accept(translator.expression_translator)})
            end
        end

        ARRAY_TYPE_TO_COMMAND_TYPE = proc do |rcvr_type, *args_types, send_node:|
            rcvr_type.to_command.to_union_type
        end

        FREE_MEMORY_FOR_ARRAY_COMMAND = proc do |receiver, method_name, args, translator, result_type|
            Translator.read_file(file_name: "free_memory_for_command.cpp", replacements: {
                "type" => receiver.get_type.to_c_type,
                "receiver" => receiver.accept(translator.expression_translator)})
        end

        # Manually free memory
        # TODO: Implement escape analysis and try to reuse memory
        implement(
            ALL_ARRAY_COMMAND_TYPES,
            :free_memory,
            BOOL,
            0,
            FREE_MEMORY_FOR_ARRAY_COMMAND)

        # Implement all parallel operations
        implement(
            ALL_ARRAY_COMMAND_TYPES,
            :pmap,
            PMAP_TYPE,
            0..1,
            SYMBOLICALLY_EXECUTE_KERNEL)

        implement(
            ALL_LOCATION_AWARE_ARRAY_TYPES,
            :to_command,
            ARRAY_TYPE_TO_COMMAND_TYPE,
            0,
            SYMBOLICALLY_EXECUTE_KERNEL)

        implement(
            ALL_ARRAY_COMMAND_TYPES,
            :pzip,
            PZIP_TYPE,
            1,
            SYMBOLICALLY_EXECUTE_KERNEL,
            expect_singleton_args: true)

        implement(
            ALL_ARRAY_COMMAND_TYPES,
            :pstencil,
            PSTENCIL_TYPE,
            2..3, # neighborhood and default value, maybe hash
            SYMBOLICALLY_EXECUTE_KERNEL)

        implement(
            ALL_ARRAY_COMMAND_TYPES,
            :preduce,
            PREDUCE_TYPE,
            0,
            SYMBOLICALLY_EXECUTE_KERNEL)

        implement(
            ALL_ARRAY_COMMAND_TYPES,
            :__call__,
            ARRAY_COMMAND_TO_ARRAY_TYPE,
            0,
            LAUNCH_KERNEL)

        implement(
            ALL_LOCATION_AWARE_ARRAY_TYPES,
            :__to_host_array__,
            LOCATION_AWARE_ARRAY_TO_HOST_ARRAY_TYPE,
            0,
            COPY_ARRAY_TO_HOST)

        implement(
            ALL_LOCATION_AWARE_ARRAY_TYPES,
            :__call__,
            LOCATION_AWARE_ARRAY_CALL_TYPE,
            0,
            "#0")
    end
end