ikra 0.0.1 → 0.0.2

This diff shows the changes between two publicly released versions of the package, as they appear in the public registry. It is provided for informational purposes only.
Files changed (104)
  1. checksums.yaml +4 -4
  2. data/lib/ast/builder.rb +225 -77
  3. data/lib/ast/host_section_builder.rb +38 -0
  4. data/lib/ast/interpreter.rb +67 -0
  5. data/lib/ast/lexical_variables_enumerator.rb +3 -2
  6. data/lib/ast/nodes.rb +521 -31
  7. data/lib/ast/printer.rb +116 -18
  8. data/lib/ast/ssa_generator.rb +192 -0
  9. data/lib/ast/visitor.rb +235 -21
  10. data/lib/config/configuration.rb +28 -3
  11. data/lib/config/os_configuration.rb +62 -9
  12. data/lib/cpu/cpu_implementation.rb +39 -0
  13. data/lib/ikra.rb +13 -3
  14. data/lib/resources/cuda/allocate_device_memory.cpp +5 -0
  15. data/lib/resources/cuda/allocate_host_memory.cpp +1 -0
  16. data/lib/resources/cuda/allocate_memcpy_environment_to_device.cpp +11 -0
  17. data/lib/resources/cuda/ast/assignment.cpp +1 -0
  18. data/lib/resources/cuda/block_function_head.cpp +7 -1
  19. data/lib/resources/cuda/entry_point.cpp +47 -0
  20. data/lib/resources/cuda/env_builder_copy_array.cpp +8 -2
  21. data/lib/resources/cuda/free_device_memory.cpp +3 -0
  22. data/lib/resources/cuda/free_memory_for_command.cpp +24 -0
  23. data/lib/resources/cuda/header.cpp +23 -9
  24. data/lib/resources/cuda/header_structs.cpp +92 -0
  25. data/lib/resources/cuda/host_section_block_function_head.cpp +12 -0
  26. data/lib/resources/cuda/host_section_entry_point.cpp +55 -0
  27. data/lib/resources/cuda/host_section_free_device_memory.cpp +18 -0
  28. data/lib/resources/cuda/host_section_launch_parallel_section.cpp +14 -0
  29. data/lib/resources/cuda/host_section_malloc_memcpy_device_to_host.cpp +10 -0
  30. data/lib/resources/cuda/kernel.cpp +9 -2
  31. data/lib/resources/cuda/launch_kernel.cpp +5 -0
  32. data/lib/resources/cuda/memcpy_device_to_host.cpp +3 -0
  33. data/lib/resources/cuda/memcpy_device_to_host_expr.cpp +10 -0
  34. data/lib/resources/cuda/reduce_body.cpp +88 -0
  35. data/lib/resources/cuda/stencil_array_reconstruction.cpp +2 -0
  36. data/lib/resources/cuda/stencil_body.cpp +16 -0
  37. data/lib/resources/cuda/struct_definition.cpp +4 -0
  38. data/lib/ruby_core/array.rb +34 -0
  39. data/lib/ruby_core/array_command.rb +313 -0
  40. data/lib/ruby_core/core.rb +103 -0
  41. data/lib/ruby_core/interpreter.rb +16 -0
  42. data/lib/ruby_core/math.rb +32 -0
  43. data/lib/ruby_core/ruby_integration.rb +256 -0
  44. data/lib/symbolic/host_section.rb +115 -0
  45. data/lib/symbolic/input.rb +87 -0
  46. data/lib/symbolic/input_visitor.rb +68 -0
  47. data/lib/symbolic/symbolic.rb +793 -117
  48. data/lib/symbolic/visitor.rb +70 -8
  49. data/lib/translator/array_command_struct_builder.rb +163 -0
  50. data/lib/translator/ast_translator.rb +572 -0
  51. data/lib/translator/block_translator.rb +104 -48
  52. data/lib/translator/commands/array_combine_command.rb +41 -0
  53. data/lib/translator/commands/array_identity_command.rb +28 -0
  54. data/lib/translator/commands/array_index_command.rb +52 -0
  55. data/lib/translator/commands/array_reduce_command.rb +135 -0
  56. data/lib/translator/commands/array_stencil_command.rb +129 -0
  57. data/lib/translator/commands/array_zip_command.rb +30 -0
  58. data/lib/translator/commands/command_translator.rb +264 -0
  59. data/lib/translator/cuda_errors.rb +32 -0
  60. data/lib/translator/environment_builder.rb +263 -0
  61. data/lib/translator/host_section/array_host_section_command.rb +150 -0
  62. data/lib/translator/host_section/array_in_host_section_command.rb +41 -0
  63. data/lib/translator/host_section/ast_translator.rb +14 -0
  64. data/lib/translator/host_section/parallel_section_invocation_visitor.rb +20 -0
  65. data/lib/translator/host_section/program_builder.rb +89 -0
  66. data/lib/translator/input_translator.rb +226 -0
  67. data/lib/translator/kernel_builder.rb +137 -0
  68. data/lib/translator/kernel_launcher/for_loop_kernel_launcher.rb +40 -0
  69. data/lib/translator/kernel_launcher/kernel_launcher.rb +259 -0
  70. data/lib/translator/kernel_launcher/while_loop_kernel_launcher.rb +38 -0
  71. data/lib/translator/last_returns_visitor.rb +19 -10
  72. data/lib/translator/program_builder.rb +197 -0
  73. data/lib/translator/program_launcher.rb +273 -0
  74. data/lib/translator/struct_type.rb +55 -0
  75. data/lib/translator/translator.rb +34 -11
  76. data/lib/translator/variable_classifier_visitor.rb +56 -0
  77. data/lib/types/inference/ast_inference.rb +586 -0
  78. data/lib/types/inference/clear_types_visitor.rb +11 -0
  79. data/lib/types/inference/command_inference.rb +101 -0
  80. data/lib/types/inference/input_inference.rb +62 -0
  81. data/lib/types/{object_tracer.rb → inference/object_tracer.rb} +5 -6
  82. data/lib/types/inference/ruby_extension.rb +35 -0
  83. data/lib/types/inference/symbol_table.rb +131 -0
  84. data/lib/types/types.rb +14 -0
  85. data/lib/types/types/array_command_type.rb +123 -0
  86. data/lib/types/types/array_type.rb +137 -0
  87. data/lib/types/{class_type.rb → types/class_type.rb} +42 -18
  88. data/lib/types/{primitive_type.rb → types/primitive_type.rb} +20 -7
  89. data/lib/types/types/ruby_type.rb +88 -0
  90. data/lib/types/types/struct_type.rb +179 -0
  91. data/lib/types/types/union_type.rb +239 -0
  92. metadata +160 -18
  93. data/lib/ast/method_definition.rb +0 -37
  94. data/lib/ast/translator.rb +0 -264
  95. data/lib/resources/cuda/kernel_launcher.cpp +0 -28
  96. data/lib/scope.rb +0 -166
  97. data/lib/translator/command_translator.rb +0 -421
  98. data/lib/translator/local_variables_enumerator.rb +0 -35
  99. data/lib/translator/method_translator.rb +0 -24
  100. data/lib/types/array_type.rb +0 -51
  101. data/lib/types/ruby_extension.rb +0 -67
  102. data/lib/types/ruby_type.rb +0 -45
  103. data/lib/types/type_inference.rb +0 -382
  104. data/lib/types/union_type.rb +0 -155
data/lib/resources/cuda/host_section_block_function_head.cpp
@@ -0,0 +1,12 @@
+ #undef checkErrorReturn
+ #define checkErrorReturn(result_var, expr) \
+ if (result_var->last_error = expr) \
+ {\
+     cudaError_t error = cudaGetLastError();\
+     printf("!!! Cuda Failure %s:%d (%i): '%s'\n", __FILE__, __LINE__, expr, cudaGetErrorString(error));\
+     cudaDeviceReset();\
+     return /*{result_type}*/::error_return_value;\
+ }
+
+ /*{result_type}*/ /*{name}*/(/*{parameters}*/)
+ /*{body}*/
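The `/*{...}*/` fragments throughout these templates are placeholders that ikra's translator substitutes during code generation. The error macro uses a deliberate assign-and-test idiom: `result_var->last_error = expr` stores the CUDA status code and branches on it in one expression; note that the template's `printf` then passes `expr` again, re-evaluating the call. A minimal self-contained sketch of the same idiom (the `result_t` shape here is invented for illustration, and it prints the stored status instead of re-evaluating):

```cuda
#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical stand-in for the generated result struct.
struct result_t {
    int last_error;
    static constexpr int error_return_value = -1;
};

// Assign-and-test idiom from the template: store the status code in
// result_var->last_error and branch on it in a single expression.
#define checkErrorReturn(result_var, expr) \
    if ((result_var)->last_error = (expr)) { \
        cudaError_t error = cudaGetLastError(); \
        printf("!!! Cuda Failure %s:%d (%i): '%s'\n", __FILE__, __LINE__, \
               (result_var)->last_error, cudaGetErrorString(error)); \
        cudaDeviceReset(); \
        return result_t::error_return_value; \
    }

int touch_device(result_t *program_result) {
    checkErrorReturn(program_result, cudaSetDevice(0));
    checkErrorReturn(program_result, cudaFree(0));  // forces context creation
    return 0;
}
```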
data/lib/resources/cuda/host_section_entry_point.cpp
@@ -0,0 +1,55 @@
+ #undef checkErrorReturn
+ #define checkErrorReturn(result_var, expr) \
+ if (result_var->last_error = expr) \
+ {\
+     cudaError_t error = cudaGetLastError();\
+     printf("!!! Cuda Failure %s:%d (%i): '%s'\n", __FILE__, __LINE__, expr, cudaGetErrorString(error));\
+     cudaDeviceReset();\
+     return result_var;\
+ }
+
+ extern "C" EXPORT result_t *launch_kernel(environment_t */*{host_env_var_name}*/)
+ {
+     // CUDA Initialization
+     program_result = new result_t();
+     program_result->device_allocations = new vector<void*>();
+
+     timeStartMeasure();
+
+     cudaError_t cudaStatus = cudaSetDevice(0);
+
+     if (cudaStatus != cudaSuccess) {
+         fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?\n");
+         program_result->last_error = -1;
+         return program_result;
+     }
+
+     checkErrorReturn(program_result, cudaFree(0));
+
+     timeReportMeasure(program_result, setup_cuda);
+
+
+     /* Prepare environment */
+     /*{prepare_environment}*/
+
+
+     /* Copy back memory and set pointer of result */
+     program_result->result = /*{host_result_array}*/;
+
+     /* Free device memory */
+     timeStartMeasure();
+
+     for (
+         auto device_ptr = program_result->device_allocations->begin();
+         device_ptr < program_result->device_allocations->end();
+         device_ptr++)
+     {
+         checkErrorReturn(program_result, cudaFree(*device_ptr));
+     }
+
+     delete program_result->device_allocations;
+
+     timeReportMeasure(program_result, free_memory);
+
+     return program_result;
+ }
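The entry point leans on scaffolding from the generated header (`header.cpp`/`header_structs.cpp` in the file list): a `result_t` with an error slot, a list of device allocations, and named timing counters. The following reconstruction is inferred purely from how the templates use these names; the gem's actual definitions may differ:

```cuda
#include <vector>
#include <chrono>

// Inferred shape of result_t: the entry point reads/writes exactly these
// members. Field types are guesses based on usage, not the gem's code.
struct result_t {
    int last_error;
    void *result;                             // host copy of the final array
    std::vector<void*> *device_allocations;   // every device pointer to free
    long long time_setup_cuda = 0, time_kernel = 0,
              time_transfer_memory = 0, time_free_memory = 0;
};

// Plausible stand-ins for the timing macros: start a stopwatch, then charge
// the elapsed nanoseconds to the counter named by `label`.
static std::chrono::high_resolution_clock::time_point _time_start_;
#define timeStartMeasure() (_time_start_ = std::chrono::high_resolution_clock::now())
#define timeReportMeasure(result_var, label) \
    ((result_var)->time_##label += std::chrono::duration_cast<std::chrono::nanoseconds>( \
        std::chrono::high_resolution_clock::now() - _time_start_).count())
```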
data/lib/resources/cuda/host_section_free_device_memory.cpp
@@ -0,0 +1,18 @@
+ timeStartMeasure();
+
+ if (/*{name}*/ != cmd->result) {
+     // Don't free memory if it is the result. There is already a similar check in
+     // program_builder (free all except for last). However, this check is not sufficient in
+     // case the same array is reused!
+
+     checkErrorReturn(program_result, cudaFree(/*{name}*/));
+     // Remove from list of allocations
+     program_result->device_allocations->erase(
+         std::remove(
+             program_result->device_allocations->begin(),
+             program_result->device_allocations->end(),
+             /*{name}*/),
+         program_result->device_allocations->end());
+ }
+
+ timeReportMeasure(program_result, free_memory);
data/lib/resources/cuda/host_section_launch_parallel_section.cpp
@@ -0,0 +1,14 @@
+ ({
+     // /*{debug_information}*/
+
+     /*{array_command_type}*/ cmd = /*{array_command}*/;
+
+     if (cmd->result == 0) {
+         /*{kernel_invocation}*/
+         cmd->result = /*{kernel_result}*/;
+
+         /*{free_memory}*/
+     }
+
+     variable_size_array_t((void *) cmd->result, /*{result_size}*/);
+ })
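The `({ ... })` wrapper is a GNU statement expression (a GCC/Clang extension that nvcc inherits from its host compiler): the block evaluates to the value of its last expression statement, which is why the template ends with a bare `variable_size_array_t(...)` and no `return`. A minimal standalone illustration of the construct:

```cuda
#include <cstdio>

int main() {
    // GNU extension: ({ ... }) is an expression whose value is that of its
    // last statement, so a multi-statement computation can sit anywhere an
    // expression is expected (not portable to MSVC).
    int doubled = ({
        int x = 21;
        x * 2;  // value of the whole statement expression
    });
    printf("%d\n", doubled);  // prints 42
    return 0;
}
```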
data/lib/resources/cuda/host_section_malloc_memcpy_device_to_host.cpp
@@ -0,0 +1,10 @@
+
+ {
+     /*{type}*/ * tmp_result = (/*{type}*/ *) malloc(/*{bytes}*/);
+
+     timeStartMeasure();
+     checkErrorReturn(program_result, cudaMemcpy(tmp_result, program_result->result, /*{bytes}*/, cudaMemcpyDeviceToHost));
+     timeReportMeasure(program_result, transfer_memory);
+
+     program_result->result = tmp_result;
+ }
data/lib/resources/cuda/kernel.cpp
@@ -1,8 +1,15 @@
 
 
- __global__ void kernel(environment_t */*{env_identifier}*/, /*{result_type}*/ *_result_)
+ __global__ void /*{kernel_name}*/(/*{parameters}*/)
  {
-     _result_[threadIdx.x + blockIdx.x * blockDim.x] = /*{block_invocation}*/;
+     int _tid_ = threadIdx.x + blockIdx.x * blockDim.x;
+
+     if (_tid_ < /*{num_threads}*/)
+     {
+         /*{execution}*/
+
+         _result_[_tid_] = /*{block_invocation}*/;
+     }
  }
 
 
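The guard added in this version matters because the launcher rounds the thread count up to whole blocks; the padding threads of the last block must not write past the end of `_result_`. A self-contained version of the same pattern (kernel and block-function names invented for illustration):

```cuda
#include <cuda_runtime.h>

// Stand-in for the translated Ruby block (/*{block_invocation}*/).
__device__ int _block_k_1_(int value) { return value * value; }

// Same shape as the generated kernel: derive a global thread id, then guard
// against the round-up threads of the last block before touching memory.
__global__ void kernel_squares(int *_result_, const int *input, int num_threads) {
    int _tid_ = threadIdx.x + blockIdx.x * blockDim.x;

    if (_tid_ < num_threads) {
        _result_[_tid_] = _block_k_1_(input[_tid_]);
    }
}
```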
data/lib/resources/cuda/launch_kernel.cpp
@@ -0,0 +1,5 @@
+ timeStartMeasure();
+ /*{kernel_name}*/<<</*{grid_dim}*/, /*{block_dim}*/>>>(/*{arguments}*/);
+ checkErrorReturn(program_result, cudaPeekAtLastError());
+ checkErrorReturn(program_result, cudaThreadSynchronize());
+ timeReportMeasure(program_result, kernel);
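Here `cudaPeekAtLastError` catches launch-configuration errors and the synchronize call surfaces errors raised during kernel execution (`cudaThreadSynchronize` is the long-deprecated spelling of `cudaDeviceSynchronize`, kept as in the template). A hypothetical instantiation for the `kernel_squares` sketch above, with the usual round-up grid sizing:

```cuda
// Hypothetical instantiation of the launch template. Assumes kernel_squares,
// checkErrorReturn, result_t, and the timing macros from the sketches above;
// n is the logical thread count.
int launch_squares(result_t *program_result, int *d_result, const int *d_input, int n) {
    int block_dim = 256;
    int grid_dim = (n + block_dim - 1) / block_dim;  // round up: why the kernel guards _tid_

    timeStartMeasure();
    kernel_squares<<<grid_dim, block_dim>>>(d_result, d_input, n);
    checkErrorReturn(program_result, cudaPeekAtLastError());    // launch errors
    checkErrorReturn(program_result, cudaDeviceSynchronize());  // execution errors
    timeReportMeasure(program_result, kernel);
    return 0;
}
```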
data/lib/resources/cuda/memcpy_device_to_host.cpp
@@ -0,0 +1,3 @@
+ timeStartMeasure();
+ checkErrorReturn(program_result, cudaMemcpy(/*{host_name}*/, /*{device_name}*/, /*{bytes}*/, cudaMemcpyDeviceToHost));
+ timeReportMeasure(program_result, transfer_memory);
data/lib/resources/cuda/memcpy_device_to_host_expr.cpp
@@ -0,0 +1,10 @@
+ ({
+     variable_size_array_t device_array = /*{device_array}*/;
+     /*{type}*/ * tmp_result = (/*{type}*/ *) malloc(sizeof(/*{type}*/) * device_array.size);
+
+     timeStartMeasure();
+     checkErrorReturn(program_result, cudaMemcpy(tmp_result, device_array.content, sizeof(/*{type}*/) * device_array.size, cudaMemcpyDeviceToHost));
+     timeReportMeasure(program_result, transfer_memory);
+
+     variable_size_array_t((void *) tmp_result, device_array.size);
+ })
data/lib/resources/cuda/reduce_body.cpp
@@ -0,0 +1,88 @@
+ int thread_idx = threadIdx.x;
+
+ // Single result of this block
+ /*{type}*/ /*{temp_result}*/;
+
+ int num_args = 2 * /*{block_size}*/;
+ if (blockIdx.x == gridDim.x - 1)
+ {
+     // Processing the last block, which might be odd (number of elements to reduce).
+     // Other blocks cannot be "odd", because every block reduces 2*block_size many elements.
+
+     // Number of elements to reduce in the last block
+     num_args = ((2 * /*{num_threads}*/ - 1) % (2 * /*{block_size}*/)) + (/*{odd}*/ ? 0 : 1);
+ }
+
+ if (num_args == 1)
+ {
+     /*{temp_result}*/ = /*{previous_result}*/[_tid_];
+ }
+ else if (num_args == 2)
+ {
+     /*{temp_result}*/ = /*{block_name}*/(/*{arguments}*/, /*{previous_result}*/[_tid_], /*{previous_result}*/[_tid_ + /*{num_threads}*/]);
+ }
+ else
+ {
+     // Allocate block_size many slots to contain the result of up to block_size many reductions, i.e.,
+     // this array contains the reduction of (up to) 2*block_size many elements.
+     __shared__ /*{type}*/ sdata[/*{block_size}*/];
+
+     /*{odd}*/ = num_args % 2 == 1;
+
+     // --- FIRST REDUCTION --- Load from global memory
+     // Number of elements after the first reduction
+     num_args = num_args / 2 + num_args % 2;
+
+     if (thread_idx == num_args - 1 && /*{odd}*/)
+     {
+         // This is the last thread, and it should reduce only one element.
+         sdata[thread_idx] = /*{previous_result}*/[_tid_];
+     }
+     else
+     {
+         sdata[thread_idx] = /*{block_name}*/(/*{arguments}*/, /*{previous_result}*/[_tid_], /*{previous_result}*/[_tid_ + /*{num_threads}*/]);
+     }
+
+     __syncthreads();
+
+
+     // --- SUBSEQUENT REDUCTION --- Read from shared memory only
+     /*{odd}*/ = num_args % 2 == 1;
+
+     for (
+         num_args = num_args / 2 + num_args % 2;    // Number of elements after this reduction
+         num_args > 1;                              // ... as long as there's at least 3 elements left
+         num_args = num_args / 2 + num_args % 2) {
+
+         if (thread_idx < num_args) {
+             // This thread has work to do...
+
+             if (thread_idx != num_args - 1 || !/*{odd}*/)
+             {
+                 sdata[thread_idx] = /*{block_name}*/(/*{arguments}*/, sdata[thread_idx], sdata[thread_idx + num_args]);
+             }
+             else
+             {
+                 // This is the last element and it is odd, do nothing
+             }
+         }
+
+         __syncthreads();
+
+         /*{odd}*/ = num_args % 2 == 1;
+     }
+
+     if (thread_idx == 0)
+     {
+         // Last thread returns result
+         /*{temp_result}*/ = /*{block_name}*/(/*{arguments}*/, sdata[0], sdata[1]);
+     }
+ }
+
+ // Write result to different position
+ _tid_ = blockIdx.x;
+
+ if (thread_idx != 0) {
+     // Only one thread should report the result
+     return;
+ }
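Most of the branching above handles arbitrary (possibly odd) element counts. Stripped to a power-of-two size, the underlying scheme is the classic shared-memory tree reduction over `2 * block_size` elements per block:

```cuda
#include <cuda_runtime.h>

#define BLOCK_SIZE 256  // must match the launch configuration

// Simplified power-of-two variant of the reduction pattern above: each block
// combines 2*BLOCK_SIZE input elements (input length must be a multiple of
// that), then repeatedly halves the active element count in shared memory
// until one value per block remains.
__global__ void reduce_sum(const int *input, int *block_results) {
    __shared__ int sdata[BLOCK_SIZE];
    int thread_idx = threadIdx.x;
    int base = blockIdx.x * 2 * BLOCK_SIZE;

    // First reduction while loading from global memory.
    sdata[thread_idx] = input[base + thread_idx] + input[base + thread_idx + BLOCK_SIZE];
    __syncthreads();

    // Subsequent reductions read from shared memory only.
    for (int num_args = BLOCK_SIZE / 2; num_args > 0; num_args /= 2) {
        if (thread_idx < num_args) {
            sdata[thread_idx] += sdata[thread_idx + num_args];
        }
        __syncthreads();
    }

    if (thread_idx == 0) {
        block_results[blockIdx.x] = sdata[0];  // one result per block
    }
}
```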
data/lib/resources/cuda/stencil_array_reconstruction.cpp
@@ -0,0 +1,2 @@
+ // (Re)construct array from separately passed parameters
+ /*{type}*/ /*{name}*/[] = /*{initializer}*/;
data/lib/resources/cuda/stencil_body.cpp
@@ -0,0 +1,16 @@
+ /*{result_type}*/ /*{temp_var}*/;
+
+ // Indices for all dimensions
+ /*{compute_indices}*/
+
+ if (/*{out_of_bounds_check}*/)
+ {
+     // All value indices within bounds
+     /*{execution}*/
+     /*{temp_var}*/ = /*{stencil_computation}*/;
+ }
+ else
+ {
+     // At least one index is out of bounds
+     /*{temp_var}*/ = /*{out_of_bounds_fallback}*/;
+ }
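As a concrete (entirely invented) instantiation of this template: a one-dimensional 3-point stencil with out-of-bounds fallback 0 would fill the placeholders roughly like this:

```cuda
// Hypothetical instantiation of the stencil body. Placeholder mapping:
//   /*{compute_indices}*/     -> the three neighbor indices
//   /*{out_of_bounds_check}*/ -> the range test on all of them
//   /*{stencil_computation}*/ -> the translated Ruby block
//   /*{out_of_bounds_fallback}*/ -> 0
__device__ int stencil_at(const int *base, int size, int _tid_) {
    int temp_var;

    // Indices for all dimensions
    int idx_m1 = _tid_ - 1, idx_0 = _tid_, idx_p1 = _tid_ + 1;

    if (idx_m1 >= 0 && idx_p1 < size) {
        // All value indices within bounds
        temp_var = base[idx_m1] + base[idx_0] + base[idx_p1];
    } else {
        // At least one index is out of bounds
        temp_var = 0;
    }
    return temp_var;
}
```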
data/lib/resources/cuda/struct_definition.cpp
@@ -0,0 +1,4 @@
+ struct /*{name}*/
+ {
+     /*{fields}*/
+ };
data/lib/ruby_core/array.rb
@@ -0,0 +1,34 @@
+ module Ikra
+     module RubyIntegration
+         ALL_ARRAY_TYPES = proc do |type|
+             type.is_a?(Types::ArrayType) && !type.is_a?(Types::LocationAwareArrayType)
+         end
+
+         LOCATION_AWARE_ARRAY_TYPE = proc do |type|
+             # TODO: Maybe there should be an automated transfer to host side here if necessary?
+             type.is_a?(Types::LocationAwareArrayType)
+         end
+
+         LOCATION_AWARE_ARRAY_ACCESS = proc do |receiver, method_name, args, translator, result_type|
+
+             recv = receiver.accept(translator.expression_translator)
+             inner_type = receiver.get_type.singleton_type.inner_type.to_c_type
+             index = args[0].accept(translator.expression_translator)
+
+             "((#{inner_type} *) #{recv}.content)[#{index}]"
+         end
+
+         INNER_TYPE = proc do |rcvr|
+             rcvr.inner_type
+         end
+
+         implement ALL_ARRAY_TYPES, :[], INNER_TYPE, 1, "#0[#I1]"
+
+         implement(
+             LOCATION_AWARE_ARRAY_TYPE,
+             :[],
+             INNER_TYPE,
+             1,
+             LOCATION_AWARE_ARRAY_ACCESS)
+     end
+ end
data/lib/ruby_core/array_command.rb
@@ -0,0 +1,313 @@
+ require_relative "../types/types/array_type.rb"
+ require_relative "../ast/interpreter.rb"
+
+ module Ikra
+     module RubyIntegration
+
+         # This visitor traverses the tree of symbolically executed parallel operations. It raises
+         # an exception, if an array command was generated by symbolic execution/interpretation of
+         # `send_node`.
+         class SymbolicCycleFinder < Symbolic::Visitor
+             def self.raise_on_cycle(command, send_node)
+                 visitor = self.new(send_node)
+                 command.accept(visitor)
+             end
+
+             def initialize(send_node)
+                 @send_node = send_node
+             end
+
+             def visit_array_command(node)
+                 if node.generator_node == @send_node
+                     raise CycleDetectedError.new(node: node)
+                 else
+                     # No cycle found yet, check dependent computations
+                     super
+                 end
+             end
+         end
+
+         class CycleDetectedError < RuntimeError
+             def initialize(node:)
+                 @node = node
+             end
+         end
+
+         ALL_ARRAY_COMMAND_TYPES = proc do |type|
+             type.is_a?(Symbolic::ArrayCommand)
+         end
+
+         PMAP_TYPE = proc do |rcvr_type, *args_types, send_node:|
+             # TODO: Handle keyword arguments
+
+             # Ensure that there is no cycle here. "Cycle" means that the same AST send node
+             # was used earlier (i.e., in one of `rcvr_type`'s inputs/dependent computations).
+             # In that case we have to abort type inference here, because it would not terminate.
+             SymbolicCycleFinder.raise_on_cycle(rcvr_type, send_node)
+
+             more_kw_args = {}
+
+             if send_node.arguments.size == 1
+                 if !send_node.arguments.first.is_a?(AST::HashNode)
+                     raise ArgumentError.new("If an argument is given, it must be a Hash of kwargs.")
+                 end
+
+                 # Pass kwargs separately
+                 more_kw_args = AST::Interpreter.interpret(send_node.arguments.first)
+             end
+
+             rcvr_type.pmap(
+                 ast: send_node.block_argument,
+                 generator_node: send_node,
+                 # TODO: Fix binding
+                 command_binding: send_node.find_behavior_node.binding,
+                 **more_kw_args).to_union_type
+         end
+
+         PZIP_TYPE = proc do |rcvr_type, *args_types, send_node:|
+             # TODO: Support multiple arguments for `pzip`
+             types = args_types[0].map do |sing_type|
+                 raise AssertionError.new("Singleton type expected") if sing_type.is_union_type?
+                 rcvr_type.pzip(sing_type, generator_node: send_node).to_union_type
+             end
+
+             types.reduce(Types::UnionType.new) do |acc, type|
+                 acc.expand_return_type(type)
+             end
+         end
+
+         PSTENCIL_TYPE = proc do |rcvr_type, *args_types, send_node:|
+             # TODO: Handle keyword arguments
+             ruby_args = send_node.arguments.map do |node|
+                 AST::Interpreter.interpret(node)
+             end
+
+             more_kw_args = {}
+
+             if args_types.size == 3
+                 if !ruby_args.last.is_a?(Hash)
+                     raise ArgumentError.new("If 3 arguments are given, the last one must be a Hash of kwargs.")
+                 end
+
+                 # Pass kwargs separately
+                 more_kw_args = ruby_args.pop
+             end
+
+             SymbolicCycleFinder.raise_on_cycle(rcvr_type, send_node)
+
+             rcvr_type.pstencil(
+                 *ruby_args,
+                 ast: send_node.block_argument,
+                 generator_node: send_node,
+                 # TODO: Fix binding
+                 command_binding: send_node.find_behavior_node.binding,
+                 **more_kw_args).to_union_type
+         end
+
+         PREDUCE_TYPE = proc do |rcvr_type, *args_types, send_node:|
+             # TODO: Handle keyword arguments
+
+             SymbolicCycleFinder.raise_on_cycle(rcvr_type, send_node)
+
+             rcvr_type.preduce(ast: send_node.block_argument, generator_node: send_node).to_union_type
+         end
+
+         LAUNCH_KERNEL = proc do |receiver, method_name, arguments, translator, result_type|
+             # The result type is the symbolically executed result of applying this
+             # parallel section. The result type is an ArrayCommand.
+             array_command = receiver.get_type.singleton_type
+
+             # Translate command
+             command_translator = translator.command_translator
+             command_translator.push_kernel_launcher
+             result = array_command.accept(command_translator)
+             kernel_launcher = command_translator.pop_kernel_launcher(result)
+
+             # Prepare kernel launchers for launch of `array_command`
+             command_translator.program_builder.prepare_additional_args_for_launch(array_command)
+
+             # Generate launch code for all kernels
+             launch_code = command_translator.program_builder.build_kernel_launchers
+
+             # Always return a device pointer. Only at the very end, we transfer data to the host.
+             result_expr = kernel_launcher.kernel_result_var_name
+
+             if Translator::ArrayCommandStructBuilder::RequireRuntimeSizeChecker.require_size_function?(array_command)
+
+                 # Size is not statically known, take information from receiver.
+                 # TODO: Code depends on template. `cmd` is defined in template.
+                 result_size = "cmd->size()"
+             else
+                 # Size is known statically
+                 result_size = array_command.size.to_s
+             end
+
+             # Debug information
+             if array_command.generator_node != nil
+                 debug_information = array_command.to_s + ": " + array_command.generator_node.to_s
+             else
+                 debug_information = array_command.to_s
+             end
+
+             result = Translator.read_file(file_name: "host_section_launch_parallel_section.cpp", replacements: {
+                 "debug_information" => debug_information,
+                 "array_command" => receiver.accept(translator.expression_translator),
+                 "array_command_type" => array_command.to_c_type,
+                 "result_size" => result_size,
+                 "kernel_invocation" => launch_code,
+                 "kernel_result" => result_expr,
+                 "free_memory" => command_translator.program_builder.build_memory_free_except_last})
+
+             # Clear kernel launchers. Otherwise, we might launch them again in a later, unrelated
+             # LAUNCH_KERNEL branch. This is because we reuse the same [ProgramBuilder] for an
+             # entire host section.
+             command_translator.program_builder.clear_kernel_launchers
+
+             # Build all array command structs for this command
+             command_translator.program_builder.add_array_command_struct(
+                 *Translator::ArrayCommandStructBuilder.build_all_structs(array_command))
+
+             result
+         end
+
+         ARRAY_COMMAND_TO_ARRAY_TYPE = proc do |rcvr_type, *args_types, send_node:|
+             Types::LocationAwareFixedSizeArrayType.new(
+                 rcvr_type.result_type,
+                 rcvr_type.dimensions,
+                 location: :device).to_union_type
+         end
+
+         SYMBOLICALLY_EXECUTE_KERNEL = proc do |receiver, method_name, arguments, translator, result_type|
+             if !result_type.is_singleton?
+                 raise AssertionError.new("Singleton type expected")
+             end
+
+             # Build arguments to constructor. First one (result field) is NULL.
+             constructor_args = ["NULL"]
+
+             # Translate all inputs (receiver, then arguments to parallel section)
+             constructor_args.push(receiver.accept(translator.expression_translator))
+
+             for arg in arguments
+                 if arg.get_type.is_singleton? &&
+                         arg.get_type.singleton_type.is_a?(Symbolic::ArrayCommand)
+
+                     # Only ArrayCommands should show up as arguments
+                     constructor_args.push(arg.accept(translator.expression_translator))
+                 end
+             end
+
+             all_args = constructor_args.join(", ")
+
+             # This is a hack because the type is a pointer type
+             "new #{result_type.singleton_type.to_c_type[0...-2]}(#{all_args})"
+         end
+
+         ALL_LOCATION_AWARE_ARRAY_TYPES = proc do |type|
+             type.is_a?(Types::LocationAwareArrayType)
+         end
+
+         LOCATION_AWARE_ARRAY_TO_HOST_ARRAY_TYPE = proc do |rcvr_type, *args_types|
+             # TODO: Should also be able to handle variable variant
+             Types::LocationAwareFixedSizeArrayType.new(
+                 rcvr_type.inner_type,
+                 rcvr_type.dimensions,
+                 location: :host).to_union_type
+         end
+
+         LOCATION_AWARE_ARRAY_CALL_TYPE = proc do |rcvr_type, *args_types|
+             # Calling `__call__` on an array does not do anything
+             rcvr_type.to_union_type
+         end
+
+         COPY_ARRAY_TO_HOST = proc do |receiver, method_name, args, translator, result_type|
+             if receiver.get_type.singleton_type.location == :host
+                 receiver.accept(translator.expression_translator)
+             else
+                 c_type = receiver.get_type.singleton_type.inner_type.to_c_type
+
+                 Translator.read_file(file_name: "memcpy_device_to_host_expr.cpp", replacements: {
+                     "type" => c_type,
+                     "device_array" => receiver.accept(translator.expression_translator)})
+             end
+         end
+
+         ARRAY_TYPE_TO_COMMAND_TYPE = proc do |rcvr_type, *args_types, send_node:|
+             rcvr_type.to_command.to_union_type
+         end
+
+         FREE_MEMORY_FOR_ARRAY_COMMAND = proc do |receiver, method_name, args, translator, result_type|
+
+             Translator.read_file(file_name: "free_memory_for_command.cpp", replacements: {
+                 "type" => receiver.get_type.to_c_type,
+                 "receiver" => receiver.accept(translator.expression_translator)})
+         end
+
+         # Manually free memory
+         # TODO: Implement escape analysis and try to reuse memory
+         implement(
+             ALL_ARRAY_COMMAND_TYPES,
+             :free_memory,
+             BOOL,
+             0,
+             FREE_MEMORY_FOR_ARRAY_COMMAND)
+
+         # Implement all parallel operations
+         implement(
+             ALL_ARRAY_COMMAND_TYPES,
+             :pmap,
+             PMAP_TYPE,
+             0..1,
+             SYMBOLICALLY_EXECUTE_KERNEL)
+
+         implement(
+             ALL_LOCATION_AWARE_ARRAY_TYPES,
+             :to_command,
+             ARRAY_TYPE_TO_COMMAND_TYPE,
+             0,
+             SYMBOLICALLY_EXECUTE_KERNEL)
+
+         implement(
+             ALL_ARRAY_COMMAND_TYPES,
+             :pzip,
+             PZIP_TYPE,
+             1,
+             SYMBOLICALLY_EXECUTE_KERNEL,
+             expect_singleton_args: true)
+
+         implement(
+             ALL_ARRAY_COMMAND_TYPES,
+             :pstencil,
+             PSTENCIL_TYPE,
+             2..3, # neighborhood and default value, maybe hash
+             SYMBOLICALLY_EXECUTE_KERNEL)
+
+         implement(
+             ALL_ARRAY_COMMAND_TYPES,
+             :preduce,
+             PREDUCE_TYPE,
+             0,
+             SYMBOLICALLY_EXECUTE_KERNEL)
+
+         implement(
+             ALL_ARRAY_COMMAND_TYPES,
+             :__call__,
+             ARRAY_COMMAND_TO_ARRAY_TYPE,
+             0,
+             LAUNCH_KERNEL)
+
+         implement(
+             ALL_LOCATION_AWARE_ARRAY_TYPES,
+             :__to_host_array__,
+             LOCATION_AWARE_ARRAY_TO_HOST_ARRAY_TYPE,
+             0,
+             COPY_ARRAY_TO_HOST)
+
+         implement(
+             ALL_LOCATION_AWARE_ARRAY_TYPES,
+             :__call__,
+             LOCATION_AWARE_ARRAY_CALL_TYPE,
+             0,
+             "#0")
+     end
+ end