ikra 0.0.1 → 0.0.2

This diff shows the changes between two publicly released versions of the package, as they appear in the public registry. It is provided for informational purposes only.
Files changed (104)
  1. checksums.yaml +4 -4
  2. data/lib/ast/builder.rb +225 -77
  3. data/lib/ast/host_section_builder.rb +38 -0
  4. data/lib/ast/interpreter.rb +67 -0
  5. data/lib/ast/lexical_variables_enumerator.rb +3 -2
  6. data/lib/ast/nodes.rb +521 -31
  7. data/lib/ast/printer.rb +116 -18
  8. data/lib/ast/ssa_generator.rb +192 -0
  9. data/lib/ast/visitor.rb +235 -21
  10. data/lib/config/configuration.rb +28 -3
  11. data/lib/config/os_configuration.rb +62 -9
  12. data/lib/cpu/cpu_implementation.rb +39 -0
  13. data/lib/ikra.rb +13 -3
  14. data/lib/resources/cuda/allocate_device_memory.cpp +5 -0
  15. data/lib/resources/cuda/allocate_host_memory.cpp +1 -0
  16. data/lib/resources/cuda/allocate_memcpy_environment_to_device.cpp +11 -0
  17. data/lib/resources/cuda/ast/assignment.cpp +1 -0
  18. data/lib/resources/cuda/block_function_head.cpp +7 -1
  19. data/lib/resources/cuda/entry_point.cpp +47 -0
  20. data/lib/resources/cuda/env_builder_copy_array.cpp +8 -2
  21. data/lib/resources/cuda/free_device_memory.cpp +3 -0
  22. data/lib/resources/cuda/free_memory_for_command.cpp +24 -0
  23. data/lib/resources/cuda/header.cpp +23 -9
  24. data/lib/resources/cuda/header_structs.cpp +92 -0
  25. data/lib/resources/cuda/host_section_block_function_head.cpp +12 -0
  26. data/lib/resources/cuda/host_section_entry_point.cpp +55 -0
  27. data/lib/resources/cuda/host_section_free_device_memory.cpp +18 -0
  28. data/lib/resources/cuda/host_section_launch_parallel_section.cpp +14 -0
  29. data/lib/resources/cuda/host_section_malloc_memcpy_device_to_host.cpp +10 -0
  30. data/lib/resources/cuda/kernel.cpp +9 -2
  31. data/lib/resources/cuda/launch_kernel.cpp +5 -0
  32. data/lib/resources/cuda/memcpy_device_to_host.cpp +3 -0
  33. data/lib/resources/cuda/memcpy_device_to_host_expr.cpp +10 -0
  34. data/lib/resources/cuda/reduce_body.cpp +88 -0
  35. data/lib/resources/cuda/stencil_array_reconstruction.cpp +2 -0
  36. data/lib/resources/cuda/stencil_body.cpp +16 -0
  37. data/lib/resources/cuda/struct_definition.cpp +4 -0
  38. data/lib/ruby_core/array.rb +34 -0
  39. data/lib/ruby_core/array_command.rb +313 -0
  40. data/lib/ruby_core/core.rb +103 -0
  41. data/lib/ruby_core/interpreter.rb +16 -0
  42. data/lib/ruby_core/math.rb +32 -0
  43. data/lib/ruby_core/ruby_integration.rb +256 -0
  44. data/lib/symbolic/host_section.rb +115 -0
  45. data/lib/symbolic/input.rb +87 -0
  46. data/lib/symbolic/input_visitor.rb +68 -0
  47. data/lib/symbolic/symbolic.rb +793 -117
  48. data/lib/symbolic/visitor.rb +70 -8
  49. data/lib/translator/array_command_struct_builder.rb +163 -0
  50. data/lib/translator/ast_translator.rb +572 -0
  51. data/lib/translator/block_translator.rb +104 -48
  52. data/lib/translator/commands/array_combine_command.rb +41 -0
  53. data/lib/translator/commands/array_identity_command.rb +28 -0
  54. data/lib/translator/commands/array_index_command.rb +52 -0
  55. data/lib/translator/commands/array_reduce_command.rb +135 -0
  56. data/lib/translator/commands/array_stencil_command.rb +129 -0
  57. data/lib/translator/commands/array_zip_command.rb +30 -0
  58. data/lib/translator/commands/command_translator.rb +264 -0
  59. data/lib/translator/cuda_errors.rb +32 -0
  60. data/lib/translator/environment_builder.rb +263 -0
  61. data/lib/translator/host_section/array_host_section_command.rb +150 -0
  62. data/lib/translator/host_section/array_in_host_section_command.rb +41 -0
  63. data/lib/translator/host_section/ast_translator.rb +14 -0
  64. data/lib/translator/host_section/parallel_section_invocation_visitor.rb +20 -0
  65. data/lib/translator/host_section/program_builder.rb +89 -0
  66. data/lib/translator/input_translator.rb +226 -0
  67. data/lib/translator/kernel_builder.rb +137 -0
  68. data/lib/translator/kernel_launcher/for_loop_kernel_launcher.rb +40 -0
  69. data/lib/translator/kernel_launcher/kernel_launcher.rb +259 -0
  70. data/lib/translator/kernel_launcher/while_loop_kernel_launcher.rb +38 -0
  71. data/lib/translator/last_returns_visitor.rb +19 -10
  72. data/lib/translator/program_builder.rb +197 -0
  73. data/lib/translator/program_launcher.rb +273 -0
  74. data/lib/translator/struct_type.rb +55 -0
  75. data/lib/translator/translator.rb +34 -11
  76. data/lib/translator/variable_classifier_visitor.rb +56 -0
  77. data/lib/types/inference/ast_inference.rb +586 -0
  78. data/lib/types/inference/clear_types_visitor.rb +11 -0
  79. data/lib/types/inference/command_inference.rb +101 -0
  80. data/lib/types/inference/input_inference.rb +62 -0
  81. data/lib/types/{object_tracer.rb → inference/object_tracer.rb} +5 -6
  82. data/lib/types/inference/ruby_extension.rb +35 -0
  83. data/lib/types/inference/symbol_table.rb +131 -0
  84. data/lib/types/types.rb +14 -0
  85. data/lib/types/types/array_command_type.rb +123 -0
  86. data/lib/types/types/array_type.rb +137 -0
  87. data/lib/types/{class_type.rb → types/class_type.rb} +42 -18
  88. data/lib/types/{primitive_type.rb → types/primitive_type.rb} +20 -7
  89. data/lib/types/types/ruby_type.rb +88 -0
  90. data/lib/types/types/struct_type.rb +179 -0
  91. data/lib/types/types/union_type.rb +239 -0
  92. metadata +160 -18
  93. data/lib/ast/method_definition.rb +0 -37
  94. data/lib/ast/translator.rb +0 -264
  95. data/lib/resources/cuda/kernel_launcher.cpp +0 -28
  96. data/lib/scope.rb +0 -166
  97. data/lib/translator/command_translator.rb +0 -421
  98. data/lib/translator/local_variables_enumerator.rb +0 -35
  99. data/lib/translator/method_translator.rb +0 -24
  100. data/lib/types/array_type.rb +0 -51
  101. data/lib/types/ruby_extension.rb +0 -67
  102. data/lib/types/ruby_type.rb +0 -45
  103. data/lib/types/type_inference.rb +0 -382
  104. data/lib/types/union_type.rb +0 -155
data/lib/resources/cuda/host_section_block_function_head.cpp
@@ -0,0 +1,12 @@
+ #undef checkErrorReturn
+ #define checkErrorReturn(result_var, expr) \
+ if (result_var->last_error = expr) \
+ {\
+     cudaError_t error = cudaGetLastError();\
+     printf("!!! Cuda Failure %s:%d (%i): '%s'\n", __FILE__, __LINE__, expr, cudaGetErrorString(error));\
+     cudaDeviceReset();\
+     return /*{result_type}*/::error_return_value;\
+ }
+
+ /*{result_type}*/ /*{name}*/(/*{parameters}*/)
+ /*{body}*/
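The `/*{...}*/` fragments throughout these templates are placeholders that ikra's translator substitutes during code generation. The error macro uses a deliberate assign-and-test idiom: `result_var->last_error = expr` stores the CUDA status code and branches on it in one expression; note that the template's `printf` then passes `expr` again, re-evaluating the call. A minimal self-contained sketch of the same idiom (the `result_t` shape here is invented for illustration, and it prints the stored status instead of re-evaluating):

```cuda
#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical stand-in for the generated result struct.
struct result_t {
    int last_error;
    static constexpr int error_return_value = -1;
};

// Assign-and-test idiom from the template: store the status code in
// result_var->last_error and branch on it in a single expression.
#define checkErrorReturn(result_var, expr) \
    if ((result_var)->last_error = (expr)) { \
        cudaError_t error = cudaGetLastError(); \
        printf("!!! Cuda Failure %s:%d (%i): '%s'\n", __FILE__, __LINE__, \
               (result_var)->last_error, cudaGetErrorString(error)); \
        cudaDeviceReset(); \
        return result_t::error_return_value; \
    }

int touch_device(result_t *program_result) {
    checkErrorReturn(program_result, cudaSetDevice(0));
    checkErrorReturn(program_result, cudaFree(0));  // forces context creation
    return 0;
}
```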
data/lib/resources/cuda/host_section_entry_point.cpp
@@ -0,0 +1,55 @@
+ #undef checkErrorReturn
+ #define checkErrorReturn(result_var, expr) \
+ if (result_var->last_error = expr) \
+ {\
+     cudaError_t error = cudaGetLastError();\
+     printf("!!! Cuda Failure %s:%d (%i): '%s'\n", __FILE__, __LINE__, expr, cudaGetErrorString(error));\
+     cudaDeviceReset();\
+     return result_var;\
+ }
+
+ extern "C" EXPORT result_t *launch_kernel(environment_t */*{host_env_var_name}*/)
+ {
+     // CUDA Initialization
+     program_result = new result_t();
+     program_result->device_allocations = new vector<void*>();
+
+     timeStartMeasure();
+
+     cudaError_t cudaStatus = cudaSetDevice(0);
+
+     if (cudaStatus != cudaSuccess) {
+         fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?\n");
+         program_result->last_error = -1;
+         return program_result;
+     }
+
+     checkErrorReturn(program_result, cudaFree(0));
+
+     timeReportMeasure(program_result, setup_cuda);
+
+
+     /* Prepare environment */
+     /*{prepare_environment}*/
+
+
+     /* Copy back memory and set pointer of result */
+     program_result->result = /*{host_result_array}*/;
+
+     /* Free device memory */
+     timeStartMeasure();
+
+     for (
+         auto device_ptr = program_result->device_allocations->begin();
+         device_ptr < program_result->device_allocations->end();
+         device_ptr++)
+     {
+         checkErrorReturn(program_result, cudaFree(*device_ptr));
+     }
+
+     delete program_result->device_allocations;
+
+     timeReportMeasure(program_result, free_memory);
+
+     return program_result;
+ }
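The entry point leans on scaffolding from the generated header (`header.cpp`/`header_structs.cpp` in the file list): a `result_t` with an error slot, a list of device allocations, and named timing counters. The following reconstruction is inferred purely from how the templates use these names; the gem's actual definitions may differ:

```cuda
#include <vector>
#include <chrono>

// Inferred shape of result_t: the entry point reads/writes exactly these
// members. Field types are guesses based on usage, not the gem's code.
struct result_t {
    int last_error;
    void *result;                             // host copy of the final array
    std::vector<void*> *device_allocations;   // every device pointer to free
    long long time_setup_cuda = 0, time_kernel = 0,
              time_transfer_memory = 0, time_free_memory = 0;
};

// Plausible stand-ins for the timing macros: start a stopwatch, then charge
// the elapsed nanoseconds to the counter named by `label`.
static std::chrono::high_resolution_clock::time_point _time_start_;
#define timeStartMeasure() (_time_start_ = std::chrono::high_resolution_clock::now())
#define timeReportMeasure(result_var, label) \
    ((result_var)->time_##label += std::chrono::duration_cast<std::chrono::nanoseconds>( \
        std::chrono::high_resolution_clock::now() - _time_start_).count())
```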
data/lib/resources/cuda/host_section_free_device_memory.cpp
@@ -0,0 +1,18 @@
+ timeStartMeasure();
+
+ if (/*{name}*/ != cmd->result) {
+     // Don't free memory if it is the result. There is already a similar check in
+     // program_builder (free all except for last). However, this check is not sufficient in
+     // case the same array is reused!
+
+     checkErrorReturn(program_result, cudaFree(/*{name}*/));
+     // Remove from list of allocations
+     program_result->device_allocations->erase(
+         std::remove(
+             program_result->device_allocations->begin(),
+             program_result->device_allocations->end(),
+             /*{name}*/),
+         program_result->device_allocations->end());
+ }
+
+ timeReportMeasure(program_result, free_memory);
data/lib/resources/cuda/host_section_launch_parallel_section.cpp
@@ -0,0 +1,14 @@
+ ({
+     // /*{debug_information}*/
+
+     /*{array_command_type}*/ cmd = /*{array_command}*/;
+
+     if (cmd->result == 0) {
+         /*{kernel_invocation}*/
+         cmd->result = /*{kernel_result}*/;
+
+         /*{free_memory}*/
+     }
+
+     variable_size_array_t((void *) cmd->result, /*{result_size}*/);
+ })
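The `({ ... })` wrapper is a GNU statement expression (a GCC/Clang extension that nvcc inherits from its host compiler): the block evaluates to the value of its last expression statement, which is why the template ends with a bare `variable_size_array_t(...)` and no `return`. A minimal standalone illustration of the construct:

```cuda
#include <cstdio>

int main() {
    // GNU extension: ({ ... }) is an expression whose value is that of its
    // last statement, so a multi-statement computation can sit anywhere an
    // expression is expected (not portable to MSVC).
    int doubled = ({
        int x = 21;
        x * 2;  // value of the whole statement expression
    });
    printf("%d\n", doubled);  // prints 42
    return 0;
}
```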
data/lib/resources/cuda/host_section_malloc_memcpy_device_to_host.cpp
@@ -0,0 +1,10 @@
+
+ {
+     /*{type}*/ * tmp_result = (/*{type}*/ *) malloc(/*{bytes}*/);
+
+     timeStartMeasure();
+     checkErrorReturn(program_result, cudaMemcpy(tmp_result, program_result->result, /*{bytes}*/, cudaMemcpyDeviceToHost));
+     timeReportMeasure(program_result, transfer_memory);
+
+     program_result->result = tmp_result;
+ }
data/lib/resources/cuda/kernel.cpp
@@ -1,8 +1,15 @@
 
 
- __global__ void kernel(environment_t */*{env_identifier}*/, /*{result_type}*/ *_result_)
+ __global__ void /*{kernel_name}*/(/*{parameters}*/)
  {
-     _result_[threadIdx.x + blockIdx.x * blockDim.x] = /*{block_invocation}*/;
+     int _tid_ = threadIdx.x + blockIdx.x * blockDim.x;
+
+     if (_tid_ < /*{num_threads}*/)
+     {
+         /*{execution}*/
+
+         _result_[_tid_] = /*{block_invocation}*/;
+     }
  }
 
 
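The guard added in this version matters because the launcher rounds the thread count up to whole blocks; the padding threads of the last block must not write past the end of `_result_`. A self-contained version of the same pattern (kernel and block-function names invented for illustration):

```cuda
#include <cuda_runtime.h>

// Stand-in for the translated Ruby block (/*{block_invocation}*/).
__device__ int _block_k_1_(int value) { return value * value; }

// Same shape as the generated kernel: derive a global thread id, then guard
// against the round-up threads of the last block before touching memory.
__global__ void kernel_squares(int *_result_, const int *input, int num_threads) {
    int _tid_ = threadIdx.x + blockIdx.x * blockDim.x;

    if (_tid_ < num_threads) {
        _result_[_tid_] = _block_k_1_(input[_tid_]);
    }
}
```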
data/lib/resources/cuda/launch_kernel.cpp
@@ -0,0 +1,5 @@
+ timeStartMeasure();
+ /*{kernel_name}*/<<</*{grid_dim}*/, /*{block_dim}*/>>>(/*{arguments}*/);
+ checkErrorReturn(program_result, cudaPeekAtLastError());
+ checkErrorReturn(program_result, cudaThreadSynchronize());
+ timeReportMeasure(program_result, kernel);
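Here `cudaPeekAtLastError` catches launch-configuration errors and the synchronize call surfaces errors raised during kernel execution (`cudaThreadSynchronize` is the long-deprecated spelling of `cudaDeviceSynchronize`, kept as in the template). A hypothetical instantiation for the `kernel_squares` sketch above, with the usual round-up grid sizing:

```cuda
// Hypothetical instantiation of the launch template. Assumes kernel_squares,
// checkErrorReturn, result_t, and the timing macros from the sketches above;
// n is the logical thread count.
int launch_squares(result_t *program_result, int *d_result, const int *d_input, int n) {
    int block_dim = 256;
    int grid_dim = (n + block_dim - 1) / block_dim;  // round up: why the kernel guards _tid_

    timeStartMeasure();
    kernel_squares<<<grid_dim, block_dim>>>(d_result, d_input, n);
    checkErrorReturn(program_result, cudaPeekAtLastError());    // launch errors
    checkErrorReturn(program_result, cudaDeviceSynchronize());  // execution errors
    timeReportMeasure(program_result, kernel);
    return 0;
}
```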
data/lib/resources/cuda/memcpy_device_to_host.cpp
@@ -0,0 +1,3 @@
+ timeStartMeasure();
+ checkErrorReturn(program_result, cudaMemcpy(/*{host_name}*/, /*{device_name}*/, /*{bytes}*/, cudaMemcpyDeviceToHost));
+ timeReportMeasure(program_result, transfer_memory);
data/lib/resources/cuda/memcpy_device_to_host_expr.cpp
@@ -0,0 +1,10 @@
+ ({
+     variable_size_array_t device_array = /*{device_array}*/;
+     /*{type}*/ * tmp_result = (/*{type}*/ *) malloc(sizeof(/*{type}*/) * device_array.size);
+
+     timeStartMeasure();
+     checkErrorReturn(program_result, cudaMemcpy(tmp_result, device_array.content, sizeof(/*{type}*/) * device_array.size, cudaMemcpyDeviceToHost));
+     timeReportMeasure(program_result, transfer_memory);
+
+     variable_size_array_t((void *) tmp_result, device_array.size);
+ })
data/lib/resources/cuda/reduce_body.cpp
@@ -0,0 +1,88 @@
+ int thread_idx = threadIdx.x;
+
+ // Single result of this block
+ /*{type}*/ /*{temp_result}*/;
+
+ int num_args = 2 * /*{block_size}*/;
+ if (blockIdx.x == gridDim.x - 1)
+ {
+     // Processing the last block, which might be odd (number of elements to reduce).
+     // Other blocks cannot be "odd", because every block reduces 2*block_size many elements.
+
+     // Number of elements to reduce in the last block
+     num_args = ((2 * /*{num_threads}*/ - 1) % (2 * /*{block_size}*/)) + (/*{odd}*/ ? 0 : 1);
+ }
+
+ if (num_args == 1)
+ {
+     /*{temp_result}*/ = /*{previous_result}*/[_tid_];
+ }
+ else if (num_args == 2)
+ {
+     /*{temp_result}*/ = /*{block_name}*/(/*{arguments}*/, /*{previous_result}*/[_tid_], /*{previous_result}*/[_tid_ + /*{num_threads}*/]);
+ }
+ else
+ {
+     // Allocate block_size many slots to contain the result of up to block_size many reductions, i.e.,
+     // this array contains the reduction of (up to) 2*block_size many elements.
+     __shared__ /*{type}*/ sdata[/*{block_size}*/];
+
+     /*{odd}*/ = num_args % 2 == 1;
+
+     // --- FIRST REDUCTION --- Load from global memory
+     // Number of elements after the first reduction
+     num_args = num_args / 2 + num_args % 2;
+
+     if (thread_idx == num_args - 1 && /*{odd}*/)
+     {
+         // This is the last thread, and it should reduce only one element.
+         sdata[thread_idx] = /*{previous_result}*/[_tid_];
+     }
+     else
+     {
+         sdata[thread_idx] = /*{block_name}*/(/*{arguments}*/, /*{previous_result}*/[_tid_], /*{previous_result}*/[_tid_ + /*{num_threads}*/]);
+     }
+
+     __syncthreads();
+
+
+     // --- SUBSEQUENT REDUCTION --- Read from shared memory only
+     /*{odd}*/ = num_args % 2 == 1;
+
+     for (
+         num_args = num_args / 2 + num_args % 2;    // Number of elements after this reduction
+         num_args > 1;                              // ... as long as there's at least 3 elements left
+         num_args = num_args / 2 + num_args % 2) {
+
+         if (thread_idx < num_args) {
+             // This thread has work to do...
+
+             if (thread_idx != num_args - 1 || !/*{odd}*/)
+             {
+                 sdata[thread_idx] = /*{block_name}*/(/*{arguments}*/, sdata[thread_idx], sdata[thread_idx + num_args]);
+             }
+             else
+             {
+                 // This is the last element and it is odd, do nothing
+             }
+         }
+
+         __syncthreads();
+
+         /*{odd}*/ = num_args % 2 == 1;
+     }
+
+     if (thread_idx == 0)
+     {
+         // Last thread returns result
+         /*{temp_result}*/ = /*{block_name}*/(/*{arguments}*/, sdata[0], sdata[1]);
+     }
+ }
+
+ // Write result to different position
+ _tid_ = blockIdx.x;
+
+ if (thread_idx != 0) {
+     // Only one thread should report the result
+     return;
+ }
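Most of the branching above handles arbitrary (possibly odd) element counts. Stripped to a power-of-two size, the underlying scheme is the classic shared-memory tree reduction over `2 * block_size` elements per block:

```cuda
#include <cuda_runtime.h>

#define BLOCK_SIZE 256  // must match the launch configuration

// Simplified power-of-two variant of the reduction pattern above: each block
// combines 2*BLOCK_SIZE input elements (input length must be a multiple of
// that), then repeatedly halves the active element count in shared memory
// until one value per block remains.
__global__ void reduce_sum(const int *input, int *block_results) {
    __shared__ int sdata[BLOCK_SIZE];
    int thread_idx = threadIdx.x;
    int base = blockIdx.x * 2 * BLOCK_SIZE;

    // First reduction while loading from global memory.
    sdata[thread_idx] = input[base + thread_idx] + input[base + thread_idx + BLOCK_SIZE];
    __syncthreads();

    // Subsequent reductions read from shared memory only.
    for (int num_args = BLOCK_SIZE / 2; num_args > 0; num_args /= 2) {
        if (thread_idx < num_args) {
            sdata[thread_idx] += sdata[thread_idx + num_args];
        }
        __syncthreads();
    }

    if (thread_idx == 0) {
        block_results[blockIdx.x] = sdata[0];  // one result per block
    }
}
```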
data/lib/resources/cuda/stencil_array_reconstruction.cpp
@@ -0,0 +1,2 @@
+ // (Re)construct array from separately passed parameters
+ /*{type}*/ /*{name}*/[] = /*{initializer}*/;
data/lib/resources/cuda/stencil_body.cpp
@@ -0,0 +1,16 @@
+ /*{result_type}*/ /*{temp_var}*/;
+
+ // Indices for all dimensions
+ /*{compute_indices}*/
+
+ if (/*{out_of_bounds_check}*/)
+ {
+     // All value indices within bounds
+     /*{execution}*/
+     /*{temp_var}*/ = /*{stencil_computation}*/;
+ }
+ else
+ {
+     // At least one index is out of bounds
+     /*{temp_var}*/ = /*{out_of_bounds_fallback}*/;
+ }
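As a concrete (entirely invented) instantiation of this template: a one-dimensional 3-point stencil with out-of-bounds fallback 0 would fill the placeholders roughly like this:

```cuda
// Hypothetical instantiation of the stencil body. Placeholder mapping:
//   /*{compute_indices}*/     -> the three neighbor indices
//   /*{out_of_bounds_check}*/ -> the range test on all of them
//   /*{stencil_computation}*/ -> the translated Ruby block
//   /*{out_of_bounds_fallback}*/ -> 0
__device__ int stencil_at(const int *base, int size, int _tid_) {
    int temp_var;

    // Indices for all dimensions
    int idx_m1 = _tid_ - 1, idx_0 = _tid_, idx_p1 = _tid_ + 1;

    if (idx_m1 >= 0 && idx_p1 < size) {
        // All value indices within bounds
        temp_var = base[idx_m1] + base[idx_0] + base[idx_p1];
    } else {
        // At least one index is out of bounds
        temp_var = 0;
    }
    return temp_var;
}
```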
data/lib/resources/cuda/struct_definition.cpp
@@ -0,0 +1,4 @@
+ struct /*{name}*/
+ {
+     /*{fields}*/
+ };
data/lib/ruby_core/array.rb
@@ -0,0 +1,34 @@
+ module Ikra
+     module RubyIntegration
+         ALL_ARRAY_TYPES = proc do |type|
+             type.is_a?(Types::ArrayType) && !type.is_a?(Types::LocationAwareArrayType)
+         end
+
+         LOCATION_AWARE_ARRAY_TYPE = proc do |type|
+             # TODO: Maybe there should be an automated transfer to host side here if necessary?
+             type.is_a?(Types::LocationAwareArrayType)
+         end
+
+         LOCATION_AWARE_ARRAY_ACCESS = proc do |receiver, method_name, args, translator, result_type|
+
+             recv = receiver.accept(translator.expression_translator)
+             inner_type = receiver.get_type.singleton_type.inner_type.to_c_type
+             index = args[0].accept(translator.expression_translator)
+
+             "((#{inner_type} *) #{recv}.content)[#{index}]"
+         end
+
+         INNER_TYPE = proc do |rcvr|
+             rcvr.inner_type
+         end
+
+         implement ALL_ARRAY_TYPES, :[], INNER_TYPE, 1, "#0[#I1]"
+
+         implement(
+             LOCATION_AWARE_ARRAY_TYPE,
+             :[],
+             INNER_TYPE,
+             1,
+             LOCATION_AWARE_ARRAY_ACCESS)
+     end
+ end
data/lib/ruby_core/array_command.rb
@@ -0,0 +1,313 @@
+ require_relative "../types/types/array_type.rb"
+ require_relative "../ast/interpreter.rb"
+
+ module Ikra
+     module RubyIntegration
+
+         # This visitor traverses the tree of symbolically executed parallel operations. It raises
+         # an exception, if an array command was generated by symbolic execution/interpretation of
+         # `send_node`.
+         class SymbolicCycleFinder < Symbolic::Visitor
+             def self.raise_on_cycle(command, send_node)
+                 visitor = self.new(send_node)
+                 command.accept(visitor)
+             end
+
+             def initialize(send_node)
+                 @send_node = send_node
+             end
+
+             def visit_array_command(node)
+                 if node.generator_node == @send_node
+                     raise CycleDetectedError.new(node: node)
+                 else
+                     # No cycle found yet, check dependent computations
+                     super
+                 end
+             end
+         end
+
+         class CycleDetectedError < RuntimeError
+             def initialize(node:)
+                 @node = node
+             end
+         end
+
+         ALL_ARRAY_COMMAND_TYPES = proc do |type|
+             type.is_a?(Symbolic::ArrayCommand)
+         end
+
+         PMAP_TYPE = proc do |rcvr_type, *args_types, send_node:|
+             # TODO: Handle keyword arguments
+
+             # Ensure that there is no cycle here. "Cycle" means that the same AST send node
+             # was used earlier (i.e., in one of `rcvr_type`'s inputs/dependent computations).
+             # In that case we have to abort type inference here, because it would not terminate.
+             SymbolicCycleFinder.raise_on_cycle(rcvr_type, send_node)
+
+             more_kw_args = {}
+
+             if send_node.arguments.size == 1
+                 if !send_node.arguments.first.is_a?(AST::HashNode)
+                     raise ArgumentError.new("If an argument is given, it must be a Hash of kwargs.")
+                 end
+
+                 # Pass kwargs separately
+                 more_kw_args = AST::Interpreter.interpret(send_node.arguments.first)
+             end
+
+             rcvr_type.pmap(
+                 ast: send_node.block_argument,
+                 generator_node: send_node,
+                 # TODO: Fix binding
+                 command_binding: send_node.find_behavior_node.binding,
+                 **more_kw_args).to_union_type
+         end
+
+         PZIP_TYPE = proc do |rcvr_type, *args_types, send_node:|
+             # TODO: Support multiple arguments for `pzip`
+             types = args_types[0].map do |sing_type|
+                 raise AssertionError.new("Singleton type expected") if sing_type.is_union_type?
+                 rcvr_type.pzip(sing_type, generator_node: send_node).to_union_type
+             end
+
+             types.reduce(Types::UnionType.new) do |acc, type|
+                 acc.expand_return_type(type)
+             end
+         end
+
+         PSTENCIL_TYPE = proc do |rcvr_type, *args_types, send_node:|
+             # TODO: Handle keyword arguments
+             ruby_args = send_node.arguments.map do |node|
+                 AST::Interpreter.interpret(node)
+             end
+
+             more_kw_args = {}
+
+             if args_types.size == 3
+                 if !ruby_args.last.is_a?(Hash)
+                     raise ArgumentError.new("If 3 arguments are given, the last one must be a Hash of kwargs.")
+                 end
+
+                 # Pass kwargs separately
+                 more_kw_args = ruby_args.pop
+             end
+
+             SymbolicCycleFinder.raise_on_cycle(rcvr_type, send_node)
+
+             rcvr_type.pstencil(
+                 *ruby_args,
+                 ast: send_node.block_argument,
+                 generator_node: send_node,
+                 # TODO: Fix binding
+                 command_binding: send_node.find_behavior_node.binding,
+                 **more_kw_args).to_union_type
+         end
+
+         PREDUCE_TYPE = proc do |rcvr_type, *args_types, send_node:|
+             # TODO: Handle keyword arguments
+
+             SymbolicCycleFinder.raise_on_cycle(rcvr_type, send_node)
+
+             rcvr_type.preduce(ast: send_node.block_argument, generator_node: send_node).to_union_type
+         end
+
+         LAUNCH_KERNEL = proc do |receiver, method_name, arguments, translator, result_type|
+             # The result type is the symbolically executed result of applying this
+             # parallel section. The result type is an ArrayCommand.
+             array_command = receiver.get_type.singleton_type
+
+             # Translate command
+             command_translator = translator.command_translator
+             command_translator.push_kernel_launcher
+             result = array_command.accept(command_translator)
+             kernel_launcher = command_translator.pop_kernel_launcher(result)
+
+             # Prepare kernel launchers for launch of `array_command`
+             command_translator.program_builder.prepare_additional_args_for_launch(array_command)
+
+             # Generate launch code for all kernels
+             launch_code = command_translator.program_builder.build_kernel_launchers
+
+             # Always return a device pointer. Only at the very end, we transfer data to the host.
+             result_expr = kernel_launcher.kernel_result_var_name
+
+             if Translator::ArrayCommandStructBuilder::RequireRuntimeSizeChecker.require_size_function?(array_command)
+
+                 # Size is not statically known, take information from receiver.
+                 # TODO: Code depends on template. `cmd` is defined in template.
+                 result_size = "cmd->size()"
+             else
+                 # Size is known statically
+                 result_size = array_command.size.to_s
+             end
+
+             # Debug information
+             if array_command.generator_node != nil
+                 debug_information = array_command.to_s + ": " + array_command.generator_node.to_s
+             else
+                 debug_information = array_command.to_s
+             end
+
+             result = Translator.read_file(file_name: "host_section_launch_parallel_section.cpp", replacements: {
+                 "debug_information" => debug_information,
+                 "array_command" => receiver.accept(translator.expression_translator),
+                 "array_command_type" => array_command.to_c_type,
+                 "result_size" => result_size,
+                 "kernel_invocation" => launch_code,
+                 "kernel_result" => result_expr,
+                 "free_memory" => command_translator.program_builder.build_memory_free_except_last})
+
+             # Clear kernel launchers. Otherwise, we might launch them again in a later, unrelated
+             # LAUNCH_KERNEL branch. This is because we reuse the same [ProgramBuilder] for an
+             # entire host section.
+             command_translator.program_builder.clear_kernel_launchers
+
+             # Build all array command structs for this command
+             command_translator.program_builder.add_array_command_struct(
+                 *Translator::ArrayCommandStructBuilder.build_all_structs(array_command))
+
+             result
+         end
+
+         ARRAY_COMMAND_TO_ARRAY_TYPE = proc do |rcvr_type, *args_types, send_node:|
+             Types::LocationAwareFixedSizeArrayType.new(
+                 rcvr_type.result_type,
+                 rcvr_type.dimensions,
+                 location: :device).to_union_type
+         end
+
+         SYMBOLICALLY_EXECUTE_KERNEL = proc do |receiver, method_name, arguments, translator, result_type|
+             if !result_type.is_singleton?
+                 raise AssertionError.new("Singleton type expected")
+             end
+
+             # Build arguments to constructor. First one (result field) is NULL.
+             constructor_args = ["NULL"]
+
+             # Translate all inputs (receiver, then arguments to parallel section)
+             constructor_args.push(receiver.accept(translator.expression_translator))
+
+             for arg in arguments
+                 if arg.get_type.is_singleton? &&
+                         arg.get_type.singleton_type.is_a?(Symbolic::ArrayCommand)
+
+                     # Only ArrayCommands should show up as arguments
+                     constructor_args.push(arg.accept(translator.expression_translator))
+                 end
+             end
+
+             all_args = constructor_args.join(", ")
+
+             # This is a hack because the type is a pointer type
+             "new #{result_type.singleton_type.to_c_type[0...-2]}(#{all_args})"
+         end
+
+         ALL_LOCATION_AWARE_ARRAY_TYPES = proc do |type|
+             type.is_a?(Types::LocationAwareArrayType)
+         end
+
+         LOCATION_AWARE_ARRAY_TO_HOST_ARRAY_TYPE = proc do |rcvr_type, *args_types|
+             # TODO: Should also be able to handle variable variant
+             Types::LocationAwareFixedSizeArrayType.new(
+                 rcvr_type.inner_type,
+                 rcvr_type.dimensions,
+                 location: :host).to_union_type
+         end
+
+         LOCATION_AWARE_ARRAY_CALL_TYPE = proc do |rcvr_type, *args_types|
+             # Calling `__call__` on an array does not do anything
+             rcvr_type.to_union_type
+         end
+
+         COPY_ARRAY_TO_HOST = proc do |receiver, method_name, args, translator, result_type|
+             if receiver.get_type.singleton_type.location == :host
+                 receiver.accept(translator.expression_translator)
+             else
+                 c_type = receiver.get_type.singleton_type.inner_type.to_c_type
+
+                 Translator.read_file(file_name: "memcpy_device_to_host_expr.cpp", replacements: {
+                     "type" => c_type,
+                     "device_array" => receiver.accept(translator.expression_translator)})
+             end
+         end
+
+         ARRAY_TYPE_TO_COMMAND_TYPE = proc do |rcvr_type, *args_types, send_node:|
+             rcvr_type.to_command.to_union_type
+         end
+
+         FREE_MEMORY_FOR_ARRAY_COMMAND = proc do |receiver, method_name, args, translator, result_type|
+
+             Translator.read_file(file_name: "free_memory_for_command.cpp", replacements: {
+                 "type" => receiver.get_type.to_c_type,
+                 "receiver" => receiver.accept(translator.expression_translator)})
+         end
+
+         # Manually free memory
+         # TODO: Implement escape analysis and try to reuse memory
+         implement(
+             ALL_ARRAY_COMMAND_TYPES,
+             :free_memory,
+             BOOL,
+             0,
+             FREE_MEMORY_FOR_ARRAY_COMMAND)
+
+         # Implement all parallel operations
+         implement(
+             ALL_ARRAY_COMMAND_TYPES,
+             :pmap,
+             PMAP_TYPE,
+             0..1,
+             SYMBOLICALLY_EXECUTE_KERNEL)
+
+         implement(
+             ALL_LOCATION_AWARE_ARRAY_TYPES,
+             :to_command,
+             ARRAY_TYPE_TO_COMMAND_TYPE,
+             0,
+             SYMBOLICALLY_EXECUTE_KERNEL)
+
+         implement(
+             ALL_ARRAY_COMMAND_TYPES,
+             :pzip,
+             PZIP_TYPE,
+             1,
+             SYMBOLICALLY_EXECUTE_KERNEL,
+             expect_singleton_args: true)
+
+         implement(
+             ALL_ARRAY_COMMAND_TYPES,
+             :pstencil,
+             PSTENCIL_TYPE,
+             2..3, # neighborhood and default value, maybe hash
+             SYMBOLICALLY_EXECUTE_KERNEL)
+
+         implement(
+             ALL_ARRAY_COMMAND_TYPES,
+             :preduce,
+             PREDUCE_TYPE,
+             0,
+             SYMBOLICALLY_EXECUTE_KERNEL)
+
+         implement(
+             ALL_ARRAY_COMMAND_TYPES,
+             :__call__,
+             ARRAY_COMMAND_TO_ARRAY_TYPE,
+             0,
+             LAUNCH_KERNEL)
+
+         implement(
+             ALL_LOCATION_AWARE_ARRAY_TYPES,
+             :__to_host_array__,
+             LOCATION_AWARE_ARRAY_TO_HOST_ARRAY_TYPE,
+             0,
+             COPY_ARRAY_TO_HOST)
+
+         implement(
+             ALL_LOCATION_AWARE_ARRAY_TYPES,
+             :__call__,
+             LOCATION_AWARE_ARRAY_CALL_TYPE,
+             0,
+             "#0")
+     end
+ end