specula 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- specula/__init__.py +282 -0
- specula/base_data_obj.py +135 -0
- specula/base_processing_obj.py +267 -0
- specula/base_time_obj.py +115 -0
- specula/base_value.py +75 -0
- specula/calib_manager.py +146 -0
- specula/connections.py +151 -0
- specula/data_objects/__init__.py +0 -0
- specula/data_objects/convolution_kernel.py +427 -0
- specula/data_objects/electric_field.py +338 -0
- specula/data_objects/gaussian_convolution_kernel.py +108 -0
- specula/data_objects/ifunc.py +210 -0
- specula/data_objects/ifunc_inv.py +100 -0
- specula/data_objects/iir_filter_data.py +1191 -0
- specula/data_objects/infinite_phase_screen.py +233 -0
- specula/data_objects/intensity.py +75 -0
- specula/data_objects/intmat.py +265 -0
- specula/data_objects/laser_launch_telescope.py +105 -0
- specula/data_objects/layer.py +104 -0
- specula/data_objects/lenslet.py +76 -0
- specula/data_objects/m2c.py +94 -0
- specula/data_objects/pixels.py +135 -0
- specula/data_objects/pupdata.py +144 -0
- specula/data_objects/pupilstop.py +139 -0
- specula/data_objects/recmat.py +100 -0
- specula/data_objects/simul_params.py +52 -0
- specula/data_objects/slopes.py +280 -0
- specula/data_objects/source.py +182 -0
- specula/data_objects/subap_data.py +83 -0
- specula/data_objects/time_history.py +54 -0
- specula/field_analyser.py +670 -0
- specula/loop_control.py +178 -0
- specula/processing_objects/__init__.py +0 -0
- specula/processing_objects/atmo_evolution.py +290 -0
- specula/processing_objects/atmo_infinite_evolution.py +248 -0
- specula/processing_objects/atmo_propagation.py +285 -0
- specula/processing_objects/atmo_random_phase.py +152 -0
- specula/processing_objects/avc.py +16 -0
- specula/processing_objects/base_generator.py +62 -0
- specula/processing_objects/base_operation.py +141 -0
- specula/processing_objects/base_slicer.py +40 -0
- specula/processing_objects/ccd.py +249 -0
- specula/processing_objects/data_buffer.py +74 -0
- specula/processing_objects/data_print.py +92 -0
- specula/processing_objects/data_source.py +77 -0
- specula/processing_objects/data_store.py +148 -0
- specula/processing_objects/demodulator.py +188 -0
- specula/processing_objects/display_server.py +347 -0
- specula/processing_objects/distributed_sh.py +137 -0
- specula/processing_objects/dm.py +142 -0
- specula/processing_objects/double_roof_slopec.py +44 -0
- specula/processing_objects/electric_field_combinator.py +63 -0
- specula/processing_objects/electric_field_reflection.py +37 -0
- specula/processing_objects/ext_source_pyramid.py +169 -0
- specula/processing_objects/extended_source.py +722 -0
- specula/processing_objects/focal_plane_filter.py +263 -0
- specula/processing_objects/gain_optimizer.py +386 -0
- specula/processing_objects/ideal_derivative_sensor.py +266 -0
- specula/processing_objects/iir_filter.py +197 -0
- specula/processing_objects/im_calibrator.py +194 -0
- specula/processing_objects/integrator.py +52 -0
- specula/processing_objects/linear_combination.py +103 -0
- specula/processing_objects/low_pass_filter.py +32 -0
- specula/processing_objects/mirror_commands_combinator.py +99 -0
- specula/processing_objects/modal_analysis.py +165 -0
- specula/processing_objects/modalrec.py +253 -0
- specula/processing_objects/modalrec_implicit_polc.py +110 -0
- specula/processing_objects/modulated_double_roof.py +219 -0
- specula/processing_objects/modulated_pyramid.py +658 -0
- specula/processing_objects/multi_im_calibrator.py +159 -0
- specula/processing_objects/multi_rec_calibrator.py +83 -0
- specula/processing_objects/mvm.py +71 -0
- specula/processing_objects/optical_gain_estimator.py +130 -0
- specula/processing_objects/phase_flattening.py +63 -0
- specula/processing_objects/poly_chrom_sh.py +68 -0
- specula/processing_objects/poly_chrom_wfs.py +182 -0
- specula/processing_objects/poly_crom_pyramid.py +82 -0
- specula/processing_objects/power_loss.py +44 -0
- specula/processing_objects/psf.py +119 -0
- specula/processing_objects/psf_coronagraph.py +160 -0
- specula/processing_objects/push_pull_generator.py +67 -0
- specula/processing_objects/pyr_pupdata_calibrator.py +346 -0
- specula/processing_objects/pyr_slopec.py +132 -0
- specula/processing_objects/random_generator.py +64 -0
- specula/processing_objects/rec_calibrator.py +86 -0
- specula/processing_objects/schedule_generator.py +52 -0
- specula/processing_objects/sh.py +557 -0
- specula/processing_objects/sh_slopec.py +314 -0
- specula/processing_objects/sh_subap_calibrator.py +110 -0
- specula/processing_objects/slopec.py +132 -0
- specula/processing_objects/sn_calibrator.py +53 -0
- specula/processing_objects/spot_monitor.py +263 -0
- specula/processing_objects/time_history_generator.py +31 -0
- specula/processing_objects/vibration_generator.py +135 -0
- specula/processing_objects/wave_generator.py +75 -0
- specula/processing_objects/windowed_integration.py +55 -0
- specula/processing_objects/zernike_sensor.py +111 -0
- specula/simul.py +1020 -0
- specula/template_processing_obj.py +85 -0
- specula-0.0.0.dist-info/METADATA +86 -0
- specula-0.0.0.dist-info/RECORD +105 -0
- specula-0.0.0.dist-info/WHEEL +5 -0
- specula-0.0.0.dist-info/entry_points.txt +4 -0
- specula-0.0.0.dist-info/licenses/LICENSE +21 -0
- specula-0.0.0.dist-info/top_level.txt +1 -0
specula/__init__.py
ADDED
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import os
|
|
3
|
+
import functools
|
|
4
|
+
from functools import wraps
|
|
5
|
+
|
|
6
|
+
# Default dtype lists for the CPU backend (index 0 = double, index 1 = single)
cpu_float_dtype_list = [np.float64, np.float32]
cpu_complex_dtype_list = [np.complex128, np.complex64]
# Filled in by init(): [np.ndarray] or [np.ndarray, cp.ndarray]
array_types = []

# The globals below are placeholders that init() configures.
gpuEnabled = False
cp = None                # cupy module when import succeeds, None otherwise
xp = None                # numpy or cupy, depending on the default target device
global_precision = None  # 0 = double, 1 = single (see comments below)
float_dtype_list = None
complex_dtype_list = None
gpu_float_dtype_list = cpu_float_dtype_list
gpu_complex_dtype_list = cpu_complex_dtype_list
float_dtype = None
complex_dtype = None
default_target_device_idx = None
default_target_device = None
process_comm = None      # MPI communicator (None when MPI is not in use)
process_rank = None      # MPI rank of this process (None when MPI is not in use)
# Conversion constants between arcseconds and radians
ASEC2RAD = np.pi / (3600 * 180)
RAD2ASEC = 1.0 / ASEC2RAD
MPI_DBG = False

MPI_SEND_DBG = False

# precision = 0 -> double precision
# precision = 1 -> single precision

# target_device = -1 -> CPU
# target_device = i>-1 -> GPUi

# you might have a GPU working and cupy installed
# and still want to use the CPU (idx==-1) as default_target_device
# in this case you might still want to allocate some objects on
# a GPU device (idx>=0).
# This can be checked later looking at the value of gpuEnabled.
|
42
|
+
def init(device_idx=-1, precision=0, rank=None, comm=None, mpi_dbg=False):
    """
    Initialize SPECULA's module-level device, dtype and MPI configuration.

    Must be called once before using code that relies on the module globals
    (xp, cp, float_dtype, array_types, ...).

    Parameters:
    device_idx: int, optional
        -1 selects the CPU; i >= 0 selects GPU number i (requires cupy).
    precision: int, optional
        0 for double precision, 1 for single precision.
    rank: int, optional
        MPI rank of this process (None when MPI is not in use).
    comm: optional
        MPI communicator (None when MPI is not in use).
    mpi_dbg: bool, optional
        Enable verbose MPI debugging printouts.
    """
    global xp, cp, gpuEnabled
    global global_precision, float_dtype_list, complex_dtype_list
    global gpu_float_dtype_list, gpu_complex_dtype_list
    global array_types, float_dtype, complex_dtype
    global default_target_device_idx, default_target_device
    global process_comm, process_rank, MPI_DBG

    MPI_DBG = mpi_dbg
    process_comm = comm
    process_rank = rank

    default_target_device_idx = device_idx
    systemDisable = os.environ.get('SPECULA_DISABLE_GPU', 'FALSE')
    if systemDisable == 'FALSE':
        try:
            # Since `cp` is declared global above, this import binds the
            # module-level `cp` directly.  The import can fail in several
            # ways (missing package, broken CUDA install), so we catch
            # Exception rather than only ImportError -- but no longer a
            # bare `except:`, which would also swallow KeyboardInterrupt.
            import cupy as cp
            print("Cupy import successfull. Installed version is:", cp.__version__)
            gpuEnabled = True
        except Exception:
            print("Cupy import failed. SPECULA will fall back to CPU use.")
            cp = None
            xp = np
            default_target_device_idx = -1
    else:
        print("env variable SPECULA_DISABLE_GPU prevents using the GPU.")
        cp = None
        xp = np
        default_target_device_idx = -1

    if default_target_device_idx >= 0:
        # GPU path: cp is guaranteed non-None here, because every failure
        # path above resets default_target_device_idx to -1.
        xp = cp
        gpu_float_dtype_list = [cp.float64, cp.float32]
        gpu_complex_dtype_list = [cp.complex128, cp.complex64]
        default_target_device = cp.cuda.Device(default_target_device_idx)
        default_target_device.use()
        print('Default device is GPU number ', default_target_device_idx)
    else:
        print('Default device is CPU')
        xp = np

    if cp is not None:
        array_types = [np.ndarray, cp.ndarray]
    else:
        array_types = [np.ndarray]

    float_dtype_list = [xp.float64, xp.float32]
    complex_dtype_list = [xp.complex128, xp.complex64]
    global_precision = precision
    float_dtype = float_dtype_list[global_precision]
    complex_dtype = complex_dtype_list[global_precision]

    # Patch cupy's missing RandomState.random() method
    if cp is not None:
        cp.random.RandomState.random = cp.random.RandomState.random_sample
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
# should be used as less as a possible and preferably outside time critical computations
def cpuArray(v, dtype=None, force_copy=False):
    '''
    Return *v* as a numpy (CPU) array, downloading it from the GPU if needed.

    Thin wrapper around to_xp() with numpy as the target module; see to_xp()
    for the dtype/force_copy semantics.
    '''
    return to_xp(np, v, dtype=dtype, force_copy=force_copy)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def to_xp(xp, v, dtype=None, force_copy=False):
    '''
    Make sure that v is allocated as an array on this object's device.
    Works for all combinations of np and cp, whether installed or not.

    Optionally casts to the required dtype (no copy is made if
    the dtype is already the correct one)

    The main trigger for this function is that np.array() cannot
    be used on a cupy array.

    Parameters:
    xp: module
        target array module (np or cp); note this parameter deliberately
        shadows the module-level `xp` global.
    v: array-like
        value to convert.
    dtype: optional
        requested dtype; None keeps the input dtype.
    force_copy: bool, optional
        when True, always return a new array even if v already matches.
    '''
    if xp is cp:
        # Target is the GPU: keep an existing cupy array as-is,
        # otherwise upload/convert via cp.array() (which also copies).
        if isinstance(v, cp.ndarray) and not force_copy:
            retval = v
        else:
            retval = cp.array(v)
    else:
        # Target is the CPU: a cupy array must be downloaded with .get(),
        # since np.array() cannot handle cupy arrays.
        if cp is not None and isinstance(v, cp.ndarray):
            retval = v.get()
        elif isinstance(v, np.ndarray) and not force_copy:
            # Avoid extra copy (enabled by numpy default)
            retval = v
        else:
            retval = np.array(v)
    if dtype is None and not force_copy:
        return retval
    else:
        # astype(copy=False) is a no-op when the dtype already matches
        return retval.astype(dtype, copy=force_copy)
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
class DummyDecoratorAndContextManager():
    """
    No-op object usable both as a context manager and as a decorator.

    Serves as a drop-in replacement for profiling helpers (e.g. cupy's
    TimeRangeDecorator) when the real implementation is unavailable:
    entering/exiting the context does nothing, and decorating a function
    returns a transparent pass-through wrapper.
    """

    def __enter__(self):
        # Nothing to set up
        pass

    def __exit__(self, *args):
        # Nothing to tear down; exceptions are not suppressed
        pass

    def __call__(self, f):
        # Return a transparent wrapper that simply forwards the call
        def caller(*call_args, **call_kwargs):
            return f(*call_args, **call_kwargs)
        return caller
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def show_in_profiler(message=None, color_id=None, argb_color=None, sync=False):
    '''
    Decorator to allow using cupy's TimeRangeDecorator
    in a safe way even when cupy is not installed
    Parameters are the same as TimeRangeDecorator
    '''
    range_kwargs = dict(message=message,
                        color_id=color_id,
                        argb_color=argb_color,
                        sync=sync)
    try:
        from cupyx.profiler import time_range
        return time_range(**range_kwargs)
    except ImportError:
        # cupy not available: hand back a do-nothing stand-in
        return DummyDecoratorAndContextManager()
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def fuse(kernel_name=None):
    '''
    Replacement of cupy.fuse() allowing runtime
    dispatch to cupy or numpy.

    Fused function takes an xp argument that will
    cause it to run as a fused kernel or a standard
    numpy function. The xp argument can be used
    inside the function as usual.

    Parameters are the same as cp.fuse()
    '''
    def decorator(f):
        # Bind xp once per backend; the CPU variant is plain numpy,
        # the GPU variant is additionally compiled by cp.fuse().
        cpu_version = functools.partial(f, xp=np)
        gpu_version = None
        if cp:
            gpu_version = cp.fuse(kernel_name=kernel_name)(functools.partial(f, xp=cp))

        @wraps(f)
        def wrapper(*args, xp, **kwargs):
            # Dispatch on the caller-supplied xp module at every call
            chosen = gpu_version if xp == cp else cpu_version
            return chosen(*args, **kwargs)
        return wrapper
    return decorator
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def main_simul(yml_files: list,
               nsimul: int = 1,
               cpu: bool = False,
               overrides: str = None,
               target: int = 0,
               profile: bool = False,
               mpi: bool = False,
               mpidbg: bool = False,
               stepping: bool = False,
               diagram: bool = False,
               diagram_title: str = None,
               diagram_filename: str = None,
               diagram_colors_on: bool = False):
    """
    Run one or more SPECULA simulations from YAML configuration files.

    Parameters:
    yml_files: list
        YAML configuration file paths, forwarded to Simul.
    nsimul: int
        number of simulation repetitions to run.
    cpu: bool
        force CPU execution regardless of *target*.
    overrides: str, optional
        configuration overrides forwarded to Simul.
    target: int
        GPU index to use when *cpu* is False.
    profile: bool
        enable cProfile around the simulation loop.
    mpi, mpidbg: bool
        enable MPI execution / MPI debug printouts.
    stepping, diagram, diagram_title, diagram_filename, diagram_colors_on:
        forwarded to Simul.
    """
    if mpi:
        try:
            from mpi4py import MPI
            from mpi4py.util import pkl5
            print("mpi4py import successfull. Installed version is:", MPI.Get_version())
        except ImportError:
            print("mpi4py import failed.")
            raise

        comm = pkl5.Intracomm(MPI.COMM_WORLD)
        rank = comm.Get_rank()
        # Pre-attach a large buffer so buffered sends (ibsend/Ibsend)
        # issued by processing objects never block.
        N = 10000000
        datatype = MPI.FLOAT
        num_bytes = N * (datatype.Pack_size(count=1, comm=comm) + MPI.BSEND_OVERHEAD)

        print(f'MPI buffer size: {num_bytes/1024**2:.2f} MB')
        attached_buf = bytearray(num_bytes)
        MPI.Attach_buffer(attached_buf)
    else:
        rank = None
        comm = None

    # -1 selects the CPU; otherwise use the requested GPU index
    if cpu:
        target_device_idx = -1
    else:
        target_device_idx = target

    init(target_device_idx, precision=1, rank=rank, comm=comm, mpi_dbg=mpidbg)
    # Imported here because specula.simul relies on init() having been called
    from specula.simul import Simul

    if profile:
        import cProfile
        import pstats
        pr = cProfile.Profile()
        pr.enable()

    for simul_idx in range(nsimul):
        print(yml_files)
        Simul(*yml_files,
              simul_idx=simul_idx,
              overrides=overrides,
              stepping=stepping,
              diagram=diagram,
              diagram_filename=diagram_filename,
              diagram_title=diagram_title,
              diagram_colors_on=diagram_colors_on
              ).run()

    if profile:
        # BUG FIX: was `pr.disable` (bare attribute access, a no-op),
        # so the profiler was never actually stopped before reporting.
        pr.disable()
        stats = pstats.Stats(pr).sort_stats("cumtime")
        stats.print_stats(r"\((?!\_).*\)$", 200)

    if mpi:
        MPI.Detach_buffer()
|
|
281
|
+
|
|
282
|
+
|
specula/base_data_obj.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
|
|
2
|
+
import warnings
|
|
3
|
+
from copy import copy
|
|
4
|
+
from functools import lru_cache
|
|
5
|
+
|
|
6
|
+
from specula import cp, np, array_types
|
|
7
|
+
from specula.base_time_obj import BaseTimeObj
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# We use lru_cache() instead of cache() for python 3.8 compatibility
@lru_cache(maxsize=None)
def get_properties(cls):
    """
    Return the names of all `property` attributes defined on *cls*
    and on every class in its MRO, in MRO order.

    The result is cached per class, since class definitions do not
    change at runtime.
    """
    found = []
    for klass in cls.__mro__:
        found.extend(name for name, member in vars(klass).items()
                     if isinstance(member, property))
    return found
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class BaseDataObj(BaseTimeObj):
    """
    Base class for simulation data objects: adds a generation timestamp
    and CPU/GPU transfer helpers on top of BaseTimeObj.
    """
    def __init__(self, target_device_idx=None, precision=None):
        """
        Initialize the base data object.

        Parameters:
        target_device_idx: int, optional
            device to be targeted for data storage. Set to -1 for CPU,
            to 0 for the first GPU device, 1 for the second GPU device, etc.
        precision: int, optional
            if None will use the global_precision, otherwise set to 0 for double, 1 for single
        """
        super().__init__(target_device_idx, precision)
        # -1 means "never generated yet"; updated by producers of this object
        self.generation_time = -1
        self.tag = ''

    def transferDataTo(self, destobj, force_reallocation=False):
        '''
        Copy CPU/GPU arrays into an existing data object:
        iterate over all self attributes and, if a CPU or GPU array
        is detected, copy data into *destobj* without reallocating.

        Destination (CPU or GPU device) is inferred by *destobj.target_device_idx*,
        which must be set correctly before calling this method.
        '''
        # Get a list of all attributes, but skip properties
        # (reading a property could trigger arbitrary computation)
        pp = get_properties(type(self))
        attr_list = [attr for attr in dir(self) if attr not in pp]

        for attr in attr_list:
            self_attr = getattr(self, attr)
            self_type = type(self_attr)
            # Only numpy/cupy arrays are transferred; everything else is skipped
            if self_type not in array_types:
                continue

            dest_attr = getattr(destobj, attr)
            dest_type = type(dest_attr)

            if dest_type not in array_types:
                print(f'Warning: destination attribute is not a cupy/numpy array, forcing reallocation ({destobj}.{attr})')
                # NOTE: once set, force_reallocation stays True for all
                # remaining attributes of this loop as well.
                force_reallocation = True

            # Destination array had the correct type: perform in-place data copy
            if not force_reallocation:
                # Detect whether the array types are correct for all four cases:
                # Device to CPU, CPU to device, device-to-device, and CPU-CPU. Also check whether
                # the target_device_idx is set correctly for the destination object.
                DtD = cp is not None and (self_type == cp.ndarray) and (dest_type == cp.ndarray) and destobj.target_device_idx >= 0
                DtH = cp is not None and (self_type == cp.ndarray) and (dest_type == np.ndarray) and destobj.target_device_idx == -1
                HtD = cp is not None and (self_type == np.ndarray) and (dest_type == cp.ndarray) and destobj.target_device_idx >= 0
                HtH = (self_type == np.ndarray) and (dest_type == np.ndarray) and destobj.target_device_idx == -1
                if DtD:
                    # Performance warnings here are expected, because we might
                    # trigger a peer-to-peer transfer between devices
                    with warnings.catch_warnings():
                        # self.PerformanceWarning is presumably provided by
                        # BaseTimeObj (cupy's warning class) -- TODO confirm
                        if self.PerformanceWarning:
                            warnings.simplefilter("ignore", category=self.PerformanceWarning)
                        try:
                            dest_attr[:] = self_attr
                        except:
                            # NOTE(review): this rebinds the *local* name only
                            # and never updates destobj's attribute -- looks
                            # like a silent no-op fallback; confirm intent.
                            dest_attr = self_attr
                elif DtH:
                    # Do not set blocking=True for cupy 12.x compatibility.
                    # Blocking is True by default in later versions anyway
                    self_attr.get(out=dest_attr)
                elif HtD:
                    dest_attr.set(self_attr)
                elif HtH:
                    dest_attr[:] = self_attr
                else:
                    print(f'Warning: mismatch between target_device_idx and array allocation, forcing reallocation ({destobj}.{attr})')
                    force_reallocation = True

            # Otherwise, reallocate
            if force_reallocation:
                # Same four cases, but destination type no longer matters
                # since a fresh array is allocated via setattr()
                DtD = cp is not None and (self_type == cp.ndarray) and destobj.target_device_idx >= 0
                DtH = cp is not None and (self_type == cp.ndarray) and destobj.target_device_idx == -1
                HtD = (self_type == np.ndarray) and destobj.target_device_idx >= 0
                HtH = (self_type == np.ndarray) and destobj.target_device_idx == -1

                if DtD:
                    # Performance warnings here are expected, because we might
                    # trigger a peer-to-peer transfer between devices
                    with warnings.catch_warnings():
                        if self.PerformanceWarning:
                            warnings.simplefilter("ignore", category=self.PerformanceWarning)
                        setattr(destobj, attr, cp.asarray(self_attr))
                if DtH:
                    # Do not set blocking=True for cupy 12.x compatibility.
                    # Blocking is True by default in later versions anyway
                    setattr(destobj, attr, self_attr.get())
                if HtD:
                    setattr(destobj, attr, cp.asarray(self_attr))
                if HtH:
                    setattr(destobj, attr, np.asarray(self_attr))

        # Keep the destination's timestamp in sync with the source
        destobj.generation_time = self.generation_time

    def copyTo(self, target_device_idx):
        '''
        Duplicate a data object on another device,
        alllocating all CPU/GPU arrays on the new device.

        Returns self (not a copy) when the target device is already
        this object's device.
        '''
        if target_device_idx == self.target_device_idx:
            return self
        else:
            # Shallow copy first: scalars and metadata are shared,
            # arrays are reallocated below by transferDataTo()
            cloned = copy(self)

            if target_device_idx >= 0:
                cloned.xp = cp
            else:
                cloned.xp = np
            cloned.target_device_idx = target_device_idx

            self.transferDataTo(cloned, force_reallocation=True)
            return cloned
|
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
from collections import defaultdict
|
|
2
|
+
|
|
3
|
+
from specula import cpuArray, default_target_device, cp, MPI_DBG, MPI_SEND_DBG
|
|
4
|
+
from specula import show_in_profiler
|
|
5
|
+
from specula import process_comm, process_rank
|
|
6
|
+
from specula.base_time_obj import BaseTimeObj
|
|
7
|
+
from specula.data_objects.layer import Layer
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class BaseProcessingObj(BaseTimeObj):
    """
    Base class for simulation processing objects.

    Manages input/output bookkeeping, optional CUDA stream capture
    (graph replay) and MPI send of outputs to remote processes.
    """

    # Per-device shared CUDA streams, keyed by target_device_idx
    # (class-level: shared by all processing objects)
    _streams = {}

    def __init__(self, target_device_idx=None, precision=None):
        """
        Initialize the base processing object.

        Parameters:
        precision (int, optional): if None will use the global_precision, otherwise pass 0 for double, 1 for single
        target_device_idx (int, optional): if None will use the default_target_device_idx, otherwise pass -1 for cpu, i for GPU of index i
        """
        BaseTimeObj.__init__(self, target_device_idx=target_device_idx, precision=precision)

        self.current_time = 0
        self.current_time_seconds = 0

        self.verbose = 0

        # Stream/input management
        self.stream = None
        self.inputs_changed = False
        self.cuda_graph = None

        # Will be populated by derived class
        self.inputs = {}
        self.local_inputs = {}
        self.outputs = {}
        self.remote_outputs = defaultdict(list)
        self.sent_valid = {}

        # Use the correct CUDA device for allocations in derived classes' __init__
        if self.target_device_idx >= 0:
            self._target_device.use()

        # Default name is none is given externally
        self.name = self.__class__.__name__

    # Use the correct CUDA device for allocations in derived classes' prepare_trigger()
    def prepare_trigger(self, t):
        self.current_time_seconds = self.t_to_seconds(self.current_time)
        if self.target_device_idx >= 0:
            self._target_device.use()

    def addRemoteOutput(self, name, remote_output):
        # remote_output is appended as-is; send_outputs() unpacks it as
        # a (dest_rank, dest_tag, delay) triple
        self.remote_outputs[name].append(remote_output)

    def checkInputTimes(self):
        # Objects without declared inputs are always ready to trigger
        if len(self.inputs)==0:
            return True
        self.get_all_inputs()
        for input_name, input_obj in self.local_inputs.items():
            # Normalize single inputs to a one-element list
            if type(input_obj) is not list:
                input_obj = [input_obj]

            tt_list = [x.generation_time for x in input_obj if x is not None]
            for tt in tt_list:
                if tt is not None and tt >= self.current_time:
                    return True
            else:
                # NOTE(review): this `else` belongs to the inner `for` and
                # always runs when no item returned True, so False is
                # returned right after the FIRST input -- later inputs in
                # local_inputs are never examined (and an empty
                # local_inputs falls through returning None). Confirm
                # whether this early-exit is intended.
                return False

    def get_all_inputs(self):
        '''
        Perform get() on all inputs.
        Remote inputs, if any, are received via MPI.
        Data is transferred between devices if necessary.
        '''
        for input_name, input_obj in self.inputs.items():
            if MPI_DBG: print(process_rank, 'get_all_inputs(): getting InputValue:',
                              input_name, flush=True)
            # Set additional info for better error messages
            input_obj.requesting_obj_name = self.name
            input_obj.input_name = input_name
            self.local_inputs[input_name] = input_obj.get(self.target_device_idx)

        if MPI_DBG:
            print(process_rank, self.name, 'My inputs are:')
            for in_name, in_value in self.local_inputs.items():
                if type(in_value) is list:
                    # Layer objects get a dedicated printout including their phase
                    if len(in_value) > 0 and type(in_value[0]) is Layer:
                        print(process_rank, in_name,
                              [(x.generation_time, x.phaseInNm) for x in in_value],
                              flush=True)
                    else:
                        print(process_rank, in_name,
                              [(x.generation_time, x) for x in in_value],
                              flush=True)
                else:
                    print(process_rank, in_name,
                          in_value.generation_time if in_value is not None else None,
                          in_value, type(in_value), flush=True)

    def trigger_code(self):
        '''
        Any code implemented by derived classes must:
        1) only perform GPU operations using the xp module
           on arrays allocated with self.xp
        2) avoid any explicity numpy or normal python operation.
        3) NOT use any value in variables that are reallocated by prepare_trigger() or post_trigger(),
           and in general avoid any value defined outside this class (like object inputs)

        because if stream capture is used, a CUDA graph will be generated that will skip
        over any non-GPU operation and re-use GPU memory addresses of its first run.

        Defining local variables inside this function is OK, they will persist in GPU memory.
        '''
        pass

    def post_trigger(self):
        '''
        Make sure we are using the correct device and that any previous
        CUDA graph has been synchronized
        '''
        # Double check that we can execute
        if not self.inputs_changed:
            raise RuntimeError("trigger() called when the object's inputs have not changed")

        # Reset inputs flag
        self.inputs_changed = False

        if self.target_device_idx>=0:
            self._target_device.use()
            if self.cuda_graph:
                self.stream.synchronize()


    def send_remote_output(self, item, dest_rank, dest_tag, first_mpi_send=True, out_name=''):
        # Sends *item* to a remote rank: the full pickled object on the
        # first send (or until a valid value has been sent), the raw value
        # buffer afterwards.
        if MPI_SEND_DBG: print(process_rank, f'SEND to rank {dest_rank} {dest_tag=} {(dest_tag in self.sent_valid)=} (from {self.name}.{out_name})', flush=True)
        if first_mpi_send or not dest_tag in self.sent_valid:
            if MPI_SEND_DBG: print(process_rank, 'SEND with Pickle', dest_tag, flush=True)
            # Module objects cannot be pickled: temporarily replace the
            # item's xp module with a placeholder during the send
            xp_orig = item.xp
            item.xp = 0
            process_comm.ibsend(item, dest=dest_rank, tag=dest_tag)
            item.xp = xp_orig
        else:
            buffer = item.get_value()
            if MPI_SEND_DBG: print(process_rank, dest_tag, 'SEND .device', buffer.device)
            if MPI_SEND_DBG: print(process_rank, 'SEND with Buffer', dest_tag, type(buffer), buffer, flush=True)
            if MPI_SEND_DBG: print(process_rank, 'SEND with Buffer type', dest_tag, buffer.dtype, flush=True)

            # Raw-buffer send: data is moved to the CPU first
            process_comm.Ibsend(cpuArray(buffer), dest=dest_rank, tag=dest_tag)

        # Generation time travels on a companion tag (dest_tag+1)
        process_comm.ibsend(item.generation_time, dest=dest_rank, tag=dest_tag+1)
        if item.get_value() is not None:
            # From now on the receiver knows the object's structure,
            # so subsequent sends can use the raw-buffer path
            self.sent_valid[dest_tag] = True


    # this method implements the mpi send call of the outputs connected to remote inputs
    def send_outputs(self, skip_delayed=False, delayed_only=False, first_mpi_send=True):
        '''
        Send all remote outputs via MPI.
        If *skip_delayed* is True, skip sending all delayed outputs.
        Used during the last iteration when the simulation is ending and
        no one would receive the delayed inputs.
        If *delayed_only* is True, only send the delayed outputs.
        Used while setting up the simulation, to initialize outputs
        that are delayed and thus would not be received otherwise.
        '''
        if MPI_DBG:
            print(process_rank, self.name, 'My outputs are:')
            for out_name, out_value in self.outputs.items():
                print(process_rank, out_name, out_value, flush=True)

        if MPI_DBG: print(process_rank, 'send_outputs', flush=True)
        for out_name, remote_specs in self.remote_outputs.items():
            for remote_spec in remote_specs:
                dest_rank, dest_tag, delay = remote_spec
                # avoid sending outputs that will not be received
                # because the simulation is ending
                if delay < 0 and skip_delayed:
                    if MPI_SEND_DBG: print(process_rank, f'SKIPPED SEND to rank {dest_rank} {dest_tag=} due to delay={delay}', flush=True)
                    continue
                if delay >= 0 and delayed_only:
                    if MPI_SEND_DBG: print(process_rank, f'SKIPPED SEND to rank {dest_rank} {dest_tag=} due to delay={delay}', flush=True)
                    continue
                if MPI_DBG: print(process_rank, 'Sending ', out_name, 'to ', dest_rank, 'with tag', dest_tag, type(self.outputs[out_name]), flush=True)
                # workaround because module objects cannot be pickled
                for item in self.outputs[out_name] if isinstance(self.outputs[out_name], list) else [self.outputs[out_name]]:
                    self.send_remote_output(item, dest_rank, dest_tag, first_mpi_send, out_name)

    @classmethod
    def device_stream(cls, target_device_idx):
        # Lazily create (and cache) one shared stream per device
        if not target_device_idx in cls._streams:
            cls._streams[target_device_idx] = cp.cuda.Stream(non_blocking=False)
        return cls._streams[target_device_idx]

    def build_stream(self, allow_parallel=True):
        # Create (or reuse) a CUDA stream and capture this object's
        # trigger_code() into a CUDA graph. No-op on CPU objects.
        if self.target_device_idx>=0:
            self._target_device.use()
            if allow_parallel:
                # Private stream: this object can run concurrently with others
                self.stream = cp.cuda.Stream(non_blocking=False)
            else:
                # Shared per-device stream: serializes with other objects
                self.stream = self.device_stream(self.target_device_idx)
            self.capture_stream()
            default_target_device.use()

    def capture_stream(self):
        with self.stream:
            # First execution is needed to build the FFT plan cache
            # See for example https://github.com/cupy/cupy/issues/7559
            self.trigger_code()
            self.stream.begin_capture()
            self.trigger_code()
            self.cuda_graph = self.stream.end_capture()

    def check_ready(self, t):
        # Returns True (and prepares the trigger) if any input was
        # refreshed at or after time *t*; False otherwise.
        self.current_time = t
        if self.target_device_idx >= 0:
            self._target_device.use()
        if self.checkInputTimes():
            self.inputs_changed = True # Signal ready for trigger and post_trigger()
            self.prepare_trigger(t)
        else:
            self.inputs_changed = False
            if self.verbose:
                print('No inputs have been refreshed, skipping trigger')
        return self.inputs_changed

    def trigger(self):
        # Double check that we can execute
        if not self.inputs_changed:
            raise RuntimeError("trigger() called when the object's inputs have not changed")

        with show_in_profiler(self.__class__.__name__+'.trigger'):
            if self.target_device_idx>=0:
                self._target_device.use()
            if self.target_device_idx>=0 and self.cuda_graph:
                # Replay the captured CUDA graph instead of re-running Python code
                self.cuda_graph.launch(stream=self.stream)
            else:
                self.trigger_code()

    def setup(self):
        """
        Override this method to perform any setup
        just before the simulation is started.

        The base class implementation also checks that
        all non-optional inputs have been set.

        """
        if self.target_device_idx >= 0:
            self._target_device.use()

        self.get_all_inputs()
        for input_name, input in self.inputs.items():
            if self.local_inputs[input_name] is None and not input.optional:
                raise ValueError(f'Input {input_name} for object {self} has not been set')

    def finalize(self):
        '''
        Override this method to perform any actions after
        the simulation is completed
        '''
        pass
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
|