specula-0.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. specula/__init__.py +282 -0
  2. specula/base_data_obj.py +135 -0
  3. specula/base_processing_obj.py +267 -0
  4. specula/base_time_obj.py +115 -0
  5. specula/base_value.py +75 -0
  6. specula/calib_manager.py +146 -0
  7. specula/connections.py +151 -0
  8. specula/data_objects/__init__.py +0 -0
  9. specula/data_objects/convolution_kernel.py +427 -0
  10. specula/data_objects/electric_field.py +338 -0
  11. specula/data_objects/gaussian_convolution_kernel.py +108 -0
  12. specula/data_objects/ifunc.py +210 -0
  13. specula/data_objects/ifunc_inv.py +100 -0
  14. specula/data_objects/iir_filter_data.py +1191 -0
  15. specula/data_objects/infinite_phase_screen.py +233 -0
  16. specula/data_objects/intensity.py +75 -0
  17. specula/data_objects/intmat.py +265 -0
  18. specula/data_objects/laser_launch_telescope.py +105 -0
  19. specula/data_objects/layer.py +104 -0
  20. specula/data_objects/lenslet.py +76 -0
  21. specula/data_objects/m2c.py +94 -0
  22. specula/data_objects/pixels.py +135 -0
  23. specula/data_objects/pupdata.py +144 -0
  24. specula/data_objects/pupilstop.py +139 -0
  25. specula/data_objects/recmat.py +100 -0
  26. specula/data_objects/simul_params.py +52 -0
  27. specula/data_objects/slopes.py +280 -0
  28. specula/data_objects/source.py +182 -0
  29. specula/data_objects/subap_data.py +83 -0
  30. specula/data_objects/time_history.py +54 -0
  31. specula/field_analyser.py +670 -0
  32. specula/loop_control.py +178 -0
  33. specula/processing_objects/__init__.py +0 -0
  34. specula/processing_objects/atmo_evolution.py +290 -0
  35. specula/processing_objects/atmo_infinite_evolution.py +248 -0
  36. specula/processing_objects/atmo_propagation.py +285 -0
  37. specula/processing_objects/atmo_random_phase.py +152 -0
  38. specula/processing_objects/avc.py +16 -0
  39. specula/processing_objects/base_generator.py +62 -0
  40. specula/processing_objects/base_operation.py +141 -0
  41. specula/processing_objects/base_slicer.py +40 -0
  42. specula/processing_objects/ccd.py +249 -0
  43. specula/processing_objects/data_buffer.py +74 -0
  44. specula/processing_objects/data_print.py +92 -0
  45. specula/processing_objects/data_source.py +77 -0
  46. specula/processing_objects/data_store.py +148 -0
  47. specula/processing_objects/demodulator.py +188 -0
  48. specula/processing_objects/display_server.py +347 -0
  49. specula/processing_objects/distributed_sh.py +137 -0
  50. specula/processing_objects/dm.py +142 -0
  51. specula/processing_objects/double_roof_slopec.py +44 -0
  52. specula/processing_objects/electric_field_combinator.py +63 -0
  53. specula/processing_objects/electric_field_reflection.py +37 -0
  54. specula/processing_objects/ext_source_pyramid.py +169 -0
  55. specula/processing_objects/extended_source.py +722 -0
  56. specula/processing_objects/focal_plane_filter.py +263 -0
  57. specula/processing_objects/gain_optimizer.py +386 -0
  58. specula/processing_objects/ideal_derivative_sensor.py +266 -0
  59. specula/processing_objects/iir_filter.py +197 -0
  60. specula/processing_objects/im_calibrator.py +194 -0
  61. specula/processing_objects/integrator.py +52 -0
  62. specula/processing_objects/linear_combination.py +103 -0
  63. specula/processing_objects/low_pass_filter.py +32 -0
  64. specula/processing_objects/mirror_commands_combinator.py +99 -0
  65. specula/processing_objects/modal_analysis.py +165 -0
  66. specula/processing_objects/modalrec.py +253 -0
  67. specula/processing_objects/modalrec_implicit_polc.py +110 -0
  68. specula/processing_objects/modulated_double_roof.py +219 -0
  69. specula/processing_objects/modulated_pyramid.py +658 -0
  70. specula/processing_objects/multi_im_calibrator.py +159 -0
  71. specula/processing_objects/multi_rec_calibrator.py +83 -0
  72. specula/processing_objects/mvm.py +71 -0
  73. specula/processing_objects/optical_gain_estimator.py +130 -0
  74. specula/processing_objects/phase_flattening.py +63 -0
  75. specula/processing_objects/poly_chrom_sh.py +68 -0
  76. specula/processing_objects/poly_chrom_wfs.py +182 -0
  77. specula/processing_objects/poly_crom_pyramid.py +82 -0
  78. specula/processing_objects/power_loss.py +44 -0
  79. specula/processing_objects/psf.py +119 -0
  80. specula/processing_objects/psf_coronagraph.py +160 -0
  81. specula/processing_objects/push_pull_generator.py +67 -0
  82. specula/processing_objects/pyr_pupdata_calibrator.py +346 -0
  83. specula/processing_objects/pyr_slopec.py +132 -0
  84. specula/processing_objects/random_generator.py +64 -0
  85. specula/processing_objects/rec_calibrator.py +86 -0
  86. specula/processing_objects/schedule_generator.py +52 -0
  87. specula/processing_objects/sh.py +557 -0
  88. specula/processing_objects/sh_slopec.py +314 -0
  89. specula/processing_objects/sh_subap_calibrator.py +110 -0
  90. specula/processing_objects/slopec.py +132 -0
  91. specula/processing_objects/sn_calibrator.py +53 -0
  92. specula/processing_objects/spot_monitor.py +263 -0
  93. specula/processing_objects/time_history_generator.py +31 -0
  94. specula/processing_objects/vibration_generator.py +135 -0
  95. specula/processing_objects/wave_generator.py +75 -0
  96. specula/processing_objects/windowed_integration.py +55 -0
  97. specula/processing_objects/zernike_sensor.py +111 -0
  98. specula/simul.py +1020 -0
  99. specula/template_processing_obj.py +85 -0
  100. specula-0.0.0.dist-info/METADATA +86 -0
  101. specula-0.0.0.dist-info/RECORD +105 -0
  102. specula-0.0.0.dist-info/WHEEL +5 -0
  103. specula-0.0.0.dist-info/entry_points.txt +4 -0
  104. specula-0.0.0.dist-info/licenses/LICENSE +21 -0
  105. specula-0.0.0.dist-info/top_level.txt +1 -0
specula/__init__.py ADDED
@@ -0,0 +1,282 @@
+ import numpy as np
+ import os
+ import functools
+ from functools import wraps
+
+ cpu_float_dtype_list = [np.float64, np.float32]
+ cpu_complex_dtype_list = [np.complex128, np.complex64]
+ array_types = []
+
+ gpuEnabled = False
+ cp = None
+ xp = None
+ global_precision = None
+ float_dtype_list = None
+ complex_dtype_list = None
+ gpu_float_dtype_list = cpu_float_dtype_list
+ gpu_complex_dtype_list = cpu_complex_dtype_list
+ float_dtype = None
+ complex_dtype = None
+ default_target_device_idx = None
+ default_target_device = None
+ process_comm = None
+ process_rank = None
+ ASEC2RAD = np.pi / (3600 * 180)
+ RAD2ASEC = 1.0 / ASEC2RAD
+ MPI_DBG = False
+
+ MPI_SEND_DBG = False
+
+ # precision = 0 -> double precision
+ # precision = 1 -> single precision
+
+ # target_device = -1 -> CPU
+ # target_device = i>-1 -> GPUi
+
+ # You might have a working GPU and cupy installed
+ # and still want to use the CPU (idx==-1) as default_target_device;
+ # in this case you might still want to allocate some objects on
+ # a GPU device (idx>=0).
+ # This can be checked later by looking at the value of gpuEnabled.
+
+ def init(device_idx=-1, precision=0, rank=None, comm=None, mpi_dbg=False):
+     global xp
+     global cp
+     global gpuEnabled
+     global global_precision
+     global float_dtype_list
+     global complex_dtype_list
+     global gpu_float_dtype_list
+     global gpu_complex_dtype_list
+     global array_types
+     global float_dtype
+     global complex_dtype
+     global default_target_device_idx
+     global default_target_device
+     global process_comm
+     global process_rank
+     global MPI_DBG
+
+     MPI_DBG = mpi_dbg
+     process_comm = comm
+     process_rank = rank
+
+     default_target_device_idx = device_idx
+     systemDisable = os.environ.get('SPECULA_DISABLE_GPU', 'FALSE')
+     if systemDisable=='FALSE':
+         try:
+             import cupy as cp
+             print("Cupy import successful. Installed version is:", cp.__version__)
+             gpuEnabled = True
+             cp = cp
+         except:
+             print("Cupy import failed. SPECULA will fall back to CPU use.")
+             cp = None
+             xp = np
+             default_target_device_idx=-1
+     else:
+         print("env variable SPECULA_DISABLE_GPU prevents using the GPU.")
+         cp = None
+         xp = np
+         default_target_device_idx=-1
+
+
+     if default_target_device_idx>=0:
+         xp = cp
+         gpu_float_dtype_list = [cp.float64, cp.float32]
+         gpu_complex_dtype_list = [cp.complex128, cp.complex64]
+         default_target_device = cp.cuda.Device(default_target_device_idx)
+         default_target_device.use()
+         print('Default device is GPU number ', default_target_device_idx)
+         # print('Using device: ', cp.cuda.runtime.getDeviceProperties(default_target_device)['name'])
+         # attributes = default_target_device.attributes
+         # properties = cp.cuda.runtime.getDeviceProperties(default_target_device)
+         # print('Number of multiprocessors:', attributes['MultiProcessorCount'])
+         # print('Global memory size (GB):', properties['totalGlobalMem'] / (1024**3))
+     else:
+         print('Default device is CPU')
+         xp = np
+
+     if cp is not None:
+         array_types = [np.ndarray, cp.ndarray]
+     else:
+         array_types = [np.ndarray]
+
+     float_dtype_list = [xp.float64, xp.float32]
+     complex_dtype_list = [xp.complex128, xp.complex64]
+     global_precision = precision
+     float_dtype = float_dtype_list[global_precision]
+     complex_dtype = complex_dtype_list[global_precision]
+
+     # Patch cupy's missing RandomState.random() method
+     if cp is not None:
+         cp.random.RandomState.random = cp.random.RandomState.random_sample
+
+
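A minimal usage sketch (illustrative, not part of the package): init() selects the default device and precision, and the rest of the package is imported only afterwards, as main_simul() below does:

    import specula
    specula.init(device_idx=-1, precision=1)    # CPU, single precision
    # specula.init(device_idx=0, precision=0)   # first GPU, double precision (requires cupy)
    from specula.simul import Simul             # import the rest only after init()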
+ # should be used as little as possible, and preferably outside time-critical computations
+ def cpuArray(v, dtype=None, force_copy=False):
+     return to_xp(np, v, dtype=dtype, force_copy=force_copy)
+
+
+ def to_xp(xp, v, dtype=None, force_copy=False):
+     '''
+     Make sure that v is allocated as an array on this object's device.
+     Works for all combinations of np and cp, whether installed or not.
+
+     Optionally casts to the required dtype (no copy is made if
+     the dtype is already the correct one)
+
+     The main trigger for this function is that np.array() cannot
+     be used on a cupy array.
+     '''
+     if xp is cp:
+         if isinstance(v, cp.ndarray) and not force_copy:
+             retval = v
+         else:
+             retval = cp.array(v)
+     else:
+         if cp is not None and isinstance(v, cp.ndarray):
+             retval = v.get()
+         elif isinstance(v, np.ndarray) and not force_copy:
+             # Avoid extra copy (enabled by numpy default)
+             retval = v
+         else:
+             retval = np.array(v)
+     if dtype is None and not force_copy:
+         return retval
+     else:
+         return retval.astype(dtype, copy=force_copy)
+
+
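An illustrative sketch of the conversions described above (assumes specula.init() has already been called, so that xp is bound):

    import numpy as np
    from specula import xp, to_xp, cpuArray

    data = np.arange(4, dtype=np.float32)
    on_device = to_xp(xp, data)                      # no copy if already the right array type
    as_double = to_xp(xp, data, dtype=np.float64)    # cast only when the dtype differs
    back_on_cpu = cpuArray(on_device)                # always a numpy array, with or without cupy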
+ class DummyDecoratorAndContextManager():
+     def __init__(self):
+         pass
+     def __enter__(self):
+         pass
+     def __exit__(self, *args):
+         pass
+     def __call__(self, f):
+         def caller(*args, **kwargs):
+             return f(*args, **kwargs)
+         return caller
+
+
+ def show_in_profiler(message=None, color_id=None, argb_color=None, sync=False):
+     '''
+     Decorator to allow using cupy's TimeRangeDecorator
+     in a safe way even when cupy is not installed.
+     Parameters are the same as TimeRangeDecorator
+     '''
+     try:
+         from cupyx.profiler import time_range
+
+         return time_range(message=message,
+                           color_id=color_id,
+                           argb_color=argb_color,
+                           sync=sync)
+
+     except ImportError:
+         return DummyDecoratorAndContextManager()
+
+
+ def fuse(kernel_name=None):
+     '''
+     Replacement of cupy.fuse() allowing runtime
+     dispatch to cupy or numpy.
+
+     Fused function takes an xp argument that will
+     cause it to run as a fused kernel or a standard
+     numpy function. The xp argument can be used
+     inside the function as usual.
+
+     Parameters are the same as cp.fuse()
+     '''
+     def decorator(f):
+         f_cp = functools.partial(f, xp=cp)
+         f_np = functools.partial(f, xp=np)
+         f_cpu = f_np
+         if cp:
+             f_gpu = cp.fuse(kernel_name=kernel_name)(f_cp)
+         else:
+             f_gpu = None
+         @wraps(f)
+         def wrapper(*args, xp, **kwargs):
+             if xp == cp:
+                 return f_gpu(*args, **kwargs)
+             else:
+                 return f_cpu(*args, **kwargs)
+         return wrapper
+     return decorator
+
+
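A sketch of the runtime dispatch described in the fuse() docstring (hypothetical function name; assumes specula.init() has been called):

    from specula import fuse, xp

    @fuse(kernel_name='axpy')
    def axpy(a, x, y, xp):
        return a * x + y            # fused cupy kernel on GPU, plain numpy on CPU

    result = axpy(2.0, xp.ones(8), xp.zeros(8), xp=xp)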
+ def main_simul(yml_files: list,
+                nsimul = 1,
+                cpu: bool=False,
+                overrides: str=None,
+                target: int=0,
+                profile: bool=False,
+                mpi: bool=False,
+                mpidbg: bool=False,
+                stepping: bool=False,
+                diagram: bool=False,
+                diagram_title: str=None,
+                diagram_filename: str=None,
+                diagram_colors_on: bool=False):
+
+     if mpi:
+         try:
+             from mpi4py import MPI
+             from mpi4py.util import pkl5
+             print("mpi4py import successful. Installed version is:", MPI.Get_version())
+         except ImportError:
+             print("mpi4py import failed.")
+             raise
+
+         comm = pkl5.Intracomm(MPI.COMM_WORLD)
+         rank = comm.Get_rank()
+         N = 10000000
+         datatype = MPI.FLOAT
+         num_bytes = N * (datatype.Pack_size(count=1, comm=comm) + MPI.BSEND_OVERHEAD)
+
+         print(f'MPI buffer size: {num_bytes/1024**2:.2f} MB')
+         attached_buf = bytearray(num_bytes)
+         MPI.Attach_buffer(attached_buf)
+     else:
+         rank = None
+         comm = None
+
+     if cpu:
+         target_device_idx = -1
+     else:
+         target_device_idx = target
+
+     init(target_device_idx, precision=1, rank=rank, comm=comm, mpi_dbg=mpidbg)
+     from specula.simul import Simul
+
+     if profile:
+         import cProfile
+         import pstats
+         pr = cProfile.Profile()
+         pr.enable()
+
+     for simul_idx in range(nsimul):
+         print(yml_files)
+         Simul(*yml_files,
+               simul_idx=simul_idx,
+               overrides=overrides,
+               stepping=stepping,
+               diagram=diagram,
+               diagram_filename=diagram_filename,
+               diagram_title=diagram_title,
+               diagram_colors_on=diagram_colors_on
+               ).run()
+
+     if profile:
+         pr.disable()
+         stats = pstats.Stats(pr).sort_stats("cumtime")
+         stats.print_stats(r"\((?!\_).*\)$", 200)
+
+     if mpi:
+         MPI.Detach_buffer()
+
+
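An illustrative call of this entry point from Python (the YAML file name is hypothetical):

    from specula import main_simul

    main_simul(['params.yml'], nsimul=1, cpu=True)   # one CPU run driven by a single parameter file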
specula/base_data_obj.py ADDED
@@ -0,0 +1,135 @@
+
+ import warnings
+ from copy import copy
+ from functools import lru_cache
+
+ from specula import cp, np, array_types
+ from specula.base_time_obj import BaseTimeObj
+
+
+ # We use lru_cache() instead of cache() for python 3.8 compatibility
+ @lru_cache(maxsize=None)
+ def get_properties(cls):
+     result = []
+     classlist = cls.__mro__
+     for cc in classlist:
+         result.extend([attr for attr, value in vars(cc).items() if isinstance(value, property)])
+     return result
+
+
+ class BaseDataObj(BaseTimeObj):
+     def __init__(self, target_device_idx=None, precision=None):
+         """
+         Initialize the base data object.
+
+         Parameters:
+         target_device_idx: int, optional
+             device to be targeted for data storage. Set to -1 for CPU,
+             to 0 for the first GPU device, 1 for the second GPU device, etc.
+         precision: int, optional
+             if None will use the global_precision, otherwise set to 0 for double, 1 for single
+         """
+         super().__init__(target_device_idx, precision)
+         self.generation_time = -1
+         self.tag = ''
+
+     def transferDataTo(self, destobj, force_reallocation=False):
+         '''
+         Copy CPU/GPU arrays into an existing data object:
+         iterate over all self attributes and, if a CPU or GPU array
+         is detected, copy data into *destobj* without reallocating.
+
+         Destination (CPU or GPU device) is inferred from *destobj.target_device_idx*,
+         which must be set correctly before calling this method.
+         '''
+         # Get a list of all attributes, but skip properties
+         pp = get_properties(type(self))
+         attr_list = [attr for attr in dir(self) if attr not in pp]
+
+         for attr in attr_list:
+             self_attr = getattr(self, attr)
+             self_type = type(self_attr)
+             if self_type not in array_types:
+                 continue
+
+             dest_attr = getattr(destobj, attr)
+             dest_type = type(dest_attr)
+
+             if dest_type not in array_types:
+                 print(f'Warning: destination attribute is not a cupy/numpy array, forcing reallocation ({destobj}.{attr})')
+                 force_reallocation = True
+
+             # Destination array had the correct type: perform in-place data copy
+             if not force_reallocation:
+                 # Detect whether the array types are correct for all four cases:
+                 # device-to-CPU, CPU-to-device, device-to-device, and CPU-to-CPU. Also check whether
+                 # the target_device_idx is set correctly for the destination object.
+                 DtD = cp is not None and (self_type == cp.ndarray) and (dest_type == cp.ndarray) and destobj.target_device_idx >= 0
+                 DtH = cp is not None and (self_type == cp.ndarray) and (dest_type == np.ndarray) and destobj.target_device_idx == -1
+                 HtD = cp is not None and (self_type == np.ndarray) and (dest_type == cp.ndarray) and destobj.target_device_idx >= 0
+                 HtH = (self_type == np.ndarray) and (dest_type == np.ndarray) and destobj.target_device_idx == -1
+                 if DtD:
+                     # Performance warnings here are expected, because we might
+                     # trigger a peer-to-peer transfer between devices
+                     with warnings.catch_warnings():
+                         if self.PerformanceWarning:
+                             warnings.simplefilter("ignore", category=self.PerformanceWarning)
+                         try:
+                             dest_attr[:] = self_attr
+                         except:
+                             dest_attr = self_attr
+                 elif DtH:
+                     # Do not set blocking=True for cupy 12.x compatibility.
+                     # Blocking is True by default in later versions anyway
+                     self_attr.get(out=dest_attr)
+                 elif HtD:
+                     dest_attr.set(self_attr)
+                 elif HtH:
+                     dest_attr[:] = self_attr
+                 else:
+                     print(f'Warning: mismatch between target_device_idx and array allocation, forcing reallocation ({destobj}.{attr})')
+                     force_reallocation = True
+
+             # Otherwise, reallocate
+             if force_reallocation:
+                 DtD = cp is not None and (self_type == cp.ndarray) and destobj.target_device_idx >= 0
+                 DtH = cp is not None and (self_type == cp.ndarray) and destobj.target_device_idx == -1
+                 HtD = (self_type == np.ndarray) and destobj.target_device_idx >= 0
+                 HtH = (self_type == np.ndarray) and destobj.target_device_idx == -1
+
+                 if DtD:
+                     # Performance warnings here are expected, because we might
+                     # trigger a peer-to-peer transfer between devices
+                     with warnings.catch_warnings():
+                         if self.PerformanceWarning:
+                             warnings.simplefilter("ignore", category=self.PerformanceWarning)
+                         setattr(destobj, attr, cp.asarray(self_attr))
+                 if DtH:
+                     # Do not set blocking=True for cupy 12.x compatibility.
+                     # Blocking is True by default in later versions anyway
+                     setattr(destobj, attr, self_attr.get())
+                 if HtD:
+                     setattr(destobj, attr, cp.asarray(self_attr))
+                 if HtH:
+                     setattr(destobj, attr, np.asarray(self_attr))
+
+         destobj.generation_time = self.generation_time
+
+     def copyTo(self, target_device_idx):
+         '''
+         Duplicate a data object on another device,
+         allocating all CPU/GPU arrays on the new device.
+         '''
+         if target_device_idx == self.target_device_idx:
+             return self
+         else:
+             cloned = copy(self)
+
+             if target_device_idx >= 0:
+                 cloned.xp = cp
+             else:
+                 cloned.xp = np
+             cloned.target_device_idx = target_device_idx
+
+             self.transferDataTo(cloned, force_reallocation=True)
+             return cloned
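A sketch of how a data object is moved between devices (hypothetical subclass; assumes BaseTimeObj exposes self.xp, as copyTo() above suggests, and that specula.init() has been called):

    import numpy as np
    from specula.base_data_obj import BaseDataObj

    class Vector(BaseDataObj):                       # hypothetical data object, for illustration only
        def __init__(self, n, target_device_idx=-1):
            super().__init__(target_device_idx=target_device_idx)
            self.v = self.xp.zeros(n, dtype=np.float32)

    a = Vector(10, target_device_idx=-1)             # CPU-resident object
    b = a.copyTo(0)                                  # clone with arrays reallocated on GPU 0 (requires cupy)
    a.transferDataTo(b)                              # later updates are copied in place, no reallocation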
specula/base_processing_obj.py ADDED
@@ -0,0 +1,267 @@
+ from collections import defaultdict
+
+ from specula import cpuArray, default_target_device, cp, MPI_DBG, MPI_SEND_DBG
+ from specula import show_in_profiler
+ from specula import process_comm, process_rank
+ from specula.base_time_obj import BaseTimeObj
+ from specula.data_objects.layer import Layer
+
+
+ class BaseProcessingObj(BaseTimeObj):
+
+     _streams = {}
+
+     def __init__(self, target_device_idx=None, precision=None):
+         """
+         Initialize the base processing object.
+
+         Parameters:
+         precision (int, optional): if None will use the global_precision, otherwise pass 0 for double, 1 for single
+         target_device_idx (int, optional): if None will use the default_target_device_idx, otherwise pass -1 for cpu, i for GPU of index i
+         """
+         BaseTimeObj.__init__(self, target_device_idx=target_device_idx, precision=precision)
+
+         self.current_time = 0
+         self.current_time_seconds = 0
+
+         self.verbose = 0
+
+         # Stream/input management
+         self.stream = None
+         self.inputs_changed = False
+         self.cuda_graph = None
+
+         # Will be populated by derived class
+         self.inputs = {}
+         self.local_inputs = {}
+         self.outputs = {}
+         self.remote_outputs = defaultdict(list)
+         self.sent_valid = {}
+
+         # Use the correct CUDA device for allocations in derived classes' __init__
+         if self.target_device_idx >= 0:
+             self._target_device.use()
+
+         # Default name if none is given externally
+         self.name = self.__class__.__name__
+
+     # Use the correct CUDA device for allocations in derived classes' prepare_trigger()
+     def prepare_trigger(self, t):
+         self.current_time_seconds = self.t_to_seconds(self.current_time)
+         if self.target_device_idx >= 0:
+             self._target_device.use()
+
+     def addRemoteOutput(self, name, remote_output):
+         self.remote_outputs[name].append(remote_output)
+
+     def checkInputTimes(self):
+         if len(self.inputs)==0:
+             return True
+         self.get_all_inputs()
+         for input_name, input_obj in self.local_inputs.items():
+             if type(input_obj) is not list:
+                 input_obj = [input_obj]
+
+             tt_list = [x.generation_time for x in input_obj if x is not None]
+             for tt in tt_list:
+                 if tt is not None and tt >= self.current_time:
+                     return True
+         else:
+             return False
+
+     def get_all_inputs(self):
+         '''
+         Perform get() on all inputs.
+         Remote inputs, if any, are received via MPI.
+         Data is transferred between devices if necessary.
+         '''
+         for input_name, input_obj in self.inputs.items():
+             if MPI_DBG: print(process_rank, 'get_all_inputs(): getting InputValue:',
+                               input_name, flush=True)
+             # Set additional info for better error messages
+             input_obj.requesting_obj_name = self.name
+             input_obj.input_name = input_name
+             self.local_inputs[input_name] = input_obj.get(self.target_device_idx)
+
+         if MPI_DBG:
+             print(process_rank, self.name, 'My inputs are:')
+             for in_name, in_value in self.local_inputs.items():
+                 if type(in_value) is list:
+                     if len(in_value) > 0 and type(in_value[0]) is Layer:
+                         print(process_rank, in_name,
+                               [(x.generation_time, x.phaseInNm) for x in in_value],
+                               flush=True)
+                     else:
+                         print(process_rank, in_name,
+                               [(x.generation_time, x) for x in in_value],
+                               flush=True)
+                 else:
+                     print(process_rank, in_name,
+                           in_value.generation_time if in_value is not None else None,
+                           in_value, type(in_value), flush=True)
+
+     def trigger_code(self):
+         '''
+         Any code implemented by derived classes must:
+         1) only perform GPU operations using the xp module
+            on arrays allocated with self.xp
+         2) avoid any explicit numpy or plain Python operation
+         3) NOT use any value in variables that are reallocated by prepare_trigger() or post_trigger(),
+            and in general avoid any value defined outside this class (like object inputs)
+
+         because if stream capture is used, a CUDA graph will be generated that will skip
+         over any non-GPU operation and re-use the GPU memory addresses of its first run.
+
+         Defining local variables inside this function is OK, they will persist in GPU memory.
+         '''
+         pass
+
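A minimal sketch of a derived class that respects the constraints listed in the trigger_code() docstring (hypothetical class; input/output wiring omitted; assumes self.xp is provided by BaseTimeObj):

    from specula.base_processing_obj import BaseProcessingObj

    class Squarer(BaseProcessingObj):                # hypothetical processing object
        def __init__(self, n, target_device_idx=None):
            super().__init__(target_device_idx=target_device_idx)
            self.buf = self.xp.zeros(n)              # allocated once, reused at every step

        def trigger_code(self):
            # xp-only math on pre-allocated arrays: safe for CUDA stream capture
            self.buf[:] = self.xp.square(self.buf)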
+     def post_trigger(self):
+         '''
+         Make sure we are using the correct device and that any previous
+         CUDA graph has been synchronized
+         '''
+         # Double check that we can execute
+         if not self.inputs_changed:
+             raise RuntimeError("trigger() called when the object's inputs have not changed")
+
+         # Reset inputs flag
+         self.inputs_changed = False
+
+         if self.target_device_idx>=0:
+             self._target_device.use()
+             if self.cuda_graph:
+                 self.stream.synchronize()
+
+
+     def send_remote_output(self, item, dest_rank, dest_tag, first_mpi_send=True, out_name=''):
+         if MPI_SEND_DBG: print(process_rank, f'SEND to rank {dest_rank} {dest_tag=} {(dest_tag in self.sent_valid)=} (from {self.name}.{out_name})', flush=True)
+         if first_mpi_send or not dest_tag in self.sent_valid:
+             if MPI_SEND_DBG: print(process_rank, 'SEND with Pickle', dest_tag, flush=True)
+             xp_orig = item.xp
+             item.xp = 0
+             process_comm.ibsend(item, dest=dest_rank, tag=dest_tag)
+             item.xp = xp_orig
+         else:
+             buffer = item.get_value()
+             if MPI_SEND_DBG: print(process_rank, dest_tag, 'SEND .device', buffer.device)
+             if MPI_SEND_DBG: print(process_rank, 'SEND with Buffer', dest_tag, type(buffer), buffer, flush=True)
+             if MPI_SEND_DBG: print(process_rank, 'SEND with Buffer type', dest_tag, buffer.dtype, flush=True)
+
+             process_comm.Ibsend(cpuArray(buffer), dest=dest_rank, tag=dest_tag)
+
+         process_comm.ibsend(item.generation_time, dest=dest_rank, tag=dest_tag+1)
+         if item.get_value() is not None:
+             self.sent_valid[dest_tag] = True
+
+
+     # This method implements the MPI send calls for the outputs connected to remote inputs
+     def send_outputs(self, skip_delayed=False, delayed_only=False, first_mpi_send=True):
+         '''
+         Send all remote outputs via MPI.
+         If *skip_delayed* is True, skip sending all delayed outputs.
+         Used during the last iteration when the simulation is ending and
+         no one would receive the delayed inputs.
+         If *delayed_only* is True, only send the delayed outputs.
+         Used while setting up the simulation, to initialize outputs
+         that are delayed and thus would not be received otherwise.
+         '''
+         if MPI_DBG:
+             print(process_rank, self.name, 'My outputs are:')
+             for out_name, out_value in self.outputs.items():
+                 print(process_rank, out_name, out_value, flush=True)
+
+         if MPI_DBG: print(process_rank, 'send_outputs', flush=True)
+         for out_name, remote_specs in self.remote_outputs.items():
+             for remote_spec in remote_specs:
+                 dest_rank, dest_tag, delay = remote_spec
+                 # avoid sending outputs that will not be received
+                 # because the simulation is ending
+                 if delay < 0 and skip_delayed:
+                     if MPI_SEND_DBG: print(process_rank, f'SKIPPED SEND to rank {dest_rank} {dest_tag=} due to delay={delay}', flush=True)
+                     continue
+                 if delay >= 0 and delayed_only:
+                     if MPI_SEND_DBG: print(process_rank, f'SKIPPED SEND to rank {dest_rank} {dest_tag=} due to delay={delay}', flush=True)
+                     continue
+                 if MPI_DBG: print(process_rank, 'Sending ', out_name, 'to ', dest_rank, 'with tag', dest_tag, type(self.outputs[out_name]), flush=True)
+                 # workaround because module objects cannot be pickled
+                 for item in self.outputs[out_name] if isinstance(self.outputs[out_name], list) else [self.outputs[out_name]]:
+                     self.send_remote_output(item, dest_rank, dest_tag, first_mpi_send, out_name)
+
+     @classmethod
+     def device_stream(cls, target_device_idx):
+         if not target_device_idx in cls._streams:
+             cls._streams[target_device_idx] = cp.cuda.Stream(non_blocking=False)
+         return cls._streams[target_device_idx]
+
+     def build_stream(self, allow_parallel=True):
+         if self.target_device_idx>=0:
+             self._target_device.use()
+             if allow_parallel:
+                 self.stream = cp.cuda.Stream(non_blocking=False)
+             else:
+                 self.stream = self.device_stream(self.target_device_idx)
+             self.capture_stream()
+             default_target_device.use()
+
+     def capture_stream(self):
+         with self.stream:
+             # First execution is needed to build the FFT plan cache
+             # See for example https://github.com/cupy/cupy/issues/7559
+             self.trigger_code()
+             self.stream.begin_capture()
+             self.trigger_code()
+             self.cuda_graph = self.stream.end_capture()
+
+     def check_ready(self, t):
+         self.current_time = t
+         if self.target_device_idx >= 0:
+             self._target_device.use()
+         if self.checkInputTimes():
+             self.inputs_changed = True  # Signal ready for trigger() and post_trigger()
+             self.prepare_trigger(t)
+         else:
+             self.inputs_changed = False
+             if self.verbose:
+                 print('No inputs have been refreshed, skipping trigger')
+         return self.inputs_changed
+
+     def trigger(self):
+         # Double check that we can execute
+         if not self.inputs_changed:
+             raise RuntimeError("trigger() called when the object's inputs have not changed")
+
+         with show_in_profiler(self.__class__.__name__+'.trigger'):
+             if self.target_device_idx>=0:
+                 self._target_device.use()
+             if self.target_device_idx>=0 and self.cuda_graph:
+                 self.cuda_graph.launch(stream=self.stream)
+             else:
+                 self.trigger_code()
+
+     def setup(self):
+         """
+         Override this method to perform any setup
+         just before the simulation is started.
+
+         The base class implementation also checks that
+         all non-optional inputs have been set.
+
+         """
+         if self.target_device_idx >= 0:
+             self._target_device.use()
+
+         self.get_all_inputs()
+         for input_name, input in self.inputs.items():
+             if self.local_inputs[input_name] is None and not input.optional:
+                 raise ValueError(f'Input {input_name} for object {self} has not been set')
+
+     def finalize(self):
+         '''
+         Override this method to perform any actions after
+         the simulation is completed
+         '''
+         pass
+
+
+
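For reference, the per-object call sequence that a driver is expected to follow, based on the methods above (schematic only, not the actual loop_control.py implementation):

    obj.setup()                      # once, before the simulation starts
    for t in time_steps:             # simulation times, in the same units as current_time
        if obj.check_ready(t):       # refresh inputs; True if any input was regenerated
            obj.trigger()            # run trigger_code(), possibly as a captured CUDA graph
            obj.post_trigger()       # synchronize the stream and clear the inputs_changed flag
    obj.finalize()                   # once, after the simulation is completed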