nabu 2023.2.1__py3-none-any.whl → 2024.1.0rc3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (183)
  1. doc/conf.py +1 -1
  2. doc/doc_config.py +32 -0
  3. nabu/__init__.py +2 -1
  4. nabu/app/bootstrap_stitching.py +1 -1
  5. nabu/app/cli_configs.py +122 -2
  6. nabu/app/composite_cor.py +27 -2
  7. nabu/app/correct_rot.py +70 -0
  8. nabu/app/create_distortion_map_from_poly.py +42 -18
  9. nabu/app/diag_to_pix.py +358 -0
  10. nabu/app/diag_to_rot.py +449 -0
  11. nabu/app/generate_header.py +4 -3
  12. nabu/app/histogram.py +2 -2
  13. nabu/app/multicor.py +6 -1
  14. nabu/app/parse_reconstruction_log.py +151 -0
  15. nabu/app/prepare_weights_double.py +83 -22
  16. nabu/app/reconstruct.py +5 -1
  17. nabu/app/reconstruct_helical.py +7 -0
  18. nabu/app/reduce_dark_flat.py +6 -3
  19. nabu/app/rotate.py +4 -4
  20. nabu/app/stitching.py +16 -2
  21. nabu/app/tests/test_reduce_dark_flat.py +18 -2
  22. nabu/app/validator.py +4 -4
  23. nabu/cuda/convolution.py +8 -376
  24. nabu/cuda/fft.py +4 -0
  25. nabu/cuda/kernel.py +4 -4
  26. nabu/cuda/medfilt.py +5 -158
  27. nabu/cuda/padding.py +5 -71
  28. nabu/cuda/processing.py +23 -2
  29. nabu/cuda/src/ElementOp.cu +78 -0
  30. nabu/cuda/src/backproj.cu +28 -2
  31. nabu/cuda/src/fourier_wavelets.cu +2 -2
  32. nabu/cuda/src/normalization.cu +23 -0
  33. nabu/cuda/src/padding.cu +2 -2
  34. nabu/cuda/src/transpose.cu +16 -0
  35. nabu/cuda/utils.py +39 -0
  36. nabu/estimation/alignment.py +10 -1
  37. nabu/estimation/cor.py +808 -38
  38. nabu/estimation/cor_sino.py +7 -9
  39. nabu/estimation/tests/test_cor.py +85 -3
  40. nabu/io/reader.py +26 -18
  41. nabu/io/tests/test_cast_volume.py +3 -3
  42. nabu/io/tests/test_detector_distortion.py +3 -3
  43. nabu/io/tiffwriter_zmm.py +2 -2
  44. nabu/io/utils.py +14 -4
  45. nabu/io/writer.py +5 -3
  46. nabu/misc/fftshift.py +6 -0
  47. nabu/misc/histogram.py +5 -285
  48. nabu/misc/histogram_cuda.py +8 -104
  49. nabu/misc/kernel_base.py +3 -121
  50. nabu/misc/padding_base.py +5 -69
  51. nabu/misc/processing_base.py +3 -107
  52. nabu/misc/rotation.py +5 -62
  53. nabu/misc/rotation_cuda.py +5 -65
  54. nabu/misc/transpose.py +6 -0
  55. nabu/misc/unsharp.py +3 -78
  56. nabu/misc/unsharp_cuda.py +5 -52
  57. nabu/misc/unsharp_opencl.py +8 -85
  58. nabu/opencl/fft.py +6 -0
  59. nabu/opencl/kernel.py +21 -6
  60. nabu/opencl/padding.py +5 -72
  61. nabu/opencl/processing.py +27 -5
  62. nabu/opencl/src/backproj.cl +3 -3
  63. nabu/opencl/src/fftshift.cl +65 -12
  64. nabu/opencl/src/padding.cl +2 -2
  65. nabu/opencl/src/roll.cl +96 -0
  66. nabu/opencl/src/transpose.cl +16 -0
  67. nabu/pipeline/config_validators.py +63 -3
  68. nabu/pipeline/dataset_validator.py +2 -2
  69. nabu/pipeline/estimators.py +193 -35
  70. nabu/pipeline/fullfield/chunked.py +34 -17
  71. nabu/pipeline/fullfield/chunked_cuda.py +7 -5
  72. nabu/pipeline/fullfield/computations.py +48 -13
  73. nabu/pipeline/fullfield/nabu_config.py +13 -13
  74. nabu/pipeline/fullfield/processconfig.py +10 -5
  75. nabu/pipeline/fullfield/reconstruction.py +1 -2
  76. nabu/pipeline/helical/fbp.py +5 -0
  77. nabu/pipeline/helical/filtering.py +12 -9
  78. nabu/pipeline/helical/gridded_accumulator.py +179 -33
  79. nabu/pipeline/helical/helical_chunked_regridded.py +262 -151
  80. nabu/pipeline/helical/helical_chunked_regridded_cuda.py +4 -11
  81. nabu/pipeline/helical/helical_reconstruction.py +56 -18
  82. nabu/pipeline/helical/span_strategy.py +1 -1
  83. nabu/pipeline/helical/tests/test_accumulator.py +4 -0
  84. nabu/pipeline/params.py +23 -2
  85. nabu/pipeline/processconfig.py +3 -8
  86. nabu/pipeline/tests/test_chunk_reader.py +78 -0
  87. nabu/pipeline/tests/test_estimators.py +120 -2
  88. nabu/pipeline/utils.py +25 -0
  89. nabu/pipeline/writer.py +2 -0
  90. nabu/preproc/ccd_cuda.py +9 -7
  91. nabu/preproc/ctf.py +21 -26
  92. nabu/preproc/ctf_cuda.py +25 -25
  93. nabu/preproc/double_flatfield.py +14 -2
  94. nabu/preproc/double_flatfield_cuda.py +7 -11
  95. nabu/preproc/flatfield_cuda.py +23 -27
  96. nabu/preproc/phase.py +19 -24
  97. nabu/preproc/phase_cuda.py +21 -21
  98. nabu/preproc/shift_cuda.py +58 -28
  99. nabu/preproc/tests/test_ctf.py +5 -5
  100. nabu/preproc/tests/test_double_flatfield.py +2 -2
  101. nabu/preproc/tests/test_vshift.py +13 -2
  102. nabu/processing/__init__.py +0 -0
  103. nabu/processing/convolution_cuda.py +375 -0
  104. nabu/processing/fft_base.py +163 -0
  105. nabu/processing/fft_cuda.py +256 -0
  106. nabu/processing/fft_opencl.py +54 -0
  107. nabu/processing/fftshift.py +134 -0
  108. nabu/processing/histogram.py +286 -0
  109. nabu/processing/histogram_cuda.py +103 -0
  110. nabu/processing/kernel_base.py +126 -0
  111. nabu/processing/medfilt_cuda.py +159 -0
  112. nabu/processing/muladd.py +29 -0
  113. nabu/processing/muladd_cuda.py +68 -0
  114. nabu/processing/padding_base.py +71 -0
  115. nabu/processing/padding_cuda.py +75 -0
  116. nabu/processing/padding_opencl.py +77 -0
  117. nabu/processing/processing_base.py +123 -0
  118. nabu/processing/roll_opencl.py +64 -0
  119. nabu/processing/rotation.py +63 -0
  120. nabu/processing/rotation_cuda.py +66 -0
  121. nabu/processing/tests/__init__.py +0 -0
  122. nabu/processing/tests/test_fft.py +268 -0
  123. nabu/processing/tests/test_fftshift.py +71 -0
  124. nabu/{misc → processing}/tests/test_histogram.py +2 -4
  125. nabu/{cuda → processing}/tests/test_medfilt.py +1 -1
  126. nabu/processing/tests/test_muladd.py +54 -0
  127. nabu/{cuda → processing}/tests/test_padding.py +119 -75
  128. nabu/processing/tests/test_roll.py +63 -0
  129. nabu/{misc → processing}/tests/test_rotation.py +3 -2
  130. nabu/processing/tests/test_transpose.py +72 -0
  131. nabu/{misc → processing}/tests/test_unsharp.py +41 -8
  132. nabu/processing/transpose.py +126 -0
  133. nabu/processing/unsharp.py +79 -0
  134. nabu/processing/unsharp_cuda.py +53 -0
  135. nabu/processing/unsharp_opencl.py +75 -0
  136. nabu/reconstruction/fbp.py +34 -10
  137. nabu/reconstruction/fbp_base.py +35 -16
  138. nabu/reconstruction/fbp_opencl.py +7 -12
  139. nabu/reconstruction/filtering.py +2 -2
  140. nabu/reconstruction/filtering_cuda.py +13 -14
  141. nabu/reconstruction/filtering_opencl.py +3 -4
  142. nabu/reconstruction/projection.py +2 -0
  143. nabu/reconstruction/rings.py +158 -1
  144. nabu/reconstruction/rings_cuda.py +218 -58
  145. nabu/reconstruction/sinogram_cuda.py +16 -12
  146. nabu/reconstruction/tests/test_deringer.py +116 -14
  147. nabu/reconstruction/tests/test_fbp.py +22 -31
  148. nabu/reconstruction/tests/test_filtering.py +11 -2
  149. nabu/resources/dataset_analyzer.py +89 -26
  150. nabu/resources/nxflatfield.py +2 -2
  151. nabu/resources/tests/test_nxflatfield.py +1 -1
  152. nabu/resources/utils.py +9 -2
  153. nabu/stitching/alignment.py +184 -0
  154. nabu/stitching/config.py +241 -39
  155. nabu/stitching/definitions.py +6 -0
  156. nabu/stitching/frame_composition.py +4 -2
  157. nabu/stitching/overlap.py +99 -3
  158. nabu/stitching/sample_normalization.py +60 -0
  159. nabu/stitching/slurm_utils.py +10 -10
  160. nabu/stitching/tests/test_alignment.py +99 -0
  161. nabu/stitching/tests/test_config.py +16 -1
  162. nabu/stitching/tests/test_overlap.py +68 -2
  163. nabu/stitching/tests/test_sample_normalization.py +49 -0
  164. nabu/stitching/tests/test_slurm_utils.py +5 -5
  165. nabu/stitching/tests/test_utils.py +3 -33
  166. nabu/stitching/tests/test_z_stitching.py +391 -22
  167. nabu/stitching/utils.py +144 -202
  168. nabu/stitching/z_stitching.py +309 -126
  169. nabu/testutils.py +18 -0
  170. nabu/thirdparty/tomocupy_remove_stripe.py +586 -0
  171. nabu/utils.py +32 -6
  172. {nabu-2023.2.1.dist-info → nabu-2024.1.0rc3.dist-info}/LICENSE +1 -1
  173. {nabu-2023.2.1.dist-info → nabu-2024.1.0rc3.dist-info}/METADATA +5 -5
  174. nabu-2024.1.0rc3.dist-info/RECORD +296 -0
  175. {nabu-2023.2.1.dist-info → nabu-2024.1.0rc3.dist-info}/WHEEL +1 -1
  176. {nabu-2023.2.1.dist-info → nabu-2024.1.0rc3.dist-info}/entry_points.txt +5 -1
  177. nabu/conftest.py +0 -14
  178. nabu/opencl/fftshift.py +0 -92
  179. nabu/opencl/tests/test_fftshift.py +0 -55
  180. nabu/opencl/tests/test_padding.py +0 -84
  181. nabu-2023.2.1.dist-info/RECORD +0 -252
  182. /nabu/cuda/src/{fftshift.cu → dfi_fftshift.cu} +0 -0
  183. {nabu-2023.2.1.dist-info → nabu-2024.1.0rc3.dist-info}/top_level.txt +0 -0

nabu/preproc/shift_cuda.py
@@ -1,24 +1,25 @@
 import numpy as np
-from math import floor
-from .shift import VerticalShift
 from ..cuda.utils import __has_pycuda__
-
-if __has_pycuda__:
-    import pycuda.gpuarray as garray
+from ..cuda.processing import CudaProcessing
+from ..processing.muladd_cuda import CudaMulAdd
+from .shift import VerticalShift


 class CudaVerticalShift(VerticalShift):
-    def __init__(self, radios_shape, shifts):
+    def __init__(self, radios_shape, shifts, **cuda_options):
         """
         Vertical Shifter, Cuda backend.
         """
         super().__init__(radios_shape, shifts)
+        self.cuda_processing = CudaProcessing(**(cuda_options or {}))
         self._init_cuda_arrays()

     def _init_cuda_arrays(self):
         interp_infos_arr = np.zeros((len(self.interp_infos), 2), "f")
-        self._d_interp_infos = garray.to_gpu(interp_infos_arr)
-        self._d_radio_tmp = garray.zeros(self.radios_shape[1:], "f")
+        self._d_interp_infos = self.cuda_processing.to_device("_d_interp_infos", interp_infos_arr)
+        self._d_radio_new = self.cuda_processing.allocate_array("_d_radio_new", self.radios_shape[1:], "f")
+        self._d_radio = self.cuda_processing.allocate_array("_d_radio", self.radios_shape[1:], "f")
+        self.muladd_kernel = CudaMulAdd(ctx=self.cuda_processing.ctx)

     def apply_vertical_shifts(self, radios, iangles, output=None):
         """
@@ -35,38 +36,67 @@ class CudaVerticalShift(VerticalShift):
             Must be of the same shape of `radios`.
         """
         self._check(radios, iangles)
-        n_z = self.radios_shape[1]
+        n_a, n_z, n_x = radios.shape
+        assert n_z == self.radios_shape[1]
+        x_slice = slice(0, n_x)  # slice(None, None)
+
+        def nonempty_subregion(region):
+            if region is None:
+                return True
+            z_slice = region[0]
+            return z_slice.stop - z_slice.start > 0
+
+        d_radio_new = self._d_radio_new
+        d_radio = self._d_radio

         for ia in iangles:
-            radio = radios[ia]
-            self._d_radio_tmp.fill(0)
+            d_radio_new.fill(0)
+            d_radio[:] = radios[ia, :, :]  # mul-add kernel won't work with pycuda view
             S0, f = self.interp_infos[ia]
-            s0 = S0
+            f = np.float32(f)

+            s0 = S0
             if s0 > 0:
-                self._d_radio_tmp[:-s0] = radio[s0:]
-                self._d_radio_tmp[:-s0] *= 1 - f
-
+                # newradio[:-s0] = radio[s0:] * (1 - f)
+                dst_region = (slice(0, n_z - s0), x_slice)
+                other_region = (slice(s0, n_z), x_slice)
             elif s0 == 0:
-                self._d_radio_tmp[:] = radio[s0:]
-                self._d_radio_tmp[:] *= 1 - f
+                # newradio[:] = radio[s0:] * (1 - f)
+                dst_region = None
+                other_region = (slice(s0, n_z), x_slice)
             else:
-                self._d_radio_tmp[-s0:] = radio[:s0]
-                self._d_radio_tmp[-s0:] *= 1 - f
+                # newradio[-s0:] = radio[:s0] * (1 - f)
+                dst_region = (slice(-s0, n_z), x_slice)
+                other_region = (slice(0, n_z + s0), x_slice)

-            s0 = S0 + 1
-            f = np.float32(f)
+            if all([nonempty_subregion(reg) for reg in [dst_region, other_region]]):
+                self.muladd_kernel(
+                    d_radio_new,
+                    d_radio,
+                    1,
+                    1 - f,
+                    dst_region=dst_region,
+                    other_region=other_region,
+                )

-            # "radios[] * f" is out of place but 2D
+            s0 = S0 + 1
             if s0 > 0:
-                if s0 < n_z:
-                    self._d_radio_tmp[:-s0] += radio[s0:] * f
+                # newradio[:-s0] += radio[s0:] * f
+                dst_region = (slice(0, n_z - s0), x_slice)
+                other_region = (slice(s0, n_z), x_slice)
             elif s0 == 0:
-                self._d_radio_tmp[:] += radio[s0:] * f
+                # newradio[:] += radio[s0:] * f
+                dst_region = None
+                other_region = (slice(s0, n_z), x_slice)
             else:
-                self._d_radio_tmp[-s0:] += radio[:s0] * f
+                # newradio[-s0:] += radio[:s0] * f
+                dst_region = (slice(-s0, n_z), x_slice)
+                other_region = (slice(0, n_z + s0), x_slice)
+
+            if all([nonempty_subregion(reg) for reg in [dst_region, other_region]]):
+                self.muladd_kernel(d_radio_new, d_radio, 1, f, dst_region=dst_region, other_region=other_region)

             if output is None:
-                radios[ia, :, :] = self._d_radio_tmp[:]
+                radios[ia, :, :] = d_radio_new[:, :]
             else:
-                output[ia, :, :] = self._d_radio_tmp[:]
+                output[ia, :, :] = d_radio_new[:, :]
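
The hunk above replaces the per-slice gpuarray arithmetic with two CudaMulAdd kernel calls per projection; the interpolation itself is unchanged. For reference, a minimal NumPy sketch of what one shifted radiograph becomes: a shift of S0 + f pixels (S0 integer, 0 <= f < 1) blends the image shifted by S0, weighted by 1 - f, with the image shifted by S0 + 1, weighted by f, as in the commented formulas. The helper below is illustrative only and not part of nabu's API.

import numpy as np

def vertical_shift_reference(radio, S0, f):
    # Reference (CPU) version of the weighted shift computed by the Cuda code above.
    n_z = radio.shape[0]
    new_radio = np.zeros_like(radio)

    def accumulate(s0, weight):
        # new_radio[dst] += radio[src] * weight, with dst/src depending on the sign of s0
        if s0 > 0:
            if s0 < n_z:
                new_radio[:-s0] += radio[s0:] * weight
        elif s0 == 0:
            new_radio[:] += radio * weight
        else:
            new_radio[-s0:] += radio[:s0] * weight

    accumulate(S0, 1 - f)  # integer part of the shift
    accumulate(S0 + 1, f)  # next integer shift, weighted by the fractional part
    return new_radio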

nabu/preproc/tests/test_ctf.py
@@ -213,11 +213,11 @@ class TestCtf:
         phase_r2c = ctf_numpy.retrieve_phase(img)
         self.check_result(phase_r2c, self.ref_plain, "Something wrong with CtfFilter-R2C")

-        # Test FFTW
-        ctf_fftw = ctf.CtfFilter(*ctf_args, **ctf_kwargs, use_rfft=True, fftw_num_threads=-1)
-        if ctf_fftw.use_rfft:
-            phase_fftw = ctf_fftw.retrieve_phase(img)
-            self.check_result(phase_r2c, self.ref_plain, "Something wrong with CtfFilter-FFTW")
+        # Test multi-core FFT
+        ctf_fft = ctf.CtfFilter(*ctf_args, **ctf_kwargs, use_rfft=True, fft_num_threads=0)
+        if ctf_fft.use_rfft:
+            phase_fft = ctf_fft.retrieve_phase(img)
+            self.check_result(phase_r2c, self.ref_plain, "Something wrong with CtfFilter-FFT")

     @pytest.mark.skipif(not (__has_pycuda__ and __has_cufft__), reason="pycuda and scikit-cuda")
     def test_cuda_ctf(self):

nabu/preproc/tests/test_double_flatfield.py
@@ -4,7 +4,7 @@ import tempfile
 import numpy as np
 import pytest
 from silx.io.url import DataUrl
-from tomoscan.esrf.mock import MockHDF5
+from tomoscan.esrf.mock import MockNXtomo
 from nabu.io.reader import HDF5Reader
 from nabu.preproc.double_flatfield import DoubleFlatField
 from nabu.cuda.utils import __has_pycuda__, get_cuda_context
@@ -20,7 +20,7 @@ def bootstrap(request):
     cls.tmpdir = tempfile.TemporaryDirectory()
     dname = cls.tmpdir.name
     cls.dname = dname
-    radios = MockHDF5(
+    radios = MockNXtomo(
         path.join(dname, "tmp"),
         10,
         n_ini_proj=10,

nabu/preproc/tests/test_vshift.py
@@ -5,7 +5,8 @@ from nabu.preproc.shift import VerticalShift
 from nabu.cuda.utils import __has_pycuda__, get_cuda_context

 if __has_pycuda__:
-    from nabu.preproc.shift_cuda import CudaVerticalShift, garray
+    import pycuda.gpuarray as garray
+    from nabu.preproc.shift_cuda import CudaVerticalShift


 @pytest.fixture(scope="class")
@@ -51,12 +52,22 @@ class TestVerticalShift:
     @pytest.mark.skipif(not (__has_pycuda__), reason="Need cuda/pycuda for this test")
     def test_cuda_vshift(self):
         d_radios = garray.to_gpu(self.radios)
+        d_radios2 = d_radios.copy()
         d_out = garray.zeros_like(d_radios)

         Shifter = CudaVerticalShift(d_radios.shape, self.shifts)
-
         Shifter.apply_vertical_shifts(d_radios, self.indexes, output=d_out)
         assert abs(d_out.get() - self.golden).max() < self.tol

         Shifter.apply_vertical_shifts(d_radios, self.indexes)
         assert abs(d_radios.get() - self.golden).max() < self.tol
+
+        # Test with negative shifts
+        radios2 = self.radios.copy()
+        Shifter_neg = VerticalShift(self.radios.shape, -self.shifts)
+        Shifter_neg.apply_vertical_shifts(radios2, self.indexes)
+
+        Shifter_neg_cuda = CudaVerticalShift(d_radios.shape, -self.shifts)
+        Shifter_neg_cuda.apply_vertical_shifts(d_radios2, self.indexes)
+        err_max = np.max(np.abs(d_radios2.get() - radios2))
+        assert err_max < 1e-6, "Something wrong for negative translations: max error = %.2e" % err_max

nabu/processing/convolution_cuda.py
@@ -0,0 +1,375 @@
+from os.path import dirname
+import numpy as np
+from ..utils import updiv, get_cuda_srcfile
+from ..cuda.utils import __has_pycuda__
+from ..misc.utils import ConvolutionInfos
+from ..cuda.processing import CudaProcessing
+
+if __has_pycuda__:
+    from pycuda.compiler import SourceModule
+
+
+class Convolution:
+    """
+    A class for performing convolution on GPU with CUDA, but without using
+    textures (unlike for example in ``silx.opencl.convolution``)
+    """
+
+    def __init__(self, shape, kernel, axes=None, mode=None, extra_options=None, cuda_options=None):
+        """
+        Constructor of Cuda Convolution.
+
+        Parameters
+        -----------
+        shape: tuple
+            Shape of the array.
+        kernel: array-like
+            Convolution kernel (1D, 2D or 3D).
+        axes: tuple, optional
+            Axes along which the convolution is performed,
+            for batched convolutions.
+        mode: str, optional
+            Boundary handling mode. Available modes are:
+            - "reflect": cba|abcd|dcb
+            - "nearest": aaa|abcd|ddd
+            - "wrap": bcd|abcd|abc
+            - "constant": 000|abcd|000
+
+            Default is "reflect".
+        extra_options: dict, optional
+            Advanced options (dict). Current options are:
+            - "allocate_input_array": True
+            - "allocate_output_array": True
+            - "allocate_tmp_array": True
+            - "sourcemodule_kwargs": {}
+            - "batch_along_flat_dims": True
+        """
+        self.cuda = CudaProcessing(**(cuda_options or {}))
+        self._configure_extra_options(extra_options)
+        self._determine_use_case(shape, kernel, axes)
+        self._allocate_memory(mode)
+        self._init_kernels()
+
+    def _configure_extra_options(self, extra_options):
+        self.extra_options = {
+            "allocate_input_array": True,
+            "allocate_output_array": True,
+            "allocate_tmp_array": True,
+            "sourcemodule_kwargs": {},
+            "batch_along_flat_dims": True,
+        }
+        extra_opts = extra_options or {}
+        self.extra_options.update(extra_opts)
+        self.sourcemodule_kwargs = self.extra_options["sourcemodule_kwargs"]
+
+    def _get_dimensions(self, shape, kernel):
+        self.shape = shape
+        self.data_ndim = self._check_dimensions(shape=shape, name="Data")
+        self.kernel_ndim = self._check_dimensions(arr=kernel, name="Kernel")
+        Nx = shape[-1]
+        if self.data_ndim >= 2:
+            Ny = shape[-2]
+        else:
+            Ny = 1
+        if self.data_ndim >= 3:
+            Nz = shape[-3]
+        else:
+            Nz = 1
+        self.Nx = np.int32(Nx)
+        self.Ny = np.int32(Ny)
+        self.Nz = np.int32(Nz)
+
+    def _determine_use_case(self, shape, kernel, axes):
+        """
+        Determine the convolution use case from the input/kernel shape, and axes.
+        """
+        self._get_dimensions(shape, kernel)
+        if self.kernel_ndim > self.data_ndim:
+            raise ValueError("Kernel dimensions cannot exceed data dimensions")
+        data_ndim = self.data_ndim
+        kernel_ndim = self.kernel_ndim
+        self.kernel = kernel.astype("f")
+
+        convol_infos = ConvolutionInfos()
+        k = (data_ndim, kernel_ndim)
+        if k not in convol_infos.use_cases:
+            raise ValueError(
+                "Cannot find a use case for data ndim = %d and kernel ndim = %d" % (data_ndim, kernel_ndim)
+            )
+        possible_use_cases = convol_infos.use_cases[k]
+
+        # If some dimensions are "flat", make a batched convolution along them
+        # Ex. data_dim = (1, Nx) -> batched 1D convolution
+        if self.extra_options["batch_along_flat_dims"] and (1 in self.shape):
+            axes = tuple([curr_dim for numels, curr_dim in zip(self.shape, range(len(self.shape))) if numels != 1])
+        #
+        self.use_case_name = None
+        for uc_name, uc_params in possible_use_cases.items():
+            if axes in convol_infos.allowed_axes[uc_name]:
+                self.use_case_name = uc_name
+                self.use_case_desc = uc_params["name"]
+                self.use_case_kernels = uc_params["kernels"].copy()
+        if self.use_case_name is None:
+            raise ValueError(
+                "Cannot find a use case for data ndim = %d, kernel ndim = %d and axes=%s"
+                % (data_ndim, kernel_ndim, str(axes))
+            )
+        # TODO implement this use case
+        if self.use_case_name == "batched_separable_2D_1D_3D":
+            raise NotImplementedError("The use case %s is not implemented" % self.use_case_name)
+        #
+        self.axes = axes
+        # Replace "axes=None" with an actual value (except for ND-ND)
+        allowed_axes = convol_infos.allowed_axes[self.use_case_name]
+        if len(allowed_axes) > 1:
+            # The default choice might impact perfs
+            self.axes = allowed_axes[0] or allowed_axes[1]
+        self.separable = self.use_case_name.startswith("separable")
+        self.batched = self.use_case_name.startswith("batched")
+
+    def _allocate_memory(self, mode):
+        self.mode = mode or "reflect"
+        # The current implementation does not support kernel size bigger than data size,
+        # except for mode="nearest"
+        for i, dim_size in enumerate(self.shape):
+            if min(self.kernel.shape) > dim_size and i in self.axes:
+                print(
+                    "Warning: kernel support is too large for data dimension %d (%d). Forcing convolution mode to 'nearest'"
+                    % (i, dim_size)
+                )
+                self.mode = "nearest"
+        #
+        option_array_names = {
+            "allocate_input_array": "data_in",
+            "allocate_output_array": "data_out",
+            "allocate_tmp_array": "data_tmp",
+        }
+        # Nonseparable transforms do not need tmp array
+        if not (self.separable):
+            self.extra_options["allocate_tmp_array"] = False
+        # Allocate arrays
+        for option_name, array_name in option_array_names.items():
+            if self.extra_options[option_name]:
+                value = self.cuda.allocate_array("value", self.shape, np.float32)
+            else:
+                value = None
+            setattr(self, array_name, value)
+
+        if isinstance(self.kernel, np.ndarray):
+            self.d_kernel = self.cuda.to_device("d_kernel", self.kernel)
+        else:
+            if not (isinstance(self.kernel, self.cuda.array_class)):
+                raise ValueError("kernel must be either numpy array or pycuda array")
+            self.d_kernel = self.kernel
+        self._old_input_ref = None
+        self._old_output_ref = None
+        self._c_modes_mapping = {
+            "periodic": 2,
+            "wrap": 2,
+            "nearest": 1,
+            "replicate": 1,
+            "reflect": 0,
+            "constant": 3,
+        }
+        mp = self._c_modes_mapping
+        if self.mode.lower() not in mp:
+            raise ValueError(
+                """
+                Mode %s is not available. Available modes are:
+                %s
+                """
+                % (self.mode, str(mp.keys()))
+            )
+        if self.mode.lower() == "constant":
+            raise NotImplementedError("mode='constant' is not implemented yet")
+        self._c_conv_mode = mp[self.mode]
+
+    def _init_kernels(self):
+        if self.kernel_ndim > 1:
+            if np.abs(np.diff(self.kernel.shape)).max() > 0:
+                raise NotImplementedError("Non-separable convolution with non-square kernels is not implemented yet")
+        # Compile source module
+        compile_options = [str("-DUSED_CONV_MODE=%d" % self._c_conv_mode)]
+        fname = get_cuda_srcfile("convolution.cu")
+        nabu_cuda_dir = dirname(fname)
+        include_dirs = [nabu_cuda_dir]
+        self.sourcemodule_kwargs["options"] = compile_options
+        self.sourcemodule_kwargs["include_dirs"] = include_dirs
+        with open(fname) as fid:
+            cuda_src = fid.read()
+        self._module = SourceModule(cuda_src, **self.sourcemodule_kwargs)
+        # Blocks, grid
+        self._block_size = {1: (32, 1, 1), 2: (32, 32, 1), 3: (16, 8, 8)}[self.data_ndim]  # TODO tune
+        self._n_blocks = tuple([int(updiv(a, b)) for a, b in zip(self.shape[::-1], self._block_size)])
+        # Prepare cuda kernel calls
+        self._cudakernel_signature = {
+            1: "PPPiiii",
+            2: "PPPiiiii",
+            3: "PPPiiiiii",
+        }[self.kernel_ndim]
+        self.cuda_kernels = {}
+        for axis, kern_name in enumerate(self.use_case_kernels):
+            self.cuda_kernels[axis] = self._module.get_function(kern_name)
+            self.cuda_kernels[axis].prepare(self._cudakernel_signature)
+
+        # Cuda kernel arguments
+        kernel_args = [
+            self._n_blocks,
+            self._block_size,
+            None,
+            None,
+            self.d_kernel.gpudata,
+            np.int32(self.kernel.shape[0]),
+            self.Nx,
+            self.Ny,
+            self.Nz,
+        ]
+        if self.kernel_ndim == 2:
+            kernel_args.insert(5, np.int32(self.kernel.shape[1]))
+        if self.kernel_ndim == 3:
+            kernel_args.insert(5, np.int32(self.kernel.shape[2]))
+            kernel_args.insert(6, np.int32(self.kernel.shape[1]))
+        self.kernel_args = tuple(kernel_args)
+        # If self.data_tmp is allocated, separable transforms can be performed
+        # by a series of batched transforms, without any copy, by swapping refs.
+        self.swap_pattern = None
+        if self.separable:
+            if self.data_tmp is not None:
+                self.swap_pattern = {
+                    2: [("data_in", "data_tmp"), ("data_tmp", "data_out")],
+                    3: [
+                        ("data_in", "data_out"),
+                        ("data_out", "data_tmp"),
+                        ("data_tmp", "data_out"),
+                    ],
+                }
+            else:
+                raise NotImplementedError("For now, data_tmp has to be allocated")
+
+    def _get_swapped_arrays(self, i):
+        """
+        Get the input and output arrays to use when using a "swap pattern".
+        Swapping refs enables to avoid copies between temp. array and output.
+        For example, a separable 2D->1D convolution on 2D data reads:
+            data_tmp = convol(data_input, kernel, axis=1) # step i=0
+            data_out = convol(data_tmp, kernel, axis=0) # step i=1
+
+        :param i: current step number of the separable convolution
+        """
+        n_batchs = len(self.axes)
+        in_ref, out_ref = self.swap_pattern[n_batchs][i]
+        d_in = getattr(self, in_ref)
+        d_out = getattr(self, out_ref)
+        return d_in, d_out
+
+    def _configure_kernel_args(self, cuda_kernel_args, input_ref, output_ref):
+        # TODO more elegant
+        if isinstance(input_ref, self.cuda.array_class):
+            input_ref = input_ref.gpudata
+        if isinstance(output_ref, self.cuda.array_class):
+            output_ref = output_ref.gpudata
+        if input_ref is not None or output_ref is not None:
+            cuda_kernel_args = list(cuda_kernel_args)
+            if input_ref is not None:
+                cuda_kernel_args[2] = input_ref
+            if output_ref is not None:
+                cuda_kernel_args[3] = output_ref
+            cuda_kernel_args = tuple(cuda_kernel_args)
+        return cuda_kernel_args
+
+    @staticmethod
+    def _check_dimensions(arr=None, shape=None, name="", dim_min=1, dim_max=3):
+        if shape is not None:
+            ndim = len(shape)
+        elif arr is not None:
+            ndim = arr.ndim
+        else:
+            raise ValueError("Please provide either arr= or shape=")
+        if ndim < dim_min or ndim > dim_max:
+            raise ValueError("%s dimensions should be between %d and %d" % (name, dim_min, dim_max))
+        return ndim
+
+    def _check_array(self, arr):
+        if not (isinstance(arr, self.cuda.array_class) or isinstance(arr, np.ndarray)):
+            raise TypeError("Expected either pycuda.gpuarray or numpy.ndarray")
+        if arr.dtype != np.float32:
+            raise TypeError("Data must be float32")
+        if arr.shape != self.shape:
+            raise ValueError("Expected data shape = %s" % str(self.shape))
+
+    def _set_arrays(self, array, output=None):
+        # Either copy H->D or update references.
+        if isinstance(array, np.ndarray):
+            self.data_in[:] = array[:]
+        else:
+            self._old_input_ref = self.data_in
+            self.data_in = array
+        data_in_ref = self.data_in
+        if output is not None:
+            if not (isinstance(output, np.ndarray)):
+                self._old_output_ref = self.data_out
+                self.data_out = output
+        # Update Cuda kernel arguments with new array references
+        self.kernel_args = self._configure_kernel_args(self.kernel_args, data_in_ref, self.data_out)
+
+    def _separable_convolution(self):
+        assert len(self.axes) == len(self.use_case_kernels)
+        # Separable: one kernel call per data dimension
+        for i, axis in enumerate(self.axes):
+            in_ref, out_ref = self._get_swapped_arrays(i)
+            self._batched_convolution(axis, input_ref=in_ref, output_ref=out_ref)
+
+    def _batched_convolution(self, axis, input_ref=None, output_ref=None):
+        # Batched: one kernel call in total
+        cuda_kernel = self.cuda_kernels[axis]
+        cuda_kernel_args = self._configure_kernel_args(self.kernel_args, input_ref, output_ref)
+        ev = cuda_kernel.prepared_call(*cuda_kernel_args)
+
+    def _nd_convolution(self):
+        assert len(self.use_case_kernels) == 1
+        cuda_kernel = self._module.get_function(self.use_case_kernels[0])
+        ev = cuda_kernel.prepared_call(*self.kernel_args)
+
+    def _recover_arrays_references(self):
+        if self._old_input_ref is not None:
+            self.data_in = self._old_input_ref
+            self._old_input_ref = None
+        if self._old_output_ref is not None:
+            self.data_out = self._old_output_ref
+            self._old_output_ref = None
+        self.kernel_args = self._configure_kernel_args(self.kernel_args, self.data_in, self.data_out)
+
+    def _get_output(self, output):
+        if output is None:
+            res = self.data_out.get()
+        else:
+            res = output
+            if isinstance(output, np.ndarray):
+                output[:] = self.data_out[:]
+        self._recover_arrays_references()
+        return res
+
+    def convolve(self, array, output=None):
+        """
+        Convolve an array with the class kernel.
+
+        :param array: Input array. Can be numpy.ndarray or pycuda.gpuarray.GPUArray.
+        :param output: Output array. Can be numpy.ndarray or pycuda.gpuarray.GPUArray.
+        """
+        self._check_array(array)
+        self._set_arrays(array, output=output)
+        if self.axes is not None:
+            if self.separable:
+                self._separable_convolution()
+            elif self.batched:
+                assert len(self.axes) == 1
+                self._batched_convolution(self.axes[0])
+            # else: ND-ND convol
+        else:
+            # ND-ND convol
+            self._nd_convolution()
+
+        res = self._get_output(output)
+        return res
+
+    __call__ = convolve
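
A hypothetical usage sketch of the Convolution class added above (assuming a CUDA device and pycuda are available; the Gaussian kernel and array sizes are illustrative only, not taken from nabu):

import numpy as np
from nabu.processing.convolution_cuda import Convolution

# Separable 2D convolution of a 512x512 image with a 1D Gaussian kernel
image = np.random.rand(512, 512).astype("f")
sigma = 2.0
x = np.arange(-8, 9)
gaussian_1d = np.exp(-(x**2) / (2 * sigma**2)).astype("f")
gaussian_1d /= gaussian_1d.sum()

conv = Convolution(image.shape, gaussian_1d, mode="reflect")
blurred = conv(image)  # equivalent to conv.convolve(image); returns a numpy array when no output is given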