PyPI - nabu - Versions diffs - 2023.2.1__py3-none-any.whl → 2024.1.0rc3__py3-none-any.whl - Mend

nabu 2023.2.1__py3-none-any.whl → 2024.1.0rc3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (183)

doc/conf.py +1 -1
doc/doc_config.py +32 -0
nabu/__init__.py +2 -1
nabu/app/bootstrap_stitching.py +1 -1
nabu/app/cli_configs.py +122 -2
nabu/app/composite_cor.py +27 -2
nabu/app/correct_rot.py +70 -0
nabu/app/create_distortion_map_from_poly.py +42 -18
nabu/app/diag_to_pix.py +358 -0
nabu/app/diag_to_rot.py +449 -0
nabu/app/generate_header.py +4 -3
nabu/app/histogram.py +2 -2
nabu/app/multicor.py +6 -1
nabu/app/parse_reconstruction_log.py +151 -0
nabu/app/prepare_weights_double.py +83 -22
nabu/app/reconstruct.py +5 -1
nabu/app/reconstruct_helical.py +7 -0
nabu/app/reduce_dark_flat.py +6 -3
nabu/app/rotate.py +4 -4
nabu/app/stitching.py +16 -2
nabu/app/tests/test_reduce_dark_flat.py +18 -2
nabu/app/validator.py +4 -4
nabu/cuda/convolution.py +8 -376
nabu/cuda/fft.py +4 -0
nabu/cuda/kernel.py +4 -4
nabu/cuda/medfilt.py +5 -158
nabu/cuda/padding.py +5 -71
nabu/cuda/processing.py +23 -2
nabu/cuda/src/ElementOp.cu +78 -0
nabu/cuda/src/backproj.cu +28 -2
nabu/cuda/src/fourier_wavelets.cu +2 -2
nabu/cuda/src/normalization.cu +23 -0
nabu/cuda/src/padding.cu +2 -2
nabu/cuda/src/transpose.cu +16 -0
nabu/cuda/utils.py +39 -0
nabu/estimation/alignment.py +10 -1
nabu/estimation/cor.py +808 -38
nabu/estimation/cor_sino.py +7 -9
nabu/estimation/tests/test_cor.py +85 -3
nabu/io/reader.py +26 -18
nabu/io/tests/test_cast_volume.py +3 -3
nabu/io/tests/test_detector_distortion.py +3 -3
nabu/io/tiffwriter_zmm.py +2 -2
nabu/io/utils.py +14 -4
nabu/io/writer.py +5 -3
nabu/misc/fftshift.py +6 -0
nabu/misc/histogram.py +5 -285
nabu/misc/histogram_cuda.py +8 -104
nabu/misc/kernel_base.py +3 -121
nabu/misc/padding_base.py +5 -69
nabu/misc/processing_base.py +3 -107
nabu/misc/rotation.py +5 -62
nabu/misc/rotation_cuda.py +5 -65
nabu/misc/transpose.py +6 -0
nabu/misc/unsharp.py +3 -78
nabu/misc/unsharp_cuda.py +5 -52
nabu/misc/unsharp_opencl.py +8 -85
nabu/opencl/fft.py +6 -0
nabu/opencl/kernel.py +21 -6
nabu/opencl/padding.py +5 -72
nabu/opencl/processing.py +27 -5
nabu/opencl/src/backproj.cl +3 -3
nabu/opencl/src/fftshift.cl +65 -12
nabu/opencl/src/padding.cl +2 -2
nabu/opencl/src/roll.cl +96 -0
nabu/opencl/src/transpose.cl +16 -0
nabu/pipeline/config_validators.py +63 -3
nabu/pipeline/dataset_validator.py +2 -2
nabu/pipeline/estimators.py +193 -35
nabu/pipeline/fullfield/chunked.py +34 -17
nabu/pipeline/fullfield/chunked_cuda.py +7 -5
nabu/pipeline/fullfield/computations.py +48 -13
nabu/pipeline/fullfield/nabu_config.py +13 -13
nabu/pipeline/fullfield/processconfig.py +10 -5
nabu/pipeline/fullfield/reconstruction.py +1 -2
nabu/pipeline/helical/fbp.py +5 -0
nabu/pipeline/helical/filtering.py +12 -9
nabu/pipeline/helical/gridded_accumulator.py +179 -33
nabu/pipeline/helical/helical_chunked_regridded.py +262 -151
nabu/pipeline/helical/helical_chunked_regridded_cuda.py +4 -11
nabu/pipeline/helical/helical_reconstruction.py +56 -18
nabu/pipeline/helical/span_strategy.py +1 -1
nabu/pipeline/helical/tests/test_accumulator.py +4 -0
nabu/pipeline/params.py +23 -2
nabu/pipeline/processconfig.py +3 -8
nabu/pipeline/tests/test_chunk_reader.py +78 -0
nabu/pipeline/tests/test_estimators.py +120 -2
nabu/pipeline/utils.py +25 -0
nabu/pipeline/writer.py +2 -0
nabu/preproc/ccd_cuda.py +9 -7
nabu/preproc/ctf.py +21 -26
nabu/preproc/ctf_cuda.py +25 -25
nabu/preproc/double_flatfield.py +14 -2
nabu/preproc/double_flatfield_cuda.py +7 -11
nabu/preproc/flatfield_cuda.py +23 -27
nabu/preproc/phase.py +19 -24
nabu/preproc/phase_cuda.py +21 -21
nabu/preproc/shift_cuda.py +58 -28
nabu/preproc/tests/test_ctf.py +5 -5
nabu/preproc/tests/test_double_flatfield.py +2 -2
nabu/preproc/tests/test_vshift.py +13 -2
nabu/processing/__init__.py +0 -0
nabu/processing/convolution_cuda.py +375 -0
nabu/processing/fft_base.py +163 -0
nabu/processing/fft_cuda.py +256 -0
nabu/processing/fft_opencl.py +54 -0
nabu/processing/fftshift.py +134 -0
nabu/processing/histogram.py +286 -0
nabu/processing/histogram_cuda.py +103 -0
nabu/processing/kernel_base.py +126 -0
nabu/processing/medfilt_cuda.py +159 -0
nabu/processing/muladd.py +29 -0
nabu/processing/muladd_cuda.py +68 -0
nabu/processing/padding_base.py +71 -0
nabu/processing/padding_cuda.py +75 -0
nabu/processing/padding_opencl.py +77 -0
nabu/processing/processing_base.py +123 -0
nabu/processing/roll_opencl.py +64 -0
nabu/processing/rotation.py +63 -0
nabu/processing/rotation_cuda.py +66 -0
nabu/processing/tests/__init__.py +0 -0
nabu/processing/tests/test_fft.py +268 -0
nabu/processing/tests/test_fftshift.py +71 -0
nabu/{misc → processing}/tests/test_histogram.py +2 -4
nabu/{cuda → processing}/tests/test_medfilt.py +1 -1
nabu/processing/tests/test_muladd.py +54 -0
nabu/{cuda → processing}/tests/test_padding.py +119 -75
nabu/processing/tests/test_roll.py +63 -0
nabu/{misc → processing}/tests/test_rotation.py +3 -2
nabu/processing/tests/test_transpose.py +72 -0
nabu/{misc → processing}/tests/test_unsharp.py +41 -8
nabu/processing/transpose.py +126 -0
nabu/processing/unsharp.py +79 -0
nabu/processing/unsharp_cuda.py +53 -0
nabu/processing/unsharp_opencl.py +75 -0
nabu/reconstruction/fbp.py +34 -10
nabu/reconstruction/fbp_base.py +35 -16
nabu/reconstruction/fbp_opencl.py +7 -12
nabu/reconstruction/filtering.py +2 -2
nabu/reconstruction/filtering_cuda.py +13 -14
nabu/reconstruction/filtering_opencl.py +3 -4
nabu/reconstruction/projection.py +2 -0
nabu/reconstruction/rings.py +158 -1
nabu/reconstruction/rings_cuda.py +218 -58
nabu/reconstruction/sinogram_cuda.py +16 -12
nabu/reconstruction/tests/test_deringer.py +116 -14
nabu/reconstruction/tests/test_fbp.py +22 -31
nabu/reconstruction/tests/test_filtering.py +11 -2
nabu/resources/dataset_analyzer.py +89 -26
nabu/resources/nxflatfield.py +2 -2
nabu/resources/tests/test_nxflatfield.py +1 -1
nabu/resources/utils.py +9 -2
nabu/stitching/alignment.py +184 -0
nabu/stitching/config.py +241 -39
nabu/stitching/definitions.py +6 -0
nabu/stitching/frame_composition.py +4 -2
nabu/stitching/overlap.py +99 -3
nabu/stitching/sample_normalization.py +60 -0
nabu/stitching/slurm_utils.py +10 -10
nabu/stitching/tests/test_alignment.py +99 -0
nabu/stitching/tests/test_config.py +16 -1
nabu/stitching/tests/test_overlap.py +68 -2
nabu/stitching/tests/test_sample_normalization.py +49 -0
nabu/stitching/tests/test_slurm_utils.py +5 -5
nabu/stitching/tests/test_utils.py +3 -33
nabu/stitching/tests/test_z_stitching.py +391 -22
nabu/stitching/utils.py +144 -202
nabu/stitching/z_stitching.py +309 -126
nabu/testutils.py +18 -0
nabu/thirdparty/tomocupy_remove_stripe.py +586 -0
nabu/utils.py +32 -6
{nabu-2023.2.1.dist-info → nabu-2024.1.0rc3.dist-info}/LICENSE +1 -1
{nabu-2023.2.1.dist-info → nabu-2024.1.0rc3.dist-info}/METADATA +5 -5
nabu-2024.1.0rc3.dist-info/RECORD +296 -0
{nabu-2023.2.1.dist-info → nabu-2024.1.0rc3.dist-info}/WHEEL +1 -1
{nabu-2023.2.1.dist-info → nabu-2024.1.0rc3.dist-info}/entry_points.txt +5 -1
nabu/conftest.py +0 -14
nabu/opencl/fftshift.py +0 -92
nabu/opencl/tests/test_fftshift.py +0 -55
nabu/opencl/tests/test_padding.py +0 -84
nabu-2023.2.1.dist-info/RECORD +0 -252
nabu/cuda/src/{fftshift.cu → dfi_fftshift.cu} +0 -0
{nabu-2023.2.1.dist-info → nabu-2024.1.0rc3.dist-info}/top_level.txt +0 -0

nabu/preproc/shift_cuda.py CHANGED Viewed

@@ -1,24 +1,25 @@
 import numpy as np
-from math import floor
-from .shift import VerticalShift
 from ..cuda.utils import __has_pycuda__
-if __has_pycuda__:
-    import pycuda.gpuarray as garray
+from ..cuda.processing import CudaProcessing
+from ..processing.muladd_cuda import CudaMulAdd
+from .shift import VerticalShift
 class CudaVerticalShift(VerticalShift):
-    def __init__(self, radios_shape, shifts):
+    def __init__(self, radios_shape, shifts, **cuda_options):
         """
         Vertical Shifter, Cuda backend.
         """
         super().__init__(radios_shape, shifts)
+        self.cuda_processing = CudaProcessing(**(cuda_options or {}))
         self._init_cuda_arrays()
     def _init_cuda_arrays(self):
         interp_infos_arr = np.zeros((len(self.interp_infos), 2), "f")
-        self._d_interp_infos = garray.to_gpu(interp_infos_arr)
-        self._d_radio_tmp = garray.zeros(self.radios_shape[1:], "f")
+        self._d_interp_infos = self.cuda_processing.to_device("_d_interp_infos", interp_infos_arr)
+        self._d_radio_new = self.cuda_processing.allocate_array("_d_radio_new", self.radios_shape[1:], "f")
+        self._d_radio = self.cuda_processing.allocate_array("_d_radio", self.radios_shape[1:], "f")
+        self.muladd_kernel = CudaMulAdd(ctx=self.cuda_processing.ctx)
     def apply_vertical_shifts(self, radios, iangles, output=None):
         """
@@ -35,38 +36,67 @@ class CudaVerticalShift(VerticalShift):
             Must be of the same shape of `radios`.
         """
         self._check(radios, iangles)
-        n_z = self.radios_shape[1]
+        n_a, n_z, n_x = radios.shape
+        assert n_z == self.radios_shape[1]
+        x_slice = slice(0, n_x)  # slice(None, None)
+        def nonempty_subregion(region):
+            if region is None:
+                return True
+            z_slice = region[0]
+            return z_slice.stop - z_slice.start > 0
+        d_radio_new = self._d_radio_new
+        d_radio = self._d_radio
         for ia in iangles:
-            radio = radios[ia]
-            self._d_radio_tmp.fill(0)
+            d_radio_new.fill(0)
+            d_radio[:] = radios[ia, :, :]  # mul-add kernel won't work with pycuda view
             S0, f = self.interp_infos[ia]
-            s0 = S0
+            f = np.float32(f)
+            s0 = S0
             if s0 > 0:
-                self._d_radio_tmp[:-s0] = radio[s0:]
-                self._d_radio_tmp[:-s0] *= 1 - f
+                # newradio[:-s0] = radio[s0:] * (1 - f)
+                dst_region = (slice(0, n_z - s0), x_slice)
+                other_region = (slice(s0, n_z), x_slice)
             elif s0 == 0:
-                self._d_radio_tmp[:] = radio[s0:]
-                self._d_radio_tmp[:] *= 1 - f
+                # newradio[:] = radio[s0:] * (1 - f)
+                dst_region = None
+                other_region = (slice(s0, n_z), x_slice)
             else:
-                self._d_radio_tmp[-s0:] = radio[:s0]
-                self._d_radio_tmp[-s0:] *= 1 - f
+                # newradio[-s0:] = radio[:s0] * (1 - f)
+                dst_region = (slice(-s0, n_z), x_slice)
+                other_region = (slice(0, n_z + s0), x_slice)
-            s0 = S0 + 1
-            f = np.float32(f)
+            if all([nonempty_subregion(reg) for reg in [dst_region, other_region]]):
+                self.muladd_kernel(
+                    d_radio_new,
+                    d_radio,
+                    1,
+                    1 - f,
+                    dst_region=dst_region,
+                    other_region=other_region,
+                )
-            #  "radios[] * f"  is out of place but 2D
+            s0 = S0 + 1
             if s0 > 0:
-                if s0 < n_z:
-                    self._d_radio_tmp[:-s0] += radio[s0:] * f
+                # newradio[:-s0] += radio[s0:] * f
+                dst_region = (slice(0, n_z - s0), x_slice)
+                other_region = (slice(s0, n_z), x_slice)
             elif s0 == 0:
-                self._d_radio_tmp[:] += radio[s0:] * f
+                # newradio[:] += radio[s0:] * f
+                dst_region = None
+                other_region = (slice(s0, n_z), x_slice)
             else:
-                self._d_radio_tmp[-s0:] += radio[:s0] * f
+                # newradio[-s0:] += radio[:s0] * f
+                dst_region = (slice(-s0, n_z), x_slice)
+                other_region = (slice(0, n_z + s0), x_slice)
+            if all([nonempty_subregion(reg) for reg in [dst_region, other_region]]):
+                self.muladd_kernel(d_radio_new, d_radio, 1, f, dst_region=dst_region, other_region=other_region)
             if output is None:
-                radios[ia, :, :] = self._d_radio_tmp[:]
+                radios[ia, :, :] = d_radio_new[:, :]
             else:
-                output[ia, :, :] = self._d_radio_tmp[:]
+                output[ia, :, :] = d_radio_new[:, :]

nabu/preproc/tests/test_ctf.py CHANGED Viewed

@@ -213,11 +213,11 @@ class TestCtf:
         phase_r2c = ctf_numpy.retrieve_phase(img)
         self.check_result(phase_r2c, self.ref_plain, "Something wrong with CtfFilter-R2C")
-        # Test FFTW
-        ctf_fftw = ctf.CtfFilter(*ctf_args, **ctf_kwargs, use_rfft=True, fftw_num_threads=-1)
-        if ctf_fftw.use_rfft:
-            phase_fftw = ctf_fftw.retrieve_phase(img)
-            self.check_result(phase_r2c, self.ref_plain, "Something wrong with CtfFilter-FFTW")
+        # Test multi-core FFT
+        ctf_fft = ctf.CtfFilter(*ctf_args, **ctf_kwargs, use_rfft=True, fft_num_threads=0)
+        if ctf_fft.use_rfft:
+            phase_fft = ctf_fft.retrieve_phase(img)
+            self.check_result(phase_r2c, self.ref_plain, "Something wrong with CtfFilter-FFT")
     @pytest.mark.skipif(not (__has_pycuda__ and __has_cufft__), reason="pycuda and scikit-cuda")
     def test_cuda_ctf(self):

nabu/preproc/tests/test_double_flatfield.py CHANGED Viewed

@@ -4,7 +4,7 @@ import tempfile
 import numpy as np
 import pytest
 from silx.io.url import DataUrl
-from tomoscan.esrf.mock import MockHDF5
+from tomoscan.esrf.mock import MockNXtomo
 from nabu.io.reader import HDF5Reader
 from nabu.preproc.double_flatfield import DoubleFlatField
 from nabu.cuda.utils import __has_pycuda__, get_cuda_context
@@ -20,7 +20,7 @@ def bootstrap(request):
     cls.tmpdir = tempfile.TemporaryDirectory()
     dname = cls.tmpdir.name
     cls.dname = dname
-    radios = MockHDF5(
+    radios = MockNXtomo(
         path.join(dname, "tmp"),
         10,
         n_ini_proj=10,

nabu/preproc/tests/test_vshift.py CHANGED Viewed

@@ -5,7 +5,8 @@ from nabu.preproc.shift import VerticalShift
 from nabu.cuda.utils import __has_pycuda__, get_cuda_context
 if __has_pycuda__:
-    from nabu.preproc.shift_cuda import CudaVerticalShift, garray
+    import pycuda.gpuarray as garray
+    from nabu.preproc.shift_cuda import CudaVerticalShift
 @pytest.fixture(scope="class")
@@ -51,12 +52,22 @@ class TestVerticalShift:
     @pytest.mark.skipif(not (__has_pycuda__), reason="Need cuda/pycuda for this test")
     def test_cuda_vshift(self):
         d_radios = garray.to_gpu(self.radios)
+        d_radios2 = d_radios.copy()
         d_out = garray.zeros_like(d_radios)
         Shifter = CudaVerticalShift(d_radios.shape, self.shifts)
         Shifter.apply_vertical_shifts(d_radios, self.indexes, output=d_out)
         assert abs(d_out.get() - self.golden).max() < self.tol
         Shifter.apply_vertical_shifts(d_radios, self.indexes)
         assert abs(d_radios.get() - self.golden).max() < self.tol
+        # Test with negative shifts
+        radios2 = self.radios.copy()
+        Shifter_neg = VerticalShift(self.radios.shape, -self.shifts)
+        Shifter_neg.apply_vertical_shifts(radios2, self.indexes)
+        Shifter_neg_cuda = CudaVerticalShift(d_radios.shape, -self.shifts)
+        Shifter_neg_cuda.apply_vertical_shifts(d_radios2, self.indexes)
+        err_max = np.max(np.abs(d_radios2.get() - radios2))
+        assert err_max < 1e-6, "Something wrong for negative translations: max error = %.2e" % err_max

nabu/processing/__init__.py ADDED Viewed

File without changes

nabu/processing/convolution_cuda.py ADDED Viewed

@@ -0,0 +1,375 @@
+from os.path import dirname
+import numpy as np
+from ..utils import updiv, get_cuda_srcfile
+from ..cuda.utils import __has_pycuda__
+from ..misc.utils import ConvolutionInfos
+from ..cuda.processing import CudaProcessing
+if __has_pycuda__:
+    from pycuda.compiler import SourceModule
+class Convolution:
+    """
+    A class for performing convolution on GPU with CUDA, but without using
+    textures (unlike for example in ``silx.opencl.convolution``)
+    """
+    def __init__(self, shape, kernel, axes=None, mode=None, extra_options=None, cuda_options=None):
+        """
+        Constructor of Cuda Convolution.
+        Parameters
+        -----------
+        shape: tuple
+            Shape of the array.
+        kernel: array-like
+            Convolution kernel (1D, 2D or 3D).
+        axes: tuple, optional
+            Axes along which the convolution is performed,
+            for batched convolutions.
+        mode: str, optional
+            Boundary handling mode. Available modes are:
+               - "reflect": cba|abcd|dcb
+               - "nearest": aaa|abcd|ddd
+               - "wrap": bcd|abcd|abc
+               - "constant": 000|abcd|000
+            Default is "reflect".
+        extra_options: dict, optional
+            Advanced options (dict). Current options are:
+               - "allocate_input_array": True
+               - "allocate_output_array": True
+               - "allocate_tmp_array": True
+               - "sourcemodule_kwargs": {}
+               - "batch_along_flat_dims": True
+        """
+        self.cuda = CudaProcessing(**(cuda_options or {}))
+        self._configure_extra_options(extra_options)
+        self._determine_use_case(shape, kernel, axes)
+        self._allocate_memory(mode)
+        self._init_kernels()
+    def _configure_extra_options(self, extra_options):
+        self.extra_options = {
+            "allocate_input_array": True,
+            "allocate_output_array": True,
+            "allocate_tmp_array": True,
+            "sourcemodule_kwargs": {},
+            "batch_along_flat_dims": True,
+        }
+        extra_opts = extra_options or {}
+        self.extra_options.update(extra_opts)
+        self.sourcemodule_kwargs = self.extra_options["sourcemodule_kwargs"]
+    def _get_dimensions(self, shape, kernel):
+        self.shape = shape
+        self.data_ndim = self._check_dimensions(shape=shape, name="Data")
+        self.kernel_ndim = self._check_dimensions(arr=kernel, name="Kernel")
+        Nx = shape[-1]
+        if self.data_ndim >= 2:
+            Ny = shape[-2]
+        else:
+            Ny = 1
+        if self.data_ndim >= 3:
+            Nz = shape[-3]
+        else:
+            Nz = 1
+        self.Nx = np.int32(Nx)
+        self.Ny = np.int32(Ny)
+        self.Nz = np.int32(Nz)
+    def _determine_use_case(self, shape, kernel, axes):
+        """
+        Determine the convolution use case from the input/kernel shape, and axes.
+        """
+        self._get_dimensions(shape, kernel)
+        if self.kernel_ndim > self.data_ndim:
+            raise ValueError("Kernel dimensions cannot exceed data dimensions")
+        data_ndim = self.data_ndim
+        kernel_ndim = self.kernel_ndim
+        self.kernel = kernel.astype("f")
+        convol_infos = ConvolutionInfos()
+        k = (data_ndim, kernel_ndim)
+        if k not in convol_infos.use_cases:
+            raise ValueError(
+                "Cannot find a use case for data ndim = %d and kernel ndim = %d" % (data_ndim, kernel_ndim)
+            )
+        possible_use_cases = convol_infos.use_cases[k]
+        # If some dimensions are "flat", make a batched convolution along them
+        # Ex. data_dim = (1, Nx) -> batched 1D convolution
+        if self.extra_options["batch_along_flat_dims"] and (1 in self.shape):
+            axes = tuple([curr_dim for numels, curr_dim in zip(self.shape, range(len(self.shape))) if numels != 1])
+        #
+        self.use_case_name = None
+        for uc_name, uc_params in possible_use_cases.items():
+            if axes in convol_infos.allowed_axes[uc_name]:
+                self.use_case_name = uc_name
+                self.use_case_desc = uc_params["name"]
+                self.use_case_kernels = uc_params["kernels"].copy()
+        if self.use_case_name is None:
+            raise ValueError(
+                "Cannot find a use case for data ndim = %d, kernel ndim = %d and axes=%s"
+                % (data_ndim, kernel_ndim, str(axes))
+            )
+        # TODO implement this use case
+        if self.use_case_name == "batched_separable_2D_1D_3D":
+            raise NotImplementedError("The use case %s is not implemented" % self.use_case_name)
+        #
+        self.axes = axes
+        # Replace "axes=None" with an actual value (except for ND-ND)
+        allowed_axes = convol_infos.allowed_axes[self.use_case_name]
+        if len(allowed_axes) > 1:
+            # The default choice might impact perfs
+            self.axes = allowed_axes[0] or allowed_axes[1]
+        self.separable = self.use_case_name.startswith("separable")
+        self.batched = self.use_case_name.startswith("batched")
+    def _allocate_memory(self, mode):
+        self.mode = mode or "reflect"
+        # The current implementation does not support kernel size bigger than data size,
+        # except for mode="nearest"
+        for i, dim_size in enumerate(self.shape):
+            if min(self.kernel.shape) > dim_size and i in self.axes:
+                print(
+                    "Warning: kernel support is too large for data dimension %d (%d). Forcing convolution mode to 'nearest'"
+                    % (i, dim_size)
+                )
+                self.mode = "nearest"
+        #
+        option_array_names = {
+            "allocate_input_array": "data_in",
+            "allocate_output_array": "data_out",
+            "allocate_tmp_array": "data_tmp",
+        }
+        # Nonseparable transforms do not need tmp array
+        if not (self.separable):
+            self.extra_options["allocate_tmp_array"] = False
+        # Allocate arrays
+        for option_name, array_name in option_array_names.items():
+            if self.extra_options[option_name]:
+                value = self.cuda.allocate_array("value", self.shape, np.float32)
+            else:
+                value = None
+            setattr(self, array_name, value)
+        if isinstance(self.kernel, np.ndarray):
+            self.d_kernel = self.cuda.to_device("d_kernel", self.kernel)
+        else:
+            if not (isinstance(self.kernel, self.cuda.array_class)):
+                raise ValueError("kernel must be either numpy array or pycuda array")
+            self.d_kernel = self.kernel
+        self._old_input_ref = None
+        self._old_output_ref = None
+        self._c_modes_mapping = {
+            "periodic": 2,
+            "wrap": 2,
+            "nearest": 1,
+            "replicate": 1,
+            "reflect": 0,
+            "constant": 3,
+        }
+        mp = self._c_modes_mapping
+        if self.mode.lower() not in mp:
+            raise ValueError(
+                """
+                Mode %s is not available. Available modes are:
+                %s
+                """
+                % (self.mode, str(mp.keys()))
+            )
+        if self.mode.lower() == "constant":
+            raise NotImplementedError("mode='constant' is not implemented yet")
+        self._c_conv_mode = mp[self.mode]
+    def _init_kernels(self):
+        if self.kernel_ndim > 1:
+            if np.abs(np.diff(self.kernel.shape)).max() > 0:
+                raise NotImplementedError("Non-separable convolution with non-square kernels is not implemented yet")
+        # Compile source module
+        compile_options = [str("-DUSED_CONV_MODE=%d" % self._c_conv_mode)]
+        fname = get_cuda_srcfile("convolution.cu")
+        nabu_cuda_dir = dirname(fname)
+        include_dirs = [nabu_cuda_dir]
+        self.sourcemodule_kwargs["options"] = compile_options
+        self.sourcemodule_kwargs["include_dirs"] = include_dirs
+        with open(fname) as fid:
+            cuda_src = fid.read()
+        self._module = SourceModule(cuda_src, **self.sourcemodule_kwargs)
+        # Blocks, grid
+        self._block_size = {1: (32, 1, 1), 2: (32, 32, 1), 3: (16, 8, 8)}[self.data_ndim]  # TODO tune
+        self._n_blocks = tuple([int(updiv(a, b)) for a, b in zip(self.shape[::-1], self._block_size)])
+        # Prepare cuda kernel calls
+        self._cudakernel_signature = {
+            1: "PPPiiii",
+            2: "PPPiiiii",
+            3: "PPPiiiiii",
+        }[self.kernel_ndim]
+        self.cuda_kernels = {}
+        for axis, kern_name in enumerate(self.use_case_kernels):
+            self.cuda_kernels[axis] = self._module.get_function(kern_name)
+            self.cuda_kernels[axis].prepare(self._cudakernel_signature)
+        # Cuda kernel arguments
+        kernel_args = [
+            self._n_blocks,
+            self._block_size,
+            None,
+            None,
+            self.d_kernel.gpudata,
+            np.int32(self.kernel.shape[0]),
+            self.Nx,
+            self.Ny,
+            self.Nz,
+        ]
+        if self.kernel_ndim == 2:
+            kernel_args.insert(5, np.int32(self.kernel.shape[1]))
+        if self.kernel_ndim == 3:
+            kernel_args.insert(5, np.int32(self.kernel.shape[2]))
+            kernel_args.insert(6, np.int32(self.kernel.shape[1]))
+        self.kernel_args = tuple(kernel_args)
+        # If self.data_tmp is allocated, separable transforms can be performed
+        # by a series of batched transforms, without any copy, by swapping refs.
+        self.swap_pattern = None
+        if self.separable:
+            if self.data_tmp is not None:
+                self.swap_pattern = {
+                    2: [("data_in", "data_tmp"), ("data_tmp", "data_out")],
+                    3: [
+                        ("data_in", "data_out"),
+                        ("data_out", "data_tmp"),
+                        ("data_tmp", "data_out"),
+                    ],
+                }
+            else:
+                raise NotImplementedError("For now, data_tmp has to be allocated")
+    def _get_swapped_arrays(self, i):
+        """
+        Get the input and output arrays to use when using a "swap pattern".
+        Swapping refs enables to avoid copies between temp. array and output.
+        For example, a separable 2D->1D convolution on 2D data reads:
+          data_tmp = convol(data_input, kernel, axis=1) # step i=0
+          data_out = convol(data_tmp, kernel, axis=0) # step i=1
+        :param i: current step number of the separable convolution
+        """
+        n_batchs = len(self.axes)
+        in_ref, out_ref = self.swap_pattern[n_batchs][i]
+        d_in = getattr(self, in_ref)
+        d_out = getattr(self, out_ref)
+        return d_in, d_out
+    def _configure_kernel_args(self, cuda_kernel_args, input_ref, output_ref):
+        # TODO more elegant
+        if isinstance(input_ref, self.cuda.array_class):
+            input_ref = input_ref.gpudata
+        if isinstance(output_ref, self.cuda.array_class):
+            output_ref = output_ref.gpudata
+        if input_ref is not None or output_ref is not None:
+            cuda_kernel_args = list(cuda_kernel_args)
+            if input_ref is not None:
+                cuda_kernel_args[2] = input_ref
+            if output_ref is not None:
+                cuda_kernel_args[3] = output_ref
+            cuda_kernel_args = tuple(cuda_kernel_args)
+        return cuda_kernel_args
+    @staticmethod
+    def _check_dimensions(arr=None, shape=None, name="", dim_min=1, dim_max=3):
+        if shape is not None:
+            ndim = len(shape)
+        elif arr is not None:
+            ndim = arr.ndim
+        else:
+            raise ValueError("Please provide either arr= or shape=")
+        if ndim < dim_min or ndim > dim_max:
+            raise ValueError("%s dimensions should be between %d and %d" % (name, dim_min, dim_max))
+        return ndim
+    def _check_array(self, arr):
+        if not (isinstance(arr, self.cuda.array_class) or isinstance(arr, np.ndarray)):
+            raise TypeError("Expected either pycuda.gpuarray or numpy.ndarray")
+        if arr.dtype != np.float32:
+            raise TypeError("Data must be float32")
+        if arr.shape != self.shape:
+            raise ValueError("Expected data shape = %s" % str(self.shape))
+    def _set_arrays(self, array, output=None):
+        # Either copy H->D or update references.
+        if isinstance(array, np.ndarray):
+            self.data_in[:] = array[:]
+        else:
+            self._old_input_ref = self.data_in
+            self.data_in = array
+        data_in_ref = self.data_in
+        if output is not None:
+            if not (isinstance(output, np.ndarray)):
+                self._old_output_ref = self.data_out
+                self.data_out = output
+        # Update Cuda kernel arguments with new array references
+        self.kernel_args = self._configure_kernel_args(self.kernel_args, data_in_ref, self.data_out)
+    def _separable_convolution(self):
+        assert len(self.axes) == len(self.use_case_kernels)
+        # Separable: one kernel call per data dimension
+        for i, axis in enumerate(self.axes):
+            in_ref, out_ref = self._get_swapped_arrays(i)
+            self._batched_convolution(axis, input_ref=in_ref, output_ref=out_ref)
+    def _batched_convolution(self, axis, input_ref=None, output_ref=None):
+        # Batched: one kernel call in total
+        cuda_kernel = self.cuda_kernels[axis]
+        cuda_kernel_args = self._configure_kernel_args(self.kernel_args, input_ref, output_ref)
+        ev = cuda_kernel.prepared_call(*cuda_kernel_args)
+    def _nd_convolution(self):
+        assert len(self.use_case_kernels) == 1
+        cuda_kernel = self._module.get_function(self.use_case_kernels[0])
+        ev = cuda_kernel.prepared_call(*self.kernel_args)
+    def _recover_arrays_references(self):
+        if self._old_input_ref is not None:
+            self.data_in = self._old_input_ref
+            self._old_input_ref = None
+        if self._old_output_ref is not None:
+            self.data_out = self._old_output_ref
+            self._old_output_ref = None
+        self.kernel_args = self._configure_kernel_args(self.kernel_args, self.data_in, self.data_out)
+    def _get_output(self, output):
+        if output is None:
+            res = self.data_out.get()
+        else:
+            res = output
+            if isinstance(output, np.ndarray):
+                output[:] = self.data_out[:]
+        self._recover_arrays_references()
+        return res
+    def convolve(self, array, output=None):
+        """
+        Convolve an array with the class kernel.
+        :param array: Input array. Can be numpy.ndarray or pycuda.gpuarray.GPUArray.
+        :param output: Output array. Can be numpy.ndarray or pycuda.gpuarray.GPUArray.
+        """
+        self._check_array(array)
+        self._set_arrays(array, output=output)
+        if self.axes is not None:
+            if self.separable:
+                self._separable_convolution()
+            elif self.batched:
+                assert len(self.axes) == 1
+                self._batched_convolution(self.axes[0])
+            # else: ND-ND convol
+        else:
+            # ND-ND convol
+            self._nd_convolution()
+        res = self._get_output(output)
+        return res
+    __call__ = convolve

nabu 2023.2.1__py3-none-any.whl → 2024.1.0rc3__py3-none-any.whl

nabu 2023.2.1__py3-none-any.whl → 2024.1.0rc3__py3-none-any.whl