PyPI - cuslines - Versions diffs - 2.0.0__py3-none-any.whl - Mend

cuslines 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

cuslines/__init__.py +13 -0
cuslines/cuda_c/boot.cu +1066 -0
cuslines/cuda_c/cudamacro.h +86 -0
cuslines/cuda_c/cuwsort.cuh +171 -0
cuslines/cuda_c/disc.h +1886 -0
cuslines/cuda_c/generate_streamlines_cuda.cu +695 -0
cuslines/cuda_c/globals.h +103 -0
cuslines/cuda_c/ptt.cu +559 -0
cuslines/cuda_c/ptt.cuh +47 -0
cuslines/cuda_c/tracking_helpers.cu +290 -0
cuslines/cuda_c/utils.cu +138 -0
cuslines/cuda_python/__init__.py +13 -0
cuslines/cuda_python/_globals.py +10 -0
cuslines/cuda_python/cu_direction_getters.py +472 -0
cuslines/cuda_python/cu_propagate_seeds.py +259 -0
cuslines/cuda_python/cu_tractography.py +315 -0
cuslines/cuda_python/cutils.py +64 -0
cuslines-2.0.0.dist-info/METADATA +90 -0
cuslines-2.0.0.dist-info/RECORD +22 -0
cuslines-2.0.0.dist-info/WHEEL +5 -0
cuslines-2.0.0.dist-info/licenses/LICENSE +26 -0
cuslines-2.0.0.dist-info/top_level.txt +1 -0

cuslines/cuda_python/cu_propagate_seeds.py ADDED Viewed

@@ -0,0 +1,259 @@
+import numpy as np
+import gc
+from cuda.bindings import runtime
+from cuda.bindings.runtime import cudaMemcpyKind
+from nibabel.streamlines.array_sequence import ArraySequence, MEGABYTE
+import logging
+from cuslines.cuda_python.cutils import (
+    REAL_SIZE,
+    REAL_DTYPE,
+    REAL3_DTYPE,
+    MAX_SLINE_LEN,
+    EXCESS_ALLOC_FACT,
+    THR_X_SL,
+    THR_X_BL,
+    DEV_PTR,
+    div_up,
+    checkCudaErrors,
+)
+logger = logging.getLogger("GPUStreamlines")
+class SeedBatchPropagator:
+    def __init__(self, gpu_tracker):
+        self.gpu_tracker = gpu_tracker
+        self.ngpus = gpu_tracker.ngpus
+        self.nSlines_old = np.zeros(self.ngpus, dtype=np.int32)
+        self.nSlines = np.zeros(self.ngpus, dtype=np.int32)
+        self.slines = np.zeros(self.ngpus, dtype=np.ndarray)
+        self.sline_lens = np.zeros(self.ngpus, dtype=np.ndarray)
+        self.seeds_d = np.empty(self.ngpus, dtype=DEV_PTR)
+        self.slineSeed_d = np.empty(self.ngpus, dtype=DEV_PTR)
+        self.slinesOffs_d = np.empty(self.ngpus, dtype=DEV_PTR)
+        self.shDirTemp0_d = np.empty(self.ngpus, dtype=DEV_PTR)
+        self.slineLen_d = np.empty(self.ngpus, dtype=DEV_PTR)
+        self.sline_d = np.empty(self.ngpus, dtype=DEV_PTR)
+    def _switch_device(self, n):
+        checkCudaErrors(runtime.cudaSetDevice(n))
+        nseeds_gpu = min(
+            self.nseeds_per_gpu, max(0, self.nseeds - n * self.nseeds_per_gpu)
+        )
+        block = (THR_X_SL, THR_X_BL // THR_X_SL, 1)
+        grid = (div_up(nseeds_gpu, THR_X_BL // THR_X_SL), 1, 1)
+        return nseeds_gpu, block, grid
+    def _get_sl_buffer_size(self, n):
+        return REAL_SIZE * 2 * 3 * MAX_SLINE_LEN * self.nSlines[n].astype(np.int64)
+    def _allocate_seed_memory(self, seeds):
+        # Move seeds to GPU
+        for ii in range(self.ngpus):
+            nseeds_gpu, _, _ = self._switch_device(ii)
+            self.seeds_d[ii] = checkCudaErrors(
+                runtime.cudaMalloc(REAL_SIZE * 3 * nseeds_gpu)
+            )
+            seeds_host = np.ascontiguousarray(
+                seeds[ii * self.nseeds_per_gpu : ii * self.nseeds_per_gpu + nseeds_gpu],
+                dtype=REAL_DTYPE,
+            )
+            checkCudaErrors(
+                runtime.cudaMemcpy(
+                    self.seeds_d[ii],
+                    seeds_host.ctypes.data,
+                    REAL_SIZE * 3 * nseeds_gpu,
+                    cudaMemcpyKind.cudaMemcpyHostToDevice,
+                )
+            )
+        for ii in range(self.ngpus):
+            nseeds_gpu, block, grid = self._switch_device(ii)
+            # Streamline offsets
+            self.slinesOffs_d[ii] = checkCudaErrors(
+                runtime.cudaMalloc(np.int32().nbytes * (nseeds_gpu + 1))
+            )
+            # Initial directions from each seed
+            self.shDirTemp0_d[ii] = checkCudaErrors(
+                runtime.cudaMalloc(
+                    REAL3_DTYPE.itemsize
+                    * self.gpu_tracker.samplm_nr
+                    * grid[0]
+                    * block[1]
+                )
+            )
+    def _cumsum_offsets(
+        self,
+    ):  # TODO: performance: do this on device? not crucial for performance now
+        for ii in range(self.ngpus):
+            nseeds_gpu, _, _ = self._switch_device(ii)
+            if nseeds_gpu == 0:
+                self.nSlines[ii] = 0
+                continue
+            slinesOffs_h = np.empty(nseeds_gpu + 1, dtype=np.int32)
+            checkCudaErrors(
+                runtime.cudaMemcpy(
+                    slinesOffs_h.ctypes.data,
+                    self.slinesOffs_d[ii],
+                    slinesOffs_h.nbytes,
+                    cudaMemcpyKind.cudaMemcpyDeviceToHost,
+                )
+            )
+            __pval = slinesOffs_h[0]
+            slinesOffs_h[0] = 0
+            for jj in range(1, nseeds_gpu + 1):
+                __cval = slinesOffs_h[jj]
+                slinesOffs_h[jj] = slinesOffs_h[jj - 1] + __pval
+                __pval = __cval
+            self.nSlines[ii] = int(slinesOffs_h[nseeds_gpu])
+            checkCudaErrors(
+                runtime.cudaMemcpy(
+                    self.slinesOffs_d[ii],
+                    slinesOffs_h.ctypes.data,
+                    slinesOffs_h.nbytes,
+                    cudaMemcpyKind.cudaMemcpyHostToDevice,
+                )
+            )
+    def _allocate_tracking_memory(self):
+        for ii in range(self.ngpus):
+            self._switch_device(ii)
+            self.slineSeed_d[ii] = checkCudaErrors(
+                runtime.cudaMalloc(self.nSlines[ii] * np.int32().nbytes)
+            )
+            checkCudaErrors(
+                runtime.cudaMemset(
+                    self.slineSeed_d[ii], -1, self.nSlines[ii] * np.int32().nbytes
+                )
+            )
+            if self.nSlines[ii] > EXCESS_ALLOC_FACT * self.nSlines_old[ii]:
+                self.slines[ii] = 0
+                self.sline_lens[ii] = 0
+                gc.collect()
+            buffer_size = self._get_sl_buffer_size(ii)
+            logger.debug(f"Streamline buffer size: {buffer_size}")
+            if not self.slines[ii]:
+                self.slines[ii] = np.empty(
+                    (EXCESS_ALLOC_FACT * self.nSlines[ii], MAX_SLINE_LEN * 2, 3),
+                    dtype=REAL_DTYPE,
+                )
+            if not self.sline_lens[ii]:
+                self.sline_lens[ii] = np.empty(
+                    EXCESS_ALLOC_FACT * self.nSlines[ii], dtype=np.int32
+                )
+        for ii in range(self.ngpus):
+            self._switch_device(ii)
+            buffer_size = self._get_sl_buffer_size(ii)
+            self.slineLen_d[ii] = checkCudaErrors(
+                runtime.cudaMalloc(np.int32().nbytes * self.nSlines[ii])
+            )
+            self.sline_d[ii] = checkCudaErrors(runtime.cudaMalloc(buffer_size))
+    def _cleanup(self):
+        for ii in range(self.ngpus):
+            self._switch_device(ii)
+            checkCudaErrors(
+                runtime.cudaMemcpyAsync(
+                    self.slines[ii],
+                    self.sline_d[ii],
+                    self._get_sl_buffer_size(ii),
+                    cudaMemcpyKind.cudaMemcpyDeviceToHost,
+                    self.gpu_tracker.streams[ii],
+                )
+            )
+            checkCudaErrors(
+                runtime.cudaMemcpyAsync(
+                    self.sline_lens[ii],
+                    self.slineLen_d[ii],
+                    np.int32().nbytes * self.nSlines[ii],
+                    cudaMemcpyKind.cudaMemcpyDeviceToHost,
+                    self.gpu_tracker.streams[ii],
+                )
+            )
+        for ii in range(self.ngpus):
+            self._switch_device(ii)
+            checkCudaErrors(runtime.cudaStreamSynchronize(self.gpu_tracker.streams[ii]))
+            checkCudaErrors(runtime.cudaFree(self.seeds_d[ii]))
+            checkCudaErrors(runtime.cudaFree(self.slineSeed_d[ii]))
+            checkCudaErrors(runtime.cudaFree(self.slinesOffs_d[ii]))
+            checkCudaErrors(runtime.cudaFree(self.shDirTemp0_d[ii]))
+            checkCudaErrors(runtime.cudaFree(self.slineLen_d[ii]))
+            checkCudaErrors(runtime.cudaFree(self.sline_d[ii]))
+        self.nSlines_old = self.nSlines
+        self.gpu_tracker.rng_offset += self.nseeds
+    # TODO: performance: better queuing/batching of seeds,
+    # if more performance needed,
+    # given exponential nature of streamlines
+    # May be better to do in cuda code directly
+    def propagate(self, seeds):
+        self.nseeds = len(seeds)
+        self.nseeds_per_gpu = (
+            self.nseeds + self.gpu_tracker.ngpus - 1
+        ) // self.gpu_tracker.ngpus
+        self._allocate_seed_memory(seeds)
+        for ii in range(self.ngpus):
+            nseeds_gpu, block, grid = self._switch_device(ii)
+            if nseeds_gpu == 0:
+                continue
+            self.gpu_tracker.dg.getNumStreamlines(ii, nseeds_gpu, block, grid, self)
+        for ii in range(self.ngpus):
+            checkCudaErrors(runtime.cudaStreamSynchronize(self.gpu_tracker.streams[ii]))
+        self._cumsum_offsets()
+        self._allocate_tracking_memory()
+        for ii in range(self.ngpus):
+            nseeds_gpu, block, grid = self._switch_device(ii)
+            if nseeds_gpu == 0:
+                continue
+            self.gpu_tracker.dg.generateStreamlines(ii, nseeds_gpu, block, grid, self)
+        for ii in range(self.ngpus):
+            checkCudaErrors(runtime.cudaStreamSynchronize(self.gpu_tracker.streams[ii]))
+        self._cleanup()
+    def get_buffer_size(self):
+        buffer_size = 0
+        for ii in range(self.ngpus):
+            lens = self.sline_lens[ii]
+            for jj in range(self.nSlines[ii]):
+                buffer_size += lens[jj] * 3 * REAL_SIZE
+        return buffer_size
+    def as_generator(self):
+        def _yield_slines():
+            for ii in range(self.ngpus):
+                this_sls = self.slines[ii]
+                this_len = self.sline_lens[ii]
+                for jj in range(self.nSlines[ii]):
+                    npts = this_len[jj]
+                    yield np.asarray(this_sls[jj], dtype=REAL_DTYPE)[:npts]
+        return _yield_slines()
+    def as_array_sequence(self):
+        return ArraySequence(self.as_generator(), self.get_buffer_size() // MEGABYTE)

cuslines/cuda_python/cu_tractography.py ADDED Viewed

@@ -0,0 +1,315 @@
+from cuda.bindings import runtime
+from cuda.bindings.runtime import cudaMemcpyKind
+# TODO: consider cuda core over cuda bindings
+import numpy as np
+from tqdm import tqdm
+import logging
+from math import radians
+from cuslines.cuda_python.cutils import (
+    REAL_SIZE,
+    REAL_DTYPE,
+    checkCudaErrors,
+)
+from cuslines.cuda_python.cu_direction_getters import (
+    GPUDirectionGetter,
+    BootDirectionGetter,
+)
+from cuslines.cuda_python.cu_propagate_seeds import SeedBatchPropagator
+from trx.trx_file_memmap import TrxFile
+from nibabel.streamlines.tractogram import Tractogram
+from nibabel.streamlines.array_sequence import ArraySequence, MEGABYTE
+from dipy.io.stateful_tractogram import Space, StatefulTractogram
+logger = logging.getLogger("GPUStreamlines")
+# TODO performance:
+# ACT
+# SCIL streamline reduction onboard GPU
+# Remove small/long streamlines on gpu
+class GPUTracker:
+    def __init__(
+        self,
+        dg: GPUDirectionGetter,
+        dataf: np.ndarray,
+        stop_map: np.ndarray,
+        stop_theshold: float,
+        sphere_vertices: np.ndarray,
+        sphere_edges: np.ndarray,
+        max_angle: float = radians(60),
+        step_size: float = 0.5,
+        relative_peak_thresh: float = 0.25,
+        min_separation_angle: float = radians(45),
+        ngpus: int = 1,
+        rng_seed: int = 0,
+        rng_offset: int = 0,
+        chunk_size: int = 100000,
+    ):
+        """
+        Initialize GPUTracker with necessary data.
+        Parameters
+        ----------
+        dg : GPUDirectionGetter
+            Direction getter to use for tracking from
+            cuslines.cu_direction_getters
+        dataf : np.ndarray
+            4D numpy array with ODFs for prob/ptt, diffusion data if doing
+            bootstrapping.
+        stop_map : np.ndarray
+            3D numpy array with stopping metric (e.g., GFA, FA)
+        stop_theshold : float
+            Threshold for stopping metric (e.g., 0.2)
+        sphere_vertices : np.ndarray
+            Vertices of the sphere used for direction sampling.
+        sphere_edges : np.ndarray
+            Edges of the sphere used for direction sampling.
+        max_angle : float, optional
+            Maximum angle (in radians) between steps
+            default: radians(60)
+        step_size : float, optional
+            Step size for tracking
+            default: 0.5
+        relative_peak_thresh : float, optional
+            Relative peak threshold for direction selection
+            default: 0.25
+        min_separation_angle : float, optional
+            Minimum separation angle (in radians) between peaks
+            default: radians(45)
+        ngpus : int, optional
+            Number of GPUs to use
+            default: 1
+        rng_seed : int, optional
+            Seed for random number generator
+            default: 0
+        rng_offset : int, optional
+            Offset for random number generator
+            default: 0
+        """
+        self.dataf = np.ascontiguousarray(dataf, dtype=REAL_DTYPE)
+        self.metric_map = np.ascontiguousarray(stop_map, dtype=REAL_DTYPE)
+        self.sphere_vertices = np.ascontiguousarray(sphere_vertices, dtype=REAL_DTYPE)
+        self.sphere_edges = np.ascontiguousarray(sphere_edges, dtype=np.int32)
+        self.dimx, self.dimy, self.dimz, self.dimt = dataf.shape
+        self.nedges = int(sphere_edges.shape[0])
+        if isinstance(dg, BootDirectionGetter):
+            self.samplm_nr = int(dg.sampling_matrix.shape[0])
+        else:
+            self.samplm_nr = self.dimt
+        self.n32dimt = ((self.dimt + 31) // 32) * 32
+        self.dg = dg
+        self.max_angle = REAL_DTYPE(max_angle)
+        self.tc_threshold = REAL_DTYPE(stop_theshold)
+        self.step_size = REAL_DTYPE(step_size)
+        self.relative_peak_thresh = REAL_DTYPE(relative_peak_thresh)
+        self.min_separation_angle = REAL_DTYPE(min_separation_angle)
+        self.ngpus = int(ngpus)
+        self.rng_seed = int(rng_seed)
+        self.rng_offset = int(rng_offset)
+        self.chunk_size = int(chunk_size)
+        avail = checkCudaErrors(runtime.cudaGetDeviceCount())
+        if self.ngpus > avail:
+            raise RuntimeError(
+                f"Requested {self.ngpus} GPUs but only {avail} available"
+            )
+        logger.info("Creating GPUTracker with %d GPUs...", self.ngpus)
+        self.dataf_d = []
+        self.metric_map_d = []
+        self.sphere_vertices_d = []
+        self.sphere_edges_d = []
+        self.streams = []
+        self.managed_data = []
+        self.seed_propagator = SeedBatchPropagator(gpu_tracker=self)
+        self._allocated = False
+    def __enter__(self):
+        self._allocate()
+        return self
+    def _allocate(self):
+        if self._allocated:
+            return
+        for ii in range(self.ngpus):
+            checkCudaErrors(runtime.cudaSetDevice(ii))
+            self.streams.append(
+                checkCudaErrors(
+                    runtime.cudaStreamCreateWithFlags(runtime.cudaStreamNonBlocking)
+                )
+            )
+        for ii in range(self.ngpus):
+            checkCudaErrors(runtime.cudaSetDevice(ii))
+            # TODO: performance: dataf could be managed or texture memory instead?
+            self.dataf_d.append(
+                checkCudaErrors(runtime.cudaMalloc(REAL_SIZE * self.dataf.size))
+            )
+            self.metric_map_d.append(
+                checkCudaErrors(runtime.cudaMalloc(REAL_SIZE * self.metric_map.size))
+            )
+            self.sphere_vertices_d.append(
+                checkCudaErrors(
+                    runtime.cudaMalloc(REAL_SIZE * self.sphere_vertices.size)
+                )
+            )
+            self.sphere_edges_d.append(
+                checkCudaErrors(
+                    runtime.cudaMalloc(np.int32().nbytes * self.sphere_edges.size)
+                )
+            )
+            checkCudaErrors(
+                runtime.cudaMemcpy(
+                    self.dataf_d[ii],
+                    self.dataf.ctypes.data,
+                    REAL_SIZE * self.dataf.size,
+                    cudaMemcpyKind.cudaMemcpyHostToDevice,
+                )
+            )
+            checkCudaErrors(
+                runtime.cudaMemcpy(
+                    self.metric_map_d[ii],
+                    self.metric_map.ctypes.data,
+                    REAL_SIZE * self.metric_map.size,
+                    cudaMemcpyKind.cudaMemcpyHostToDevice,
+                )
+            )
+            checkCudaErrors(
+                runtime.cudaMemcpy(
+                    self.sphere_vertices_d[ii],
+                    self.sphere_vertices.ctypes.data,
+                    REAL_SIZE * self.sphere_vertices.size,
+                    cudaMemcpyKind.cudaMemcpyHostToDevice,
+                )
+            )
+            checkCudaErrors(
+                runtime.cudaMemcpy(
+                    self.sphere_edges_d[ii],
+                    self.sphere_edges.ctypes.data,
+                    np.int32().nbytes * self.sphere_edges.size,
+                    cudaMemcpyKind.cudaMemcpyHostToDevice,
+                )
+            )
+            self.dg.allocate_on_gpu(ii)
+        self._allocated = True
+    def __exit__(self, exc_type, exc, tb):
+        logger.info("Destroying GPUTracker and freeing GPU memory...")
+        for n in range(self.ngpus):
+            checkCudaErrors(runtime.cudaSetDevice(n))
+            if self.dataf_d[n]:
+                checkCudaErrors(runtime.cudaFree(self.dataf_d[n]))
+            if self.metric_map_d[n]:
+                checkCudaErrors(runtime.cudaFree(self.metric_map_d[n]))
+            if self.sphere_vertices_d[n]:
+                checkCudaErrors(runtime.cudaFree(self.sphere_vertices_d[n]))
+            if self.sphere_edges_d[n]:
+                checkCudaErrors(runtime.cudaFree(self.sphere_edges_d[n]))
+            self.dg.deallocate_on_gpu(n)
+            checkCudaErrors(runtime.cudaStreamDestroy(self.streams[n]))
+        return False
+    def _divide_chunks(self, seeds):
+        global_chunk_sz = self.chunk_size * self.ngpus
+        nchunks = (seeds.shape[0] + global_chunk_sz - 1) // global_chunk_sz
+        return global_chunk_sz, nchunks
+    def generate_sft(self, seeds, ref_img):
+        global_chunk_sz, nchunks = self._divide_chunks(seeds)
+        buffer_size = 0
+        generators = []
+        with tqdm(total=seeds.shape[0]) as pbar:
+            for idx in range(nchunks):
+                self.seed_propagator.propagate(
+                    seeds[idx * global_chunk_sz : (idx + 1) * global_chunk_sz]
+                )
+                buffer_size += self.seed_propagator.get_buffer_size()
+                generators.append(self.seed_propagator.as_generator())
+                pbar.update(
+                    seeds[idx * global_chunk_sz : (idx + 1) * global_chunk_sz].shape[0]
+                )
+        array_sequence = ArraySequence(
+            (item for gen in generators for item in gen), buffer_size // MEGABYTE
+        )
+        return StatefulTractogram(array_sequence, ref_img, Space.VOX)
+    # TODO: performance: consider a way to just output in VOX space directly
+    def generate_trx(self, seeds, ref_img):
+        global_chunk_sz, nchunks = self._divide_chunks(seeds)
+        # Will resize by a factor of 2 if these are exceeded
+        sl_len_guess = 100
+        sl_per_seed_guess = 3
+        n_sls_guess = sl_per_seed_guess * seeds.shape[0]
+        # trx files use memory mapping
+        trx_file = TrxFile(
+            reference=ref_img,
+            nb_streamlines=n_sls_guess,
+            nb_vertices=n_sls_guess * sl_len_guess,
+        )
+        trx_file.streamlines._offsets = trx_file.streamlines._offsets.astype(np.uint64)
+        offsets_idx = 0
+        sls_data_idx = 0
+        with tqdm(total=seeds.shape[0]) as pbar:
+            for idx in range(int(nchunks)):
+                self.seed_propagator.propagate(
+                    seeds[idx * global_chunk_sz : (idx + 1) * global_chunk_sz]
+                )
+                tractogram = Tractogram(
+                    self.seed_propagator.as_array_sequence(),
+                    affine_to_rasmm=ref_img.affine,
+                )
+                tractogram.to_world()
+                sls = tractogram.streamlines
+                new_offsets_idx = offsets_idx + len(sls._offsets)
+                new_sls_data_idx = sls_data_idx + len(sls._data)
+                if (
+                    new_offsets_idx > trx_file.header["NB_STREAMLINES"]
+                    or new_sls_data_idx > trx_file.header["NB_VERTICES"]
+                ):
+                    logger.info("TRX resizing...")
+                    trx_file.resize(
+                        nb_streamlines=new_offsets_idx * 2,
+                        nb_vertices=new_sls_data_idx * 2,
+                    )
+                # TRX uses memmaps here
+                trx_file.streamlines._data[sls_data_idx:new_sls_data_idx] = sls._data
+                trx_file.streamlines._offsets[offsets_idx:new_offsets_idx] = (
+                    sls_data_idx + sls._offsets
+                )
+                trx_file.streamlines._lengths[offsets_idx:new_offsets_idx] = (
+                    sls._lengths
+                )
+                offsets_idx = new_offsets_idx
+                sls_data_idx = new_sls_data_idx
+                pbar.update(
+                    seeds[idx * global_chunk_sz : (idx + 1) * global_chunk_sz].shape[0]
+                )
+        trx_file.resize()
+        return trx_file

cuslines/cuda_python/cutils.py ADDED Viewed

@@ -0,0 +1,64 @@
+from cuda.bindings import driver, nvrtc
+import numpy as np
+from enum import IntEnum
+from cuslines.cuda_python._globals import *
+class ModelType(IntEnum):
+    """Integer codes identifying the available model types."""
+    # NOTE(review): these values are presumably shared with the CUDA sources;
+    # confirm before renumbering or reordering.
+    OPDT = 0
+    CSA = 1
+    PROB = 2
+    PTT = 3
+REAL3_SIZE = 3 * REAL_SIZE
+if REAL_SIZE == 4:
+    REAL_DTYPE = np.float32
+    REAL3_DTYPE = np.dtype(
+        [("x", np.float32), ("y", np.float32), ("z", np.float32)], align=True
+    )
+    REAL_DTYPE_AS_STR = "float"
+    REAL3_DTYPE_AS_STR = "float3"
+elif REAL_SIZE == 8:
+    REAL_DTYPE = np.float64
+    REAL3_DTYPE = np.dtype(
+        [("x", np.float64), ("y", np.float64), ("z", np.float64)], align=True
+    )
+    REAL_DTYPE_AS_STR = "double"
+    REAL3_DTYPE_AS_STR = "double3"
+else:
+    raise NotImplementedError(f"Unsupported REAL_SIZE={REAL_SIZE} in globals.h")
+BLOCK_Y = THR_X_BL // THR_X_SL
+DEV_PTR = object
+def _cudaGetErrorEnum(error):
+    """Return the name or description of a CUDA error value.
+    Parameters
+    ----------
+    error : driver.CUresult, runtime.cudaError_t or nvrtc.nvrtcResult
+        Status value returned as the first element of a CUDA Python call.
+    Returns
+    -------
+    Name or message reported by the matching CUDA API, or ``"<unknown>"``
+    when the driver cannot name a ``CUresult``.
+    Raises
+    ------
+    RuntimeError
+        If ``error`` is none of the supported status types.
+    """
+    # Imported here because the module only imports driver and nvrtc; the
+    # runtime API is what most callers of checkCudaErrors actually use.
+    from cuda.bindings import runtime
+    if isinstance(error, driver.CUresult):
+        err, name = driver.cuGetErrorName(error)
+        return name if err == driver.CUresult.CUDA_SUCCESS else "<unknown>"
+    elif isinstance(error, runtime.cudaError_t):
+        return runtime.cudaGetErrorName(error)[1]
+    elif isinstance(error, nvrtc.nvrtcResult):
+        return nvrtc.nvrtcGetErrorString(error)[1]
+    else:
+        raise RuntimeError("Unknown error type: {}".format(error))
+def checkCudaErrors(result):
+    if result[0].value:
+        raise RuntimeError(
+            "CUDA error code={}({})".format(
+                result[0].value, _cudaGetErrorEnum(result[0])
+            )
+        )
+    if len(result) == 1:
+        return None
+    elif len(result) == 2:
+        return result[1]
+    else:
+        return result[1:]
+def div_up(a, b):
+    return (a + b - 1) // b