cuslines 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,259 @@
1
+ import numpy as np
2
+ import gc
3
+ from cuda.bindings import runtime
4
+ from cuda.bindings.runtime import cudaMemcpyKind
5
+
6
+ from nibabel.streamlines.array_sequence import ArraySequence, MEGABYTE
7
+ import logging
8
+
9
+ from cuslines.cuda_python.cutils import (
10
+ REAL_SIZE,
11
+ REAL_DTYPE,
12
+ REAL3_DTYPE,
13
+ MAX_SLINE_LEN,
14
+ EXCESS_ALLOC_FACT,
15
+ THR_X_SL,
16
+ THR_X_BL,
17
+ DEV_PTR,
18
+ div_up,
19
+ checkCudaErrors,
20
+ )
21
+
22
+
23
+ logger = logging.getLogger("GPUStreamlines")
24
+
25
+
26
class SeedBatchPropagator:
    """Run one batch of seeds through the GPU kernels and stage results on the host.

    For every GPU of the owning tracker this object keeps device pointers for
    the per-batch buffers (seeds, per-seed streamline offsets, initial
    directions, streamline lengths and streamline points) and a pair of host
    arrays (``slines`` / ``sline_lens``) that receive the results. The host
    arrays are reused between batches whenever they are large enough.
    """

    def __init__(self, gpu_tracker):
        """Bind the propagator to ``gpu_tracker`` and create empty per-GPU slots.

        Parameters
        ----------
        gpu_tracker : object
            Owning tracker. This class reads its ``ngpus``, ``streams``,
            ``samplm_nr`` and ``dg`` attributes and advances ``rng_offset``.
        """
        self.gpu_tracker = gpu_tracker
        self.ngpus = gpu_tracker.ngpus

        # Streamline counts per GPU for the previous and the current batch.
        self.nSlines_old = np.zeros(self.ngpus, dtype=np.int32)
        self.nSlines = np.zeros(self.ngpus, dtype=np.int32)
        # Object arrays: each slot holds 0 until a host buffer is allocated.
        self.slines = np.zeros(self.ngpus, dtype=np.ndarray)
        self.sline_lens = np.zeros(self.ngpus, dtype=np.ndarray)

        # Per-GPU device pointers, filled in while a batch is processed.
        self.seeds_d = np.empty(self.ngpus, dtype=DEV_PTR)
        self.slineSeed_d = np.empty(self.ngpus, dtype=DEV_PTR)
        self.slinesOffs_d = np.empty(self.ngpus, dtype=DEV_PTR)
        self.shDirTemp0_d = np.empty(self.ngpus, dtype=DEV_PTR)
        self.slineLen_d = np.empty(self.ngpus, dtype=DEV_PTR)
        self.sline_d = np.empty(self.ngpus, dtype=DEV_PTR)

    def _switch_device(self, n):
        """Make GPU ``n`` current and return its share of the batch.

        Returns
        -------
        tuple
            ``(nseeds_gpu, block, grid)``: the number of seeds assigned to
            GPU ``n`` (0 when the batch is exhausted before reaching it) and
            the launch dimensions derived from ``THR_X_SL`` / ``THR_X_BL``.
        """
        checkCudaErrors(runtime.cudaSetDevice(n))

        seeds_left = max(0, self.nseeds - n * self.nseeds_per_gpu)
        nseeds_gpu = min(self.nseeds_per_gpu, seeds_left)
        seeds_per_block = THR_X_BL // THR_X_SL
        block = (THR_X_SL, seeds_per_block, 1)
        grid = (div_up(nseeds_gpu, seeds_per_block), 1, 1)

        return nseeds_gpu, block, grid

    def _get_sl_buffer_size(self, n):
        """Return the byte size of the streamline point buffer for GPU ``n``.

        The count is widened to int64 before multiplying so that the product
        does not overflow 32 bits.
        """
        return REAL_SIZE * 2 * 3 * MAX_SLINE_LEN * self.nSlines[n].astype(np.int64)

    def _allocate_seed_memory(self, seeds):
        """Upload each GPU's slice of ``seeds`` and allocate per-seed buffers."""
        # Move seeds to GPU
        for ii in range(self.ngpus):
            nseeds_gpu, _, _ = self._switch_device(ii)
            nbytes = REAL_SIZE * 3 * nseeds_gpu
            self.seeds_d[ii] = checkCudaErrors(runtime.cudaMalloc(nbytes))

            first = ii * self.nseeds_per_gpu
            seeds_host = np.ascontiguousarray(
                seeds[first : first + nseeds_gpu], dtype=REAL_DTYPE
            )
            checkCudaErrors(
                runtime.cudaMemcpy(
                    self.seeds_d[ii],
                    seeds_host.ctypes.data,
                    nbytes,
                    cudaMemcpyKind.cudaMemcpyHostToDevice,
                )
            )

        for ii in range(self.ngpus):
            nseeds_gpu, block, grid = self._switch_device(ii)
            # Streamline offsets
            self.slinesOffs_d[ii] = checkCudaErrors(
                runtime.cudaMalloc(np.int32().nbytes * (nseeds_gpu + 1))
            )
            # Initial directions from each seed
            self.shDirTemp0_d[ii] = checkCudaErrors(
                runtime.cudaMalloc(
                    REAL3_DTYPE.itemsize
                    * self.gpu_tracker.samplm_nr
                    * grid[0]
                    * block[1]
                )
            )

    def _cumsum_offsets(
        self,
    ):  # TODO: performance: do this on device? not crucial for performance now
        """Turn per-seed streamline counts into exclusive prefix sums.

        For each GPU the counts are read back from ``slinesOffs_d``, replaced
        by their exclusive cumulative sum (entry ``j`` becomes the sum of the
        counts of seeds ``0 .. j-1``) and written back to the device. The last
        entry, i.e. the total, is stored in ``self.nSlines``.
        """
        for ii in range(self.ngpus):
            nseeds_gpu, _, _ = self._switch_device(ii)
            if nseeds_gpu == 0:
                self.nSlines[ii] = 0
                continue

            counts_h = np.empty(nseeds_gpu + 1, dtype=np.int32)
            checkCudaErrors(
                runtime.cudaMemcpy(
                    counts_h.ctypes.data,
                    self.slinesOffs_d[ii],
                    counts_h.nbytes,
                    cudaMemcpyKind.cudaMemcpyDeviceToHost,
                )
            )

            # Exclusive scan: offsets[0] = 0, offsets[j] = sum(counts[:j]).
            # The trailing input slot is ignored, only its output is written.
            offsets_h = np.zeros(nseeds_gpu + 1, dtype=np.int32)
            np.cumsum(counts_h[:-1], dtype=np.int32, out=offsets_h[1:])
            self.nSlines[ii] = int(offsets_h[nseeds_gpu])

            checkCudaErrors(
                runtime.cudaMemcpy(
                    self.slinesOffs_d[ii],
                    offsets_h.ctypes.data,
                    offsets_h.nbytes,
                    cudaMemcpyKind.cudaMemcpyHostToDevice,
                )
            )

    def _allocate_tracking_memory(self):
        """Allocate device output buffers and make sure host buffers fit.

        Host buffers are kept from the previous batch when they already have
        room for ``self.nSlines[ii]`` streamlines; otherwise they are dropped
        and reallocated with ``EXCESS_ALLOC_FACT`` times the required rows so
        that moderately larger later batches can reuse them.
        """
        for ii in range(self.ngpus):
            self._switch_device(ii)

            seed_idx_bytes = self.nSlines[ii] * np.int32().nbytes
            self.slineSeed_d[ii] = checkCudaErrors(runtime.cudaMalloc(seed_idx_bytes))
            checkCudaErrors(
                runtime.cudaMemset(self.slineSeed_d[ii], -1, seed_idx_bytes)
            )

            buffer_size = self._get_sl_buffer_size(ii)
            logger.debug(f"Streamline buffer size: {buffer_size}")

            n_slines = int(self.nSlines[ii])
            host_sls = self.slines[ii]
            host_lens = self.sline_lens[ii]
            # Decide on the real capacity of the buffers. Testing the truth
            # value of an ndarray raises for more than one element, and
            # comparing against the previous batch's count alone can leave a
            # buffer that is too small for the device-to-host copy.
            reusable = (
                isinstance(host_sls, np.ndarray)
                and isinstance(host_lens, np.ndarray)
                and host_sls.shape[0] >= n_slines
                and host_lens.shape[0] >= n_slines
            )
            if not reusable:
                # Release the old buffers first to limit peak host memory.
                host_sls = host_lens = None
                self.slines[ii] = 0
                self.sline_lens[ii] = 0
                gc.collect()

                capacity = EXCESS_ALLOC_FACT * n_slines
                self.slines[ii] = np.empty(
                    (capacity, MAX_SLINE_LEN * 2, 3),
                    dtype=REAL_DTYPE,
                )
                self.sline_lens[ii] = np.empty(capacity, dtype=np.int32)

        for ii in range(self.ngpus):
            self._switch_device(ii)
            buffer_size = self._get_sl_buffer_size(ii)

            self.slineLen_d[ii] = checkCudaErrors(
                runtime.cudaMalloc(np.int32().nbytes * self.nSlines[ii])
            )
            self.sline_d[ii] = checkCudaErrors(runtime.cudaMalloc(buffer_size))

    def _cleanup(self):
        """Copy results to the host, free per-batch device memory, advance state."""
        for ii in range(self.ngpus):
            self._switch_device(ii)
            stream = self.gpu_tracker.streams[ii]
            checkCudaErrors(
                runtime.cudaMemcpyAsync(
                    self.slines[ii],
                    self.sline_d[ii],
                    self._get_sl_buffer_size(ii),
                    cudaMemcpyKind.cudaMemcpyDeviceToHost,
                    stream,
                )
            )
            checkCudaErrors(
                runtime.cudaMemcpyAsync(
                    self.sline_lens[ii],
                    self.slineLen_d[ii],
                    np.int32().nbytes * self.nSlines[ii],
                    cudaMemcpyKind.cudaMemcpyDeviceToHost,
                    stream,
                )
            )

        for ii in range(self.ngpus):
            self._switch_device(ii)
            # Wait for the copies above before releasing their source buffers.
            checkCudaErrors(runtime.cudaStreamSynchronize(self.gpu_tracker.streams[ii]))
            for dev_ptr in (
                self.seeds_d[ii],
                self.slineSeed_d[ii],
                self.slinesOffs_d[ii],
                self.shDirTemp0_d[ii],
                self.slineLen_d[ii],
                self.sline_d[ii],
            ):
                checkCudaErrors(runtime.cudaFree(dev_ptr))

        # Copy: a plain assignment would alias both names to one array, so the
        # "previous" counts would silently follow the next batch's counts.
        self.nSlines_old = self.nSlines.copy()
        self.gpu_tracker.rng_offset += self.nseeds

    # TODO: performance: better queuing/batching of seeds,
    # if more performance needed,
    # given exponential nature of streamlines
    # May be better to do in cuda code directly
    def propagate(self, seeds):
        """Track all ``seeds`` of one batch, splitting them evenly over the GPUs.

        The batch is processed in two kernel passes: one that counts the
        streamlines per seed and one that generates them. Results are left in
        ``self.slines`` / ``self.sline_lens`` / ``self.nSlines``.

        Parameters
        ----------
        seeds : sequence
            Seed points for this batch; sliced along the first axis and
            converted to ``REAL_DTYPE`` before upload.
        """
        self.nseeds = len(seeds)
        self.nseeds_per_gpu = (
            self.nseeds + self.gpu_tracker.ngpus - 1
        ) // self.gpu_tracker.ngpus

        self._allocate_seed_memory(seeds)

        for ii in range(self.ngpus):
            nseeds_gpu, block, grid = self._switch_device(ii)
            if nseeds_gpu == 0:
                continue
            self.gpu_tracker.dg.getNumStreamlines(ii, nseeds_gpu, block, grid, self)
        for ii in range(self.ngpus):
            checkCudaErrors(runtime.cudaStreamSynchronize(self.gpu_tracker.streams[ii]))

        self._cumsum_offsets()
        self._allocate_tracking_memory()

        for ii in range(self.ngpus):
            nseeds_gpu, block, grid = self._switch_device(ii)
            if nseeds_gpu == 0:
                continue
            self.gpu_tracker.dg.generateStreamlines(ii, nseeds_gpu, block, grid, self)
        for ii in range(self.ngpus):
            checkCudaErrors(runtime.cudaStreamSynchronize(self.gpu_tracker.streams[ii]))

        self._cleanup()

    def get_buffer_size(self):
        """Return the number of bytes needed to hold the current batch's points.

        Returns
        -------
        int
            Sum over all streamlines of ``length * 3 * REAL_SIZE``. The sum is
            accumulated in 64 bits so large batches do not overflow.
        """
        total_points = 0
        for ii in range(self.ngpus):
            n_slines = int(self.nSlines[ii])
            if n_slines == 0:
                # The host buffer may not have been allocated yet.
                continue
            total_points += int(np.sum(self.sline_lens[ii][:n_slines], dtype=np.int64))
        return total_points * 3 * REAL_SIZE

    def as_generator(self):
        """Return a generator over the streamlines of the current batch.

        The generator is lazy: it reads ``self.slines``, ``self.sline_lens``
        and ``self.nSlines`` only while being iterated, and the yielded
        arrays are taken from the host buffers. Consume it (copying the
        data) before the next call to ``propagate``, which overwrites these
        buffers.
        """

        def _yield_slines():
            for ii in range(self.ngpus):
                this_sls = self.slines[ii]
                this_len = self.sline_lens[ii]

                for jj in range(self.nSlines[ii]):
                    npts = this_len[jj]

                    yield np.asarray(this_sls[jj], dtype=REAL_DTYPE)[:npts]

        return _yield_slines()

    def as_array_sequence(self):
        """Return the current batch as a nibabel ``ArraySequence``.

        The generator is consumed during construction; the buffer-size hint
        passed along is ``get_buffer_size()`` in whole megabytes.
        """
        return ArraySequence(self.as_generator(), self.get_buffer_size() // MEGABYTE)
@@ -0,0 +1,315 @@
1
+ from cuda.bindings import runtime
2
+ from cuda.bindings.runtime import cudaMemcpyKind
3
+ # TODO: consider cuda core over cuda bindings
4
+
5
+ import numpy as np
6
+ from tqdm import tqdm
7
+ import logging
8
+ from math import radians
9
+
10
+ from cuslines.cuda_python.cutils import (
11
+ REAL_SIZE,
12
+ REAL_DTYPE,
13
+ checkCudaErrors,
14
+ )
15
+ from cuslines.cuda_python.cu_direction_getters import (
16
+ GPUDirectionGetter,
17
+ BootDirectionGetter,
18
+ )
19
+ from cuslines.cuda_python.cu_propagate_seeds import SeedBatchPropagator
20
+
21
+ from trx.trx_file_memmap import TrxFile
22
+
23
+ from nibabel.streamlines.tractogram import Tractogram
24
+ from nibabel.streamlines.array_sequence import ArraySequence, MEGABYTE
25
+
26
+ from dipy.io.stateful_tractogram import Space, StatefulTractogram
27
+
28
+ logger = logging.getLogger("GPUStreamlines")
29
+
30
+ # TODO performance:
31
+ # ACT
32
+ # SCIL streamline reduction onboard GPU
33
+ # Remove small/long streamlines on gpu
34
+
35
+
36
class GPUTracker:
    """Context manager that owns the GPU-resident data used for tracking.

    Entering the context uploads the data volume, stopping map and sphere to
    every requested GPU and creates one stream per GPU; leaving it frees
    them again. Seeds are tracked in chunks through ``generate_sft`` or
    ``generate_trx``.
    """

    def __init__(
        self,
        dg: GPUDirectionGetter,
        dataf: np.ndarray,
        stop_map: np.ndarray,
        stop_theshold: float,
        sphere_vertices: np.ndarray,
        sphere_edges: np.ndarray,
        max_angle: float = radians(60),
        step_size: float = 0.5,
        relative_peak_thresh: float = 0.25,
        min_separation_angle: float = radians(45),
        ngpus: int = 1,
        rng_seed: int = 0,
        rng_offset: int = 0,
        chunk_size: int = 100000,
    ):
        """
        Initialize GPUTracker with necessary data.

        Parameters
        ----------
        dg : GPUDirectionGetter
            Direction getter to use for tracking from
            cuslines.cu_direction_getters
        dataf : np.ndarray
            4D numpy array with ODFs for prob/ptt, diffusion data if doing
            bootstrapping.
        stop_map : np.ndarray
            3D numpy array with stopping metric (e.g., GFA, FA)
        stop_theshold : float
            Threshold for stopping metric (e.g., 0.2)
        sphere_vertices : np.ndarray
            Vertices of the sphere used for direction sampling.
        sphere_edges : np.ndarray
            Edges of the sphere used for direction sampling.
        max_angle : float, optional
            Maximum angle (in radians) between steps
            default: radians(60)
        step_size : float, optional
            Step size for tracking
            default: 0.5
        relative_peak_thresh : float, optional
            Relative peak threshold for direction selection
            default: 0.25
        min_separation_angle : float, optional
            Minimum separation angle (in radians) between peaks
            default: radians(45)
        ngpus : int, optional
            Number of GPUs to use
            default: 1
        rng_seed : int, optional
            Seed for random number generator
            default: 0
        rng_offset : int, optional
            Offset for random number generator
            default: 0
        chunk_size : int, optional
            Number of seeds per GPU handed to the propagator in one batch;
            each batch holds up to ``chunk_size * ngpus`` seeds
            default: 100000

        Raises
        ------
        RuntimeError
            If more GPUs are requested than CUDA reports as available.
        """
        self.dataf = np.ascontiguousarray(dataf, dtype=REAL_DTYPE)
        self.metric_map = np.ascontiguousarray(stop_map, dtype=REAL_DTYPE)
        self.sphere_vertices = np.ascontiguousarray(sphere_vertices, dtype=REAL_DTYPE)
        self.sphere_edges = np.ascontiguousarray(sphere_edges, dtype=np.int32)

        self.dimx, self.dimy, self.dimz, self.dimt = dataf.shape
        self.nedges = int(sphere_edges.shape[0])
        if isinstance(dg, BootDirectionGetter):
            self.samplm_nr = int(dg.sampling_matrix.shape[0])
        else:
            self.samplm_nr = self.dimt
        # dimt rounded up to the next multiple of 32.
        self.n32dimt = ((self.dimt + 31) // 32) * 32

        self.dg = dg
        self.max_angle = REAL_DTYPE(max_angle)
        self.tc_threshold = REAL_DTYPE(stop_theshold)
        self.step_size = REAL_DTYPE(step_size)
        self.relative_peak_thresh = REAL_DTYPE(relative_peak_thresh)
        self.min_separation_angle = REAL_DTYPE(min_separation_angle)

        self.ngpus = int(ngpus)
        self.rng_seed = int(rng_seed)
        self.rng_offset = int(rng_offset)
        self.chunk_size = int(chunk_size)

        avail = checkCudaErrors(runtime.cudaGetDeviceCount())
        if self.ngpus > avail:
            raise RuntimeError(
                f"Requested {self.ngpus} GPUs but only {avail} available"
            )

        logger.info("Creating GPUTracker with %d GPUs...", self.ngpus)

        # Per-GPU device pointers, filled by _allocate().
        self.dataf_d = []
        self.metric_map_d = []
        self.sphere_vertices_d = []
        self.sphere_edges_d = []

        self.streams = []
        self.managed_data = []

        self.seed_propagator = SeedBatchPropagator(gpu_tracker=self)
        self._allocated = False

    def __enter__(self):
        """Allocate GPU resources (once) and return the tracker itself."""
        self._allocate()
        return self

    def _allocate(self):
        """Create one stream per GPU and upload the shared arrays to each GPU."""
        if self._allocated:
            return

        for ii in range(self.ngpus):
            checkCudaErrors(runtime.cudaSetDevice(ii))
            self.streams.append(
                checkCudaErrors(
                    runtime.cudaStreamCreateWithFlags(runtime.cudaStreamNonBlocking)
                )
            )

        int32_size = np.int32().nbytes
        # (host array, per-GPU device pointer list, bytes per element)
        uploads = (
            (self.dataf, self.dataf_d, REAL_SIZE),
            (self.metric_map, self.metric_map_d, REAL_SIZE),
            (self.sphere_vertices, self.sphere_vertices_d, REAL_SIZE),
            (self.sphere_edges, self.sphere_edges_d, int32_size),
        )

        for ii in range(self.ngpus):
            checkCudaErrors(runtime.cudaSetDevice(ii))

            # TODO: performance: dataf could be managed or texture memory instead?
            for host_arr, dev_ptrs, itemsize in uploads:
                dev_ptrs.append(
                    checkCudaErrors(runtime.cudaMalloc(itemsize * host_arr.size))
                )

            for host_arr, dev_ptrs, itemsize in uploads:
                checkCudaErrors(
                    runtime.cudaMemcpy(
                        dev_ptrs[ii],
                        host_arr.ctypes.data,
                        itemsize * host_arr.size,
                        cudaMemcpyKind.cudaMemcpyHostToDevice,
                    )
                )
            self.dg.allocate_on_gpu(ii)

        self._allocated = True

    def __exit__(self, exc_type, exc, tb):
        """Free all GPU resources; exceptions from the ``with`` body propagate."""
        if not self._allocated:
            # Nothing was uploaded, so there is nothing to index or free.
            return False

        logger.info("Destroying GPUTracker and freeing GPU memory...")

        device_lists = (
            self.dataf_d,
            self.metric_map_d,
            self.sphere_vertices_d,
            self.sphere_edges_d,
        )
        for n in range(self.ngpus):
            checkCudaErrors(runtime.cudaSetDevice(n))
            for dev_ptrs in device_lists:
                if dev_ptrs[n]:
                    checkCudaErrors(runtime.cudaFree(dev_ptrs[n]))
            self.dg.deallocate_on_gpu(n)

            checkCudaErrors(runtime.cudaStreamDestroy(self.streams[n]))

        # Forget the freed handles so that entering the context again
        # allocates fresh resources instead of reusing dangling pointers.
        for dev_ptrs in device_lists:
            dev_ptrs.clear()
        self.streams.clear()
        self._allocated = False
        return False

    def _divide_chunks(self, seeds):
        """Return ``(seeds per batch over all GPUs, number of batches)``."""
        global_chunk_sz = self.chunk_size * self.ngpus
        nchunks = (seeds.shape[0] + global_chunk_sz - 1) // global_chunk_sz
        return global_chunk_sz, nchunks

    def generate_sft(self, seeds, ref_img):
        """Track all ``seeds`` and return the result as a StatefulTractogram.

        Parameters
        ----------
        seeds : np.ndarray
            Seed points; sliced along the first axis into batches.
        ref_img : object
            Reference passed on to ``StatefulTractogram``.

        Returns
        -------
        StatefulTractogram
            Streamlines of all batches, declared in ``Space.VOX``.
        """
        global_chunk_sz, nchunks = self._divide_chunks(seeds)
        array_sequence = ArraySequence()

        with tqdm(total=seeds.shape[0]) as pbar:
            for idx in range(nchunks):
                chunk = seeds[idx * global_chunk_sz : (idx + 1) * global_chunk_sz]
                self.seed_propagator.propagate(chunk)
                # Copy this batch out right away: the propagator's generator
                # reads its host buffers lazily, and the next propagate()
                # call overwrites them.
                array_sequence.extend(self.seed_propagator.as_array_sequence())
                pbar.update(chunk.shape[0])
        return StatefulTractogram(array_sequence, ref_img, Space.VOX)

    # TODO: performance: consider a way to just output in VOX space directly
    def generate_trx(self, seeds, ref_img):
        """Track all ``seeds`` and return the result as a TrxFile.

        Each batch is converted with ``ref_img.affine`` via
        ``Tractogram.to_world`` and appended to the TRX arrays, which are
        grown when a batch does not fit and trimmed by a final ``resize()``.

        Parameters
        ----------
        seeds : np.ndarray
            Seed points; sliced along the first axis into batches.
        ref_img : object
            Reference image; used as the TRX reference and for its ``affine``.

        Returns
        -------
        TrxFile
        """
        global_chunk_sz, nchunks = self._divide_chunks(seeds)

        # Will resize by a factor of 2 if these are exceeded
        sl_len_guess = 100
        sl_per_seed_guess = 3
        n_sls_guess = sl_per_seed_guess * seeds.shape[0]

        # trx files use memory mapping
        trx_file = TrxFile(
            reference=ref_img,
            nb_streamlines=n_sls_guess,
            nb_vertices=n_sls_guess * sl_len_guess,
        )
        trx_file.streamlines._offsets = trx_file.streamlines._offsets.astype(np.uint64)
        offsets_idx = 0
        sls_data_idx = 0

        with tqdm(total=seeds.shape[0]) as pbar:
            for idx in range(int(nchunks)):
                chunk = seeds[idx * global_chunk_sz : (idx + 1) * global_chunk_sz]
                self.seed_propagator.propagate(chunk)
                tractogram = Tractogram(
                    self.seed_propagator.as_array_sequence(),
                    affine_to_rasmm=ref_img.affine,
                )
                tractogram.to_world()
                sls = tractogram.streamlines

                new_offsets_idx = offsets_idx + len(sls._offsets)
                new_sls_data_idx = sls_data_idx + len(sls._data)

                if (
                    new_offsets_idx > trx_file.header["NB_STREAMLINES"]
                    or new_sls_data_idx > trx_file.header["NB_VERTICES"]
                ):
                    logger.info("TRX resizing...")
                    trx_file.resize(
                        nb_streamlines=new_offsets_idx * 2,
                        nb_vertices=new_sls_data_idx * 2,
                    )

                # TRX uses memmaps here
                trx_file.streamlines._data[sls_data_idx:new_sls_data_idx] = sls._data
                # Batch offsets are relative to the batch; shift them to the
                # position of this batch in the combined data array.
                trx_file.streamlines._offsets[offsets_idx:new_offsets_idx] = (
                    sls_data_idx + sls._offsets
                )
                trx_file.streamlines._lengths[offsets_idx:new_offsets_idx] = (
                    sls._lengths
                )

                offsets_idx = new_offsets_idx
                sls_data_idx = new_sls_data_idx
                pbar.update(chunk.shape[0])
        trx_file.resize()

        return trx_file
@@ -0,0 +1,64 @@
1
+ from cuda.bindings import driver, nvrtc
2
+
3
+ import numpy as np
4
+
5
+ from enum import IntEnum
6
+
7
+ from cuslines.cuda_python._globals import *
8
+
9
+
10
class ModelType(IntEnum):
    """Integer codes for the supported model kinds.

    NOTE(review): presumably these values must match constants used on the
    CUDA side - confirm before renumbering.
    """

    OPDT = 0
    CSA = 1
    PROB = 2
    PTT = 3
15
+
16
+
17
# Sizes, NumPy dtypes and CUDA type names derived from the configured
# floating-point width (REAL_SIZE, in bytes).
REAL3_SIZE = 3 * REAL_SIZE

# REAL_SIZE -> (NumPy scalar type, CUDA scalar type name)
_REAL_VARIANTS = {
    4: (np.float32, "float"),
    8: (np.float64, "double"),
}
if REAL_SIZE not in _REAL_VARIANTS:
    raise NotImplementedError(f"Unsupported REAL_SIZE={REAL_SIZE} in globals.h")

REAL_DTYPE, REAL_DTYPE_AS_STR = _REAL_VARIANTS[REAL_SIZE]
# Structured x/y/z record built with NumPy's aligned layout.
REAL3_DTYPE = np.dtype(
    [(axis, REAL_DTYPE) for axis in ("x", "y", "z")], align=True
)
REAL3_DTYPE_AS_STR = REAL_DTYPE_AS_STR + "3"

BLOCK_Y = THR_X_BL // THR_X_SL
# Device pointers are kept in NumPy object arrays.
DEV_PTR = object
36
+
37
+
38
def _cudaGetErrorEnum(error):
    """Return the symbolic name or message for a CUDA status code.

    Parameters
    ----------
    error : driver.CUresult, runtime.cudaError_t or nvrtc.nvrtcResult
        Status value returned as the first item of a CUDA Python call.

    Returns
    -------
    The name/message reported by the matching CUDA API, or ``"<unknown>"``
    when the driver API cannot name a driver error.

    Raises
    ------
    RuntimeError
        If ``error`` is none of the supported status types.
    """
    # Imported locally so this module's top-level imports stay untouched.
    from cuda.bindings import runtime

    if isinstance(error, driver.CUresult):
        err, name = driver.cuGetErrorName(error)
        return name if err == driver.CUresult.CUDA_SUCCESS else "<unknown>"
    if isinstance(error, runtime.cudaError_t):
        # Runtime-API calls (cudaMalloc, cudaMemcpy, ...) report cudaError_t;
        # without this branch their failures surfaced as "Unknown error type".
        return runtime.cudaGetErrorName(error)[1]
    if isinstance(error, nvrtc.nvrtcResult):
        return nvrtc.nvrtcGetErrorString(error)[1]
    raise RuntimeError("Unknown error type: {}".format(error))
46
+
47
+
48
def checkCudaErrors(result):
    """Unwrap the return value of a CUDA Python call, raising on failure.

    Parameters
    ----------
    result : sequence
        ``(status, *values)`` as returned by the CUDA bindings; ``status``
        must expose an integer ``value`` attribute.

    Returns
    -------
    ``None`` when there are no values, the single value when there is
    exactly one, otherwise ``result[1:]``.

    Raises
    ------
    RuntimeError
        If the status value is non-zero.
    """
    status = result[0]
    if status.value:
        raise RuntimeError(
            "CUDA error code={}({})".format(status.value, _cudaGetErrorEnum(status))
        )

    n_items = len(result)
    if n_items == 1:
        return None
    return result[1] if n_items == 2 else result[1:]
61
+
62
+
63
def div_up(a, b):
    """Return ``a`` divided by ``b``, rounded up (for non-negative ``a``, positive ``b``)."""
    padded = a + b - 1
    return padded // b