cuslines 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cuslines/__init__.py +13 -0
- cuslines/cuda_c/boot.cu +1066 -0
- cuslines/cuda_c/cudamacro.h +86 -0
- cuslines/cuda_c/cuwsort.cuh +171 -0
- cuslines/cuda_c/disc.h +1886 -0
- cuslines/cuda_c/generate_streamlines_cuda.cu +695 -0
- cuslines/cuda_c/globals.h +103 -0
- cuslines/cuda_c/ptt.cu +559 -0
- cuslines/cuda_c/ptt.cuh +47 -0
- cuslines/cuda_c/tracking_helpers.cu +290 -0
- cuslines/cuda_c/utils.cu +138 -0
- cuslines/cuda_python/__init__.py +13 -0
- cuslines/cuda_python/_globals.py +10 -0
- cuslines/cuda_python/cu_direction_getters.py +472 -0
- cuslines/cuda_python/cu_propagate_seeds.py +259 -0
- cuslines/cuda_python/cu_tractography.py +315 -0
- cuslines/cuda_python/cutils.py +64 -0
- cuslines-2.0.0.dist-info/METADATA +90 -0
- cuslines-2.0.0.dist-info/RECORD +22 -0
- cuslines-2.0.0.dist-info/WHEEL +5 -0
- cuslines-2.0.0.dist-info/licenses/LICENSE +26 -0
- cuslines-2.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,695 @@
|
|
|
1
|
+
/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
|
2
|
+
*
|
|
3
|
+
* Redistribution and use in source and binary forms, with or without
|
|
4
|
+
* modification, are permitted provided that the following conditions are met:
|
|
5
|
+
*
|
|
6
|
+
* 1. Redistributions of source code must retain the above copyright notice, this
|
|
7
|
+
* list of conditions and the following disclaimer.
|
|
8
|
+
*
|
|
9
|
+
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
|
10
|
+
* this list of conditions and the following disclaimer in the documentation
|
|
11
|
+
* and/or other materials provided with the distribution.
|
|
12
|
+
*
|
|
13
|
+
* 3. Neither the name of the copyright holder nor the names of its
|
|
14
|
+
* contributors may be used to endorse or promote products derived from
|
|
15
|
+
* this software without specific prior written permission.
|
|
16
|
+
*
|
|
17
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
18
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
19
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
20
|
+
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
21
|
+
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
22
|
+
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
23
|
+
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
24
|
+
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
25
|
+
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
26
|
+
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
27
|
+
*/
|
|
28
|
+
|
|
29
|
+
#include <cuda_runtime.h>
|
|
30
|
+
#include <curand_kernel.h>
|
|
31
|
+
|
|
32
|
+
#include "globals.h"
|
|
33
|
+
#include "cuwsort.cuh"
|
|
34
|
+
#include "ptt.cuh"
|
|
35
|
+
|
|
36
|
+
#include "utils.cu"
|
|
37
|
+
#include "tracking_helpers.cu"
|
|
38
|
+
#include "boot.cu"
|
|
39
|
+
#include "ptt.cu"
|
|
40
|
+
|
|
41
|
+
#define MAX_NUM_DIR (128)
|
|
42
|
+
|
|
43
|
+
#define NTHR_GEN (128)
|
|
44
|
+
|
|
45
|
+
#define MAX_DIMS (8)
|
|
46
|
+
#define MAX_STR_LEN (256)
|
|
47
|
+
|
|
48
|
+
/*
 * Probabilistic direction getter, executed cooperatively by the BDIM_X lanes
 * that share one threadIdx.y (one lane group per streamline/seed).
 *
 * Steps, mirroring the Python reference kept in the inline comments:
 *   1. interpolate the PMF at `point` into this row's slice of dynamic shared
 *      memory (trilinear_interp_d);
 *   2. zero every entry below PMF_THRESHOLD_P * max(pmf);
 *   3. IS_START == true : delegate to peak_directions_d, which fills `dirs`,
 *                         and return its result;
 *      IS_START == false: zero entries whose |dot(dir, vertex)| is below
 *                         cos(max_angle), build the cumulative sum in place,
 *                         draw one sample from it and store the chosen vertex
 *                         (sign-aligned with `dir`) in *dirs.
 *
 * Shared memory: uses the dynamic array `__sh`; each row owns n32dimt REAL_T
 * values (dimt rounded up to a multiple of 32). For IS_START an additional
 * int region of n32dimt entries per row is expected after the BDIM_Y PMF rows.
 *
 * Returns 0 when interpolation reports failure or when the filtered
 * distribution is empty (last CDF value is 0). Otherwise returns 1 for
 * IS_START == false, or the value returned by peak_directions_d for
 * IS_START == true (presumably the number of peaks -- confirm in its
 * definition).
 *
 * All lanes of a lane group must call this function together: it uses
 * __syncwarp/__shfl_sync/__ballot_sync with the group's mask.
 */
template<int BDIM_X,
         int BDIM_Y,
         bool IS_START,
         typename REAL_T,
         typename REAL3_T>
__device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st,
                                    const REAL_T *__restrict__ pmf,
                                    const REAL_T max_angle,
                                    const REAL_T relative_peak_thres,
                                    const REAL_T min_separation_angle,
                                    REAL3_T dir,
                                    const int dimx,
                                    const int dimy,
                                    const int dimz,
                                    const int dimt,
                                    const REAL3_T point,
                                    const REAL3_T *__restrict__ sphere_vertices,
                                    const int2 *__restrict__ sphere_edges,
                                    const int num_edges,
                                    REAL3_T *__restrict__ dirs) {
    const int tidx = threadIdx.x;
    const int tidy = threadIdx.y;

    // Lane id inside the warp and first lane of this BDIM_X-wide lane group.
    const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
    const int grp_base = lid & (~(BDIM_X-1));
    // Participation mask covering exactly the lanes of this group.
    const unsigned int WMASK = ((1ull << BDIM_X)-1) << grp_base;

    const int n32dimt = ((dimt+31)/32)*32;

    extern __shared__ REAL_T __sh[];
    REAL_T *__pmf_data_sh = __sh + tidy*n32dimt;

    // pmf = self.pmf_gen.get_pmf_c(&point[0], pmf)
    __syncwarp(WMASK);
    const int rv = trilinear_interp_d<BDIM_X>(dimx, dimy, dimz, dimt, -1, pmf, point, __pmf_data_sh);
    __syncwarp(WMASK);
    if (rv != 0) {
        return 0;
    }

    // for i in range(_len):
    //     if pmf[i] > max_pmf:
    //         max_pmf = pmf[i]
    // absolute_pmf_threshold = pmf_threshold * max_pmf
    const REAL_T absolpmf_thresh = PMF_THRESHOLD_P * max_d<BDIM_X>(dimt, __pmf_data_sh, REAL_MIN);
    __syncwarp(WMASK);

    // for i in range(_len):
    //     if pmf[i] < absolute_pmf_threshold:
    //         pmf[i] = 0.0
    #pragma unroll
    for(int i = tidx; i < dimt; i += BDIM_X) {
        if (__pmf_data_sh[i] < absolpmf_thresh) {
            __pmf_data_sh[i] = REAL_T(0);
        }
    }
    __syncwarp(WMASK);

    if constexpr (IS_START) {
        // Scratch index buffer placed after the BDIM_Y PMF rows.
        int *__shInd = reinterpret_cast<int *>(__sh + BDIM_Y*n32dimt) + tidy*n32dimt;
        return peak_directions_d<BDIM_X,
                                 BDIM_Y>(__pmf_data_sh,
                                         dirs,
                                         sphere_vertices,
                                         sphere_edges,
                                         num_edges,
                                         dimt,
                                         __shInd,
                                         relative_peak_thres,
                                         min_separation_angle);
    } else {
        REAL_T __tmp;
#ifdef DEBUG
        __syncwarp(WMASK);
        if (tidx == 0) {
            printArray("__pmf_data_sh initial", 8, dimt, __pmf_data_sh);
            printf("absolpmf_thresh %10.8f\n", absolpmf_thresh);
            printf("---> dir %10.8f, %10.8f, %10.8f\n", dir.x, dir.y, dir.z);
            printf("---> point %10.8f, %10.8f, %10.8f\n", point.x, point.y, point.z);
        }
        __syncwarp(WMASK);
        if (tidx == 15) {
            printf("absolpmf_thresh %10.8f l15\n", absolpmf_thresh);
            printf("---> dir %10.8f, %10.8f, %10.8f l15\n", dir.x, dir.y, dir.z);
            printf("---> point %10.8f, %10.8f, %10.8f l15\n", point.x, point.y, point.z);
        }
        __syncwarp(WMASK);
        if (tidx == 31) {
            printArray("__pmf_data_sh initial l31", 8, dimt, __pmf_data_sh);
            printf("absolpmf_thresh %10.8f l31\n", absolpmf_thresh);
            printf("---> dir %10.8f, %10.8f, %10.8f l31\n", dir.x, dir.y, dir.z);
            printf("---> point %10.8f, %10.8f, %10.8f l31\n", point.x, point.y, point.z);
        }
        __syncwarp(WMASK);
#endif

        // // These should not be relevant
        // if norm(&direction[0]) == 0:
        //     return 1
        // normalize(&direction[0])

        // for i in range(_len):
        //     cos_sim = self.vertices[i][0] * direction[0] \
        //             + self.vertices[i][1] * direction[1] \
        //             + self.vertices[i][2] * direction[2]
        //     if cos_sim < 0:
        //         cos_sim = cos_sim * -1
        //     if cos_sim < self.cos_similarity:
        //         pmf[i] = 0
        const REAL_T cos_similarity = COS(max_angle);

        #pragma unroll
        for(int i = tidx; i < dimt; i += BDIM_X) {
            const REAL_T dot = dir.x*sphere_vertices[i].x+
                               dir.y*sphere_vertices[i].y+
                               dir.z*sphere_vertices[i].z;

            if (FABS(dot) < cos_similarity) {
                __pmf_data_sh[i] = REAL_T(0);
            }
        }
        __syncwarp(WMASK);

#ifdef DEBUG
        __syncwarp(WMASK);
        if (tidx == 0) {
            printArray("__pmf_data_sh after filtering", 8, dimt, __pmf_data_sh);
        }
        __syncwarp(WMASK);
#endif

        // cumsum(pmf, pmf, _len)
        prefix_sum_sh_d<BDIM_X>(__pmf_data_sh, dimt);

#ifdef DEBUG
        __syncwarp(WMASK);
        if (tidx == 0) {
            printArray("__pmf_data_sh after cumsum", 8, dimt, __pmf_data_sh);
        }
        __syncwarp(WMASK);
#endif

        // last_cdf = pmf[_len - 1]
        // if last_cdf == 0:
        //     return 1
        REAL_T last_cdf = __pmf_data_sh[dimt - 1];
        if (last_cdf == 0) {
            return 0;
        }

        // idx = where_to_insert(pmf, random() * last_cdf, _len)
        // Only lane 0 of the group draws; the value is broadcast to the group.
        if (tidx == 0) {
            __tmp = curand_uniform(st) * last_cdf;
        }
        REAL_T selected_cdf = __shfl_sync(WMASK, __tmp, 0, BDIM_X);
        // Both these implementations work
#if 1
        // Binary search narrows [low, high] to fewer than BDIM_X candidates,
        // then one ballot finds the first entry strictly above the sample.
        int low = 0;
        int high = dimt - 1;
        while ((high - low) >= BDIM_X) {
            const int mid = (low + high) / 2;
            if (__pmf_data_sh[mid] < selected_cdf) {
                low = mid;
            } else {
                high = mid;
            }
        }
        const bool __in_range = (low+tidx <= high);
        const bool __ballot = __in_range ? (selected_cdf < __pmf_data_sh[low+tidx]) : 0;
        // __ballot_sync reports bits at absolute warp-lane positions; shift by
        // the group's first lane so that bit k corresponds to tidx == k.
        unsigned int __msk = __ballot_sync(WMASK, __ballot) >> grp_base;
        if (__msk == 0) {
            // curand_uniform may return exactly 1.0, making selected_cdf equal
            // to last_cdf so that no entry is strictly greater. Take the first
            // entry that reaches the sample instead of indexing with
            // __ffs(0)-1 == -1. The mask is identical in all lanes of the
            // group, so this branch is uniform.
            const bool __reached = __in_range ? (selected_cdf <= __pmf_data_sh[low+tidx]) : 0;
            __msk = __ballot_sync(WMASK, __reached) >> grp_base;
        }
        const int indProb = (__msk != 0) ? (low + __ffs(__msk) - 1) : high;
#else
        int indProb = dimt - 1;
        for (int ii = 0; ii < dimt; ii+=BDIM_X) {
            int __is_greater = 0;
            if (ii+tidx < dimt) {
                __is_greater = selected_cdf < __pmf_data_sh[ii+tidx];
            }
            // Shift so that bit k corresponds to tidx == k (see above).
            const unsigned int __msk = __ballot_sync(WMASK, __is_greater) >> grp_base;
            if (__msk != 0) {
                indProb = ii + __ffs(__msk) - 1;
                break;
            }
        }
#endif

#ifdef DEBUG
        __syncwarp(WMASK);
        if (tidx == 0) {
            printf("last_cdf %10.8f\n", last_cdf);
            printf("selected_cdf %10.8f\n", selected_cdf);
            printf("indProb %i out of %i\n", indProb, dimt);
        }
        __syncwarp(WMASK);
#endif

        // newdir = self.vertices[idx]
        // if (direction[0] * newdir[0]
        //     + direction[1] * newdir[1]
        //     + direction[2] * newdir[2] > 0):
        //     copy_point(&newdir[0], &direction[0])
        // else:
        //     newdir[0] = newdir[0] * -1
        //     newdir[1] = newdir[1] * -1
        //     newdir[2] = newdir[2] * -1
        //     copy_point(&newdir[0], &direction[0])
        if (tidx == 0) {
            // Flip the chosen vertex if needed so it points into the same
            // half-space as the incoming direction.
            if ((dir.x * sphere_vertices[indProb].x +
                 dir.y * sphere_vertices[indProb].y +
                 dir.z * sphere_vertices[indProb].z) > 0) {
                *dirs = MAKE_REAL3(sphere_vertices[indProb].x,
                                   sphere_vertices[indProb].y,
                                   sphere_vertices[indProb].z);
            } else {
                *dirs = MAKE_REAL3(-sphere_vertices[indProb].x,
                                   -sphere_vertices[indProb].y,
                                   -sphere_vertices[indProb].z);
            }
            // printf("direction addr write %p, slid %i\n", dirs, blockIdx.x*blockDim.y+threadIdx.y);
        }

#ifdef DEBUG
        __syncwarp(WMASK);
        if (tidx == 0) {
            printf("last_cdf %10.8f\n", last_cdf);
            printf("selected_cdf %10.8f\n", selected_cdf);
            printf("indProb %i out of %i\n", indProb, dimt);
        }
        __syncwarp(WMASK);
        if (tidx == 15) {
            printf("last_cdf %10.8f l15\n", last_cdf);
            printf("selected_cdf %10.8f l15\n", selected_cdf);
            printf("indProb %i out of %i l15\n", indProb, dimt);
        }
        __syncwarp(WMASK);
        if (tidx == 31) {
            printf("last_cdf %10.8f l31\n", last_cdf);
            printf("selected_cdf %10.8f l31\n", selected_cdf);
            printf("indProb %i out of %i l31\n", indProb, dimt);
        }
        __syncwarp(WMASK);
#endif
        return 1;
    }
}
|
|
291
|
+
|
|
292
|
+
/*
 * Propagates one streamline from `seed` along `first_step`, executed
 * cooperatively by the BDIM_X lanes that share one threadIdx.y.
 *
 * Each iteration asks the model-specific direction getter (PROB or PTT) for a
 * new direction, advances `point` by step_size/step_frac in voxel units
 * (direction divided by voxel_size), and classifies the new point with
 * check_point_d. Lane 0 stores every step_frac-th point in `streamline`.
 * The loop stops when no direction is returned, when the tissue class is
 * ENDPOINT / INVALIDPOINT / OUTSIDEIMAGE, or after MAX_SLINE_LEN*step_frac
 * iterations.
 *
 * Outputs:
 *   streamline : points written by lane 0, starting with the seed at index 0.
 *   nsteps[0]  : written by every lane (callers pass a per-thread variable);
 *                number of stored points as computed below.
 * Returns the last tissue class (TRACKPOINT if the loop never classified one).
 *
 * For model types other than PROB and PTT no direction getter is available,
 * so tracking stops right after the seed.
 */
template<int BDIM_X,
         int BDIM_Y,
         ModelType MODEL_T,
         typename REAL_T,
         typename REAL3_T>
__device__ int tracker_d(curandStatePhilox4_32_10_t *st,
                         const REAL_T max_angle,
                         const REAL_T tc_threshold,
                         const REAL_T step_size,
                         const REAL_T relative_peak_thres,
                         const REAL_T min_separation_angle,
                         REAL3_T seed,
                         REAL3_T first_step,
                         REAL_T* ptt_frame,
                         REAL3_T voxel_size,
                         const int dimx,
                         const int dimy,
                         const int dimz,
                         const int dimt,
                         const REAL_T *__restrict__ dataf,
                         const REAL_T *__restrict__ metric_map,
                         const int samplm_nr,
                         const REAL3_T *__restrict__ sphere_vertices,
                         const int2 *__restrict__ sphere_edges,
                         const int num_edges,
                         int *__restrict__ nsteps,
                         REAL3_T *__restrict__ streamline) {

    const int tidx = threadIdx.x;
    const int tidy = threadIdx.y;

    // Participation mask covering the BDIM_X lanes of this row.
    const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
    const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));

    int tissue_class = TRACKPOINT;

    REAL3_T point = seed;
    REAL3_T direction = first_step;
    // One slot per row; lane 0 of the direction getter publishes the new
    // direction here for the other lanes of the row.
    __shared__ REAL3_T __sh_new_dir[BDIM_Y];

    if (tidx == 0) {
        streamline[0] = point;
    }
    __syncwarp(WMASK);

    int step_frac;
    if constexpr (MODEL_T == PTT) {
        step_frac = STEP_FRAC;
    } else {
        step_frac = 1; // STEP_FRAC could be useful in other models
    }

    int i;
    for(i = 1; i < MAX_SLINE_LEN*step_frac; i++) {
        // Defaults to "no direction" so that unsupported model types stop
        // immediately instead of reading an uninitialized value.
        int ndir = 0;
        if constexpr (MODEL_T == PROB) {
            ndir = get_direction_prob_d<BDIM_X,
                                        BDIM_Y,
                                        0>(
                                        st,
                                        dataf,
                                        max_angle,
                                        relative_peak_thres,
                                        min_separation_angle,
                                        direction,
                                        dimx, dimy, dimz, dimt,
                                        point,
                                        sphere_vertices,
                                        sphere_edges,
                                        num_edges,
                                        __sh_new_dir + tidy);
        } else if constexpr (MODEL_T == PTT) {
            ndir = get_direction_ptt_d<BDIM_X,
                                       BDIM_Y,
                                       0>(
                                       st,
                                       dataf,
                                       max_angle,
                                       step_size,
                                       direction,
                                       ptt_frame,
                                       dimx, dimy, dimz, dimt,
                                       point,
                                       sphere_vertices,
                                       __sh_new_dir + tidy);
        }
        __syncwarp(WMASK);
        direction = __sh_new_dir[tidy];
        __syncwarp(WMASK);

        if (ndir == 0) {
            break;
        }
#if 0
        if (threadIdx.y == 1 && threadIdx.x == 0) {
            printf("tracker: i: %d, direction: (%f, %f, %f)\n", i, direction.x, direction.y, direction.z);
        }
        //return;
#endif

        point.x += (direction.x / voxel_size.x) * (step_size / step_frac);
        point.y += (direction.y / voxel_size.y) * (step_size / step_frac);
        point.z += (direction.z / voxel_size.z) * (step_size / step_frac);

        // Only every step_frac-th sub-step is recorded.
        if ((tidx == 0) && ((i % step_frac) == 0)){
            streamline[i/step_frac] = point;
#if 0
            if (threadIdx.y == 1) {
                printf("streamline[%d]: %f, %f, %f\n", i, point.x, point.y, point.z);
            }
#endif
        }
        __syncwarp(WMASK);

        tissue_class = check_point_d<BDIM_X, BDIM_Y>(tc_threshold, point, dimx, dimy, dimz, metric_map);

#if 0
        __syncwarp(WMASK);
        if (tidx == 0) {
            printf("step_size %f\n", step_size);
            printf("direction %f, %f, %f\n", direction.x, direction.y, direction.z);
            printf("direction addr read %p, slid %i\n", __sh_new_dir + tidy, blockIdx.x*blockDim.y+threadIdx.y);
            printf("voxel_size %f, %f, %f\n", voxel_size.x, voxel_size.y, voxel_size.z);
            printf("point %f, %f, %f\n", point.x, point.y, point.z);
            printf("tc %i\n", tissue_class);
        }
        __syncwarp(WMASK);
        if (tidx == 15) {
            printf("step_size %f l15\n", step_size);
            printf("direction %f, %f, %f l15\n", direction.x, direction.y, direction.z);
            printf("direction addr read %p, slid %i l15\n", __sh_new_dir + tidy, blockIdx.x*blockDim.y+threadIdx.y);
            printf("voxel_size %f, %f, %f l15\n", voxel_size.x, voxel_size.y, voxel_size.z);
            printf("point %f, %f, %f l15\n", point.x, point.y, point.z);
            printf("tc %i l15\n", tissue_class);
        }
        __syncwarp(WMASK);
        if (tidx == 31) {
            printf("step_size %f l31\n", step_size);
            printf("direction %f, %f, %f l31\n", direction.x, direction.y, direction.z);
            printf("direction addr read %p, slid %i l31\n", __sh_new_dir + tidy, blockIdx.x*blockDim.y+threadIdx.y);
            printf("voxel_size %f, %f, %f l31\n", voxel_size.x, voxel_size.y, voxel_size.z);
            printf("point %f, %f, %f l31\n", point.x, point.y, point.z);
            printf("tc %i l31\n", tissue_class);
        }
        __syncwarp(WMASK);
#endif

        if (tissue_class == ENDPOINT ||
            tissue_class == INVALIDPOINT ||
            tissue_class == OUTSIDEIMAGE) {
            break;
        }
    }
    nsteps[0] = i/step_frac;
    // If tracking stopped between two recorded samples, account for the
    // partial step and store the final position, provided it still fits.
    // NOTE(review): the point is stored at index nsteps[0] *after* the
    // increment, i.e. one past the last counted entry -- confirm intended.
    if (((i % step_frac) != 0) && i < step_frac*(MAX_SLINE_LEN - 1)){
        nsteps[0]++;
        if (tidx == 0) {
            streamline[nsteps[0]] = point;
        }
    }

    return tissue_class;
}
|
|
455
|
+
|
|
456
|
+
/*
 * Kernel: for every seed, run the probabilistic direction getter in its
 * "start" mode (IS_START = 1) and record the value it returns.
 *
 * Launch layout: threadIdx.y selects the seed handled inside a block
 * (seed index = blockIdx.x*blockDim.y + threadIdx.y); the threadIdx.x lanes
 * of a row cooperate on that seed. Rows whose seed index is >= nseed exit
 * immediately.
 *
 * Outputs:
 *   shDir0      : directions written by get_direction_prob_d, starting at
 *                 offset seed_index*dimt.
 *   slineOutOff : slineOutOff[seed_index] receives the returned count
 *                 (written by lane 0 of the row).
 *
 * Dynamic shared memory is consumed by get_direction_prob_d.
 */
template<int BDIM_X,
         int BDIM_Y,
         typename REAL_T,
         typename REAL3_T>
__global__ void getNumStreamlinesProb_k(const REAL_T max_angle,
                                        const REAL_T relative_peak_thres,
                                        const REAL_T min_separation_angle,
                                        const long long rndSeed,
                                        const int nseed,
                                        const REAL3_T *__restrict__ seeds,
                                        const int dimx,
                                        const int dimy,
                                        const int dimz,
                                        const int dimt,
                                        const REAL_T *__restrict__ dataf,
                                        const REAL3_T *__restrict__ sphere_vertices,
                                        const int2 *__restrict__ sphere_edges,
                                        const int num_edges,
                                        REAL3_T *__restrict__ shDir0,
                                        int *slineOutOff) {

    // Seed handled by this row of the block.
    const int seedIdx = blockIdx.x*blockDim.y + threadIdx.y;
    if (seedIdx >= nseed) {
        return;
    }

    // Unique per-thread id, used as the Philox subsequence.
    const size_t globalTid = blockIdx.x * blockDim.y * blockDim.x + blockDim.x * threadIdx.y + threadIdx.x;

    curandStatePhilox4_32_10_t rng;
    curand_init(rndSeed, globalTid, 0, &rng);

    // Output slice for this seed.
    REAL3_T *__restrict__ seedDirs = shDir0+seedIdx*dimt;

    const int numDirs = get_direction_prob_d<BDIM_X,
                                             BDIM_Y,
                                             1>(
                                             &rng,
                                             dataf,
                                             max_angle,
                                             relative_peak_thres,
                                             min_separation_angle,
                                             MAKE_REAL3(0,0,0),
                                             dimx, dimy, dimz, dimt,
                                             seeds[seedIdx],
                                             sphere_vertices,
                                             sphere_edges,
                                             num_edges,
                                             seedDirs);

    // One lane per row publishes the result.
    if (threadIdx.x == 0) {
        slineOutOff[seedIdx] = numDirs;
    }
}
|
|
510
|
+
|
|
511
|
+
/*
 * Kernel: generates the streamlines for every seed and every initial
 * direction found for it, tracking backward and forward from the seed and
 * merging both halves into one contiguous streamline.
 *
 * Launch layout: threadIdx.y selects the seed handled inside a block
 * (seed index = blockIdx.x*blockDim.y + threadIdx.y); the threadIdx.x lanes
 * of a row cooperate on that seed. Rows whose seed index is >= nseed exit
 * immediately.
 *
 * Inputs:
 *   slineOutOff : offsets per seed; the number of initial directions of seed
 *                 s is slineOutOff[s+1]-slineOutOff[s], so s+1 must be a
 *                 valid index.
 *   shDir0      : initial directions, read at [seed_index*samplm_nr + i].
 * Outputs (indexed by the running streamline offset):
 *   slineSeed   : seed index that produced the streamline.
 *   slineLen    : number of points (stepsB-1+stepsF, or 1 if PTT frame
 *                 initialisation failed).
 *   sline       : points; each streamline owns MAX_SLINE_LEN*2 entries.
 *
 * For MODEL_T == PTT a static shared buffer of 18 REAL_T per row holds the
 * frames (first 9 passed to the backward tracker, last 9 to the forward one).
 */
template<int BDIM_X,
         int BDIM_Y,
         ModelType MODEL_T,
         typename REAL_T,
         typename REAL3_T>
__global__ void genStreamlinesMergeProb_k(
                const REAL_T max_angle,
                const REAL_T tc_threshold,
                const REAL_T step_size,
                const REAL_T relative_peak_thres,
                const REAL_T min_separation_angle,
                const long long rndSeed,
                const int rndOffset,
                const int nseed,
                const REAL3_T *__restrict__ seeds,
                const int dimx,
                const int dimy,
                const int dimz,
                const int dimt,
                const REAL_T *__restrict__ dataf,
                const REAL_T *__restrict__ metric_map,
                const int samplm_nr,
                const REAL3_T *__restrict__ sphere_vertices,
                const int2 *__restrict__ sphere_edges,
                const int num_edges,
                const int *__restrict__ slineOutOff,
                REAL3_T *__restrict__ shDir0,
                int *__restrict__ slineSeed,
                int *__restrict__ slineLen,
                REAL3_T *__restrict__ sline) {

    const int tidx = threadIdx.x;
    const int tidy = threadIdx.y;

    const int slid = blockIdx.x*blockDim.y + threadIdx.y;

    // Participation mask covering the BDIM_X lanes of this row.
    const int lid = (tidy*BDIM_X + tidx) % 32;
    const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));

    __shared__ REAL_T frame_sh[((MODEL_T == PTT) ? BDIM_Y*18 : 1)]; // Only used by PTT, TODO: way to remove this in other cases
    REAL_T* __ptt_frame = frame_sh + tidy*18;
    // const int hr_side = dimt-1;

    // All lanes of a row share slid, so the whole lane group leaves together.
    if (slid >= nseed) {
        return;
    }

    // Initialise the generator only for rows that actually track.
    curandStatePhilox4_32_10_t st;
    // const int gbid = blockIdx.y*gridDim.x + blockIdx.x;
    const size_t gid = blockIdx.x * blockDim.y * blockDim.x + blockDim.x * threadIdx.y + threadIdx.x;
    //curand_init(rndSeed, slid+rndOffset, DIV_UP(hr_side, BDIM_X)*tidx, &st); // each thread uses DIV_UP(HR_SIDE/BDIM_X)
    curand_init(rndSeed, gid+1, 0, &st); // each thread uses DIV_UP(hr_side/BDIM_X)
                                         // elements of the same sequence

    REAL3_T seed = seeds[slid];

    int ndir = slineOutOff[slid+1]-slineOutOff[slid];
#if 0
    if (threadIdx.y == 0 && threadIdx.x == 0) {
        printf("%s: ndir: %d\n", __func__, ndir);
        for(int i = 0; i < ndir; i++) {
            const REAL3_T __d = shDir0[static_cast<size_t>(slid)*samplm_nr + i];
            printf("__shDir[%d][%d]: (%f, %f, %f)\n",
                    tidy, i, __d.x, __d.y, __d.z);
        }
    }
#endif
    __syncwarp(WMASK);

    int slineOff = slineOutOff[slid];

    for(int i = 0; i < ndir; i++) {
        // 64-bit indexing: the products below can exceed INT_MAX for large
        // batches of seeds/streamlines.
        REAL3_T first_step = shDir0[static_cast<size_t>(slid)*samplm_nr + i];

        REAL3_T *__restrict__ currSline = sline + static_cast<size_t>(slineOff)*MAX_SLINE_LEN*2;

        if (tidx == 0) {
            slineSeed[slineOff] = slid;
        }
#if 0
        if (threadIdx.y == 0 && threadIdx.x == 0) {
            printf("calling trackerF from: (%f, %f, %f)\n", first_step.x, first_step.y, first_step.z);
        }
#endif

        if constexpr (MODEL_T == PTT) {
            if (!init_frame_ptt_d<BDIM_X, BDIM_Y>(
                &st,
                dataf,
                max_angle,
                step_size,
                first_step,
                dimx, dimy, dimz, dimt,
                seed,
                sphere_vertices,
                __ptt_frame
            )) { // this fails rarely
                // Emit a single-point streamline holding just the seed.
                if (tidx == 0) {
                    slineLen[slineOff] = 1;
                    currSline[0] = seed;
                }
                __syncwarp(WMASK);
                slineOff += 1;
                continue;
            }
        }


        // Backward half: track from the seed along -first_step.
        int stepsB;
        const int tissue_classB = tracker_d<BDIM_X,
                                            BDIM_Y,
                                            MODEL_T>(&st,
                                                     max_angle,
                                                     tc_threshold,
                                                     step_size,
                                                     relative_peak_thres,
                                                     min_separation_angle,
                                                     seed,
                                                     MAKE_REAL3(-first_step.x, -first_step.y, -first_step.z),
                                                     __ptt_frame,
                                                     MAKE_REAL3(1, 1, 1),
                                                     dimx, dimy, dimz, dimt, dataf,
                                                     metric_map,
                                                     samplm_nr,
                                                     sphere_vertices,
                                                     sphere_edges,
                                                     num_edges,
                                                     &stepsB,
                                                     currSline);
        (void)tissue_classB; // only referenced by the disabled filters below
        //if (tidx == 0) {
        //    slineLenB[slineOff] = stepsB;
        //}

        // reverse backward sline
        // so that the seed ends up at index stepsB-1, where the forward half
        // starts.
        for(int j = 0; j < stepsB/2; j += BDIM_X) {
            if (j+tidx < stepsB/2) {
                const REAL3_T __p = currSline[j+tidx];
                currSline[j+tidx] = currSline[stepsB-1 - (j+tidx)];
                currSline[stepsB-1 - (j+tidx)] = __p;
            }
        }

        // Forward half: writes its first point (the seed) over index stepsB-1.
        int stepsF;
        const int tissue_classF = tracker_d<BDIM_X,
                                            BDIM_Y,
                                            MODEL_T>(&st,
                                                     max_angle,
                                                     tc_threshold,
                                                     step_size,
                                                     relative_peak_thres,
                                                     min_separation_angle,
                                                     seed,
                                                     first_step,
                                                     __ptt_frame + 9,
                                                     MAKE_REAL3(1, 1, 1),
                                                     dimx, dimy, dimz, dimt, dataf,
                                                     metric_map,
                                                     samplm_nr,
                                                     sphere_vertices,
                                                     sphere_edges,
                                                     num_edges,
                                                     &stepsF,
                                                     currSline + stepsB-1);
        (void)tissue_classF; // only referenced by the disabled filters below
        if (tidx == 0) {
            // The seed is shared by both halves, hence the -1.
            slineLen[slineOff] = stepsB-1+stepsF;
        }

        slineOff += 1;
#if 0
        if (threadIdx.y == 0 && threadIdx.x == 0) {
            printf("%s: stepsF: %d, tissue_classF: %d\n", __func__, stepsF, tissue_classF);
        }
        __syncwarp(WMASK);
#endif
        //if (/* !return_all || */0 &&
        //    tissue_classF != ENDPOINT &&
        //    tissue_classF != OUTSIDEIMAGE) {
        //    continue;
        //}
        //if (/* !return_all || */ 0 &&
        //    tissue_classB != ENDPOINT &&
        //    tissue_classB != OUTSIDEIMAGE) {
        //    continue;
        //}
    }
    return;
}
|