httomolibgpu 2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- httomolibgpu/__init__.py +14 -0
- httomolibgpu/cuda_kernels/__init__.py +29 -0
- httomolibgpu/cuda_kernels/calc_metrics.cu +334 -0
- httomolibgpu/cuda_kernels/center_360_shifts.cu +49 -0
- httomolibgpu/cuda_kernels/downsample_sino.cu +36 -0
- httomolibgpu/cuda_kernels/generate_mask.cu +51 -0
- httomolibgpu/cuda_kernels/median_kernel.cu +54 -0
- httomolibgpu/cuda_kernels/paganin_filter_gen.cu +37 -0
- httomolibgpu/cupywrapper.py +19 -0
- httomolibgpu/misc/__init__.py +0 -0
- httomolibgpu/misc/corr.py +161 -0
- httomolibgpu/misc/morph.py +240 -0
- httomolibgpu/misc/rescale.py +160 -0
- httomolibgpu/prep/__init__.py +0 -0
- httomolibgpu/prep/alignment.py +218 -0
- httomolibgpu/prep/normalize.py +156 -0
- httomolibgpu/prep/phase.py +436 -0
- httomolibgpu/prep/stripe.py +417 -0
- httomolibgpu/recon/__init__.py +0 -0
- httomolibgpu/recon/algorithm.py +425 -0
- httomolibgpu/recon/rotation.py +793 -0
- httomolibgpu-2.1.dist-info/LICENSE +902 -0
- httomolibgpu-2.1.dist-info/METADATA +73 -0
- httomolibgpu-2.1.dist-info/RECORD +26 -0
- httomolibgpu-2.1.dist-info/WHEEL +5 -0
- httomolibgpu-2.1.dist-info/top_level.txt +1 -0
httomolibgpu/__init__.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from httomolibgpu.misc.corr import median_filter, remove_outlier
|
|
2
|
+
from httomolibgpu.misc.morph import sino_360_to_180, data_resampler
|
|
3
|
+
from httomolibgpu.misc.rescale import rescale_to_int
|
|
4
|
+
from httomolibgpu.prep.alignment import distortion_correction_proj_discorpy
|
|
5
|
+
from httomolibgpu.prep.normalize import normalize
|
|
6
|
+
from httomolibgpu.prep.phase import paganin_filter_savu, paganin_filter_tomopy
|
|
7
|
+
from httomolibgpu.prep.stripe import (
|
|
8
|
+
remove_stripe_based_sorting,
|
|
9
|
+
remove_stripe_ti,
|
|
10
|
+
remove_all_stripe,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
from httomolibgpu.recon.algorithm import FBP, LPRec, SIRT, CGLS
|
|
14
|
+
from httomolibgpu.recon.rotation import find_center_vo, find_center_360, find_center_pc
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import List, Optional, Tuple
|
|
3
|
+
|
|
4
|
+
from httomolibgpu import cupywrapper
|
|
5
|
+
|
|
6
|
+
cp = cupywrapper.cp
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def load_cuda_module(
    file: str,
    name_expressions: Optional[List[str]] = None,
    options: Tuple[str, ...] = tuple(),
) -> cp.RawModule:
    """Load a CUDA module file, i.e. a .cu file, from the file system,
    compile it, and return it as a CuPy RawModule for further
    processing.

    Parameters
    ----------
    file : str
        Name of the ``.cu`` file (without the extension), resolved
        relative to this package's directory.
    name_expressions : Optional[List[str]]
        C++ name expressions (e.g. template instantiations) to make
        retrievable via ``RawModule.get_function``.
    options : Tuple[str, ...]
        Extra NVRTC compiler options, appended after ``-std=c++11``.

    Returns
    -------
    cp.RawModule
        The compiled CUDA module.
    """
    # Resolve the kernel source relative to this package directory, not
    # the current working directory.  (Fixed: the original shadowed the
    # builtin `dir` and reassigned the `file` parameter.)
    module_dir = os.path.dirname(os.path.abspath(__file__))
    path = os.path.join(module_dir, file + ".cu")
    # insert a preprocessor line directive to assist compiler errors (so
    # line numbers show correctly in output); backslashes must be escaped
    # for the string literal inside the directive (Windows paths)
    escaped = path.replace("\\", "\\\\")
    code = '#line 1 "{}"\n'.format(escaped)
    with open(path, "r") as f:
        code += f.read()

    return cp.RawModule(
        options=("-std=c++11", *options), code=code, name_expressions=name_expressions
    )
|
|
@@ -0,0 +1,334 @@
|
|
|
1
|
+
/*********************************************************************
|
|
2
|
+
* Calculate correlation-based metrics for the find_center_360 method
|
|
3
|
+
*********************************************************************
|
|
4
|
+
*
|
|
5
|
+
* The core of the find_center_360 method is calculating correlation coefficients
|
|
6
|
+
* between 2 or 3 shifted matrices, over many shifting positions.
|
|
7
|
+
*
|
|
8
|
+
* This file has cuda kernels for this purpose, which provide speedups of > 300x
|
|
9
|
+
* compared to using a straight numpy to cupy port.
|
|
10
|
+
*
|
|
11
|
+
* The key is the formula to calculate the Peason correlation coefficient.
|
|
12
|
+
* This is calculated manually for every shifted matrix position in the same kernel.
|
|
13
|
+
*
|
|
14
|
+
* The correlation coefficient between two vectors (we flatten the matrices) is:
|
|
15
|
+
*
|
|
16
|
+
* m1_norm = m1 - mean(m1)
|
|
17
|
+
* m2_norm = m2 - mean(m2)
|
|
18
|
+
* m1_sqr = dot(m1_norm, m1_norm)
|
|
19
|
+
* m2_sqr = dot(m2_norm, m2_norm)
|
|
20
|
+
* m1_m2 = dot(m1_norm, m2_norm)
|
|
21
|
+
* r = m1_m2 / sqrt(m1_sqr * m2_sqr)
|
|
22
|
+
*
|
|
23
|
+
* The kernels in the following compute these directly pretty much, taking into
|
|
24
|
+
* consideration normalisation, overlaps, and position offsets. Also note that the
|
|
25
|
+
* version with overlap requries 3 correlation coefficients (between 3 matrices).
|
|
26
|
+
*/
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
/** Function to perform a binary sum reduction for N-dimensional array storage.
|
|
30
|
+
* Note that the shared_mem pointer must have space for N * BLOCK_DIM elements.
|
|
31
|
+
*/
|
|
32
|
+
/* Performs N simultaneous block-wide sum reductions in shared memory.
 * On entry each thread holds N partial sums in v[]; on exit every
 * thread's v[i] contains the block-wide total of the i-th quantity.
 * shared_mem must provide at least N * BLOCK_DIM floats (see the
 * comment above). BLOCK_DIM must equal blockDim.x -- TODO confirm the
 * host always launches with 128 threads per block. */
template <int N, int BLOCK_DIM=128>
__device__ inline
void sum_reduction_n(float* shared_mem, float v[N]) {
    int tid = threadIdx.x;

    // Partition shared memory into N contiguous BLOCK_DIM-sized lanes,
    // one per reduced quantity, and stage this thread's partials.
    float *smem[N];
    #pragma unroll
    for (int i = 0; i < N; ++i) {
        smem[i] = shared_mem + i * BLOCK_DIM;
        smem[i][tid] = v[i];
    }

    __syncthreads();
    int nt = BLOCK_DIM;
    int c = nt;
    // Mirrored pairwise reduction: on each pass the first half of the
    // active range folds in the element mirrored from the end
    // (index c - tid - 1), then the active range shrinks to
    // ceil(c / 2).  Terminates with the totals in element 0.
    while (c > 1)
    {
        int half = c / 2;
        if (tid < half)
        {
            #pragma unroll
            for (int i = 0; i < N; ++i) {
                smem[i][tid] += smem[i][c - tid - 1];
            }
        }
        // All threads must sync here, including non-participating ones.
        __syncthreads();
        c = c - half;
    }

    // write back the block-wide totals to every thread's v[]
    #pragma unroll
    for (int i = 0; i < N; ++i) {
        v[i] = smem[i][0];
    }
}
|
|
67
|
+
|
|
68
|
+
/** Clamp x into the closed interval [min, max].
 *
 * Fixed defect: the original ignored the min/max arguments and always
 * clamped to [-1, 1].  Every visible call site passes (-1.0f, 1.0f),
 * so honouring the parameters is behaviour-identical for existing
 * callers while making the helper correct for any range.
 */
inline __device__
float clip(float x, float min, float max) {
    x = x < min ? min : x;
    x = x > max ? max : x;
    return x;
}
|
|
74
|
+
|
|
75
|
+
/** Return the sum of absolute values over the first win_width
 * elements of a matrix row (used as a per-row normalisation factor). */
__device__ inline
float sum_abs_row(const float* row, int win_width)
{
    float total = 0.0f;
    for (int col = 0; col < win_width; ++col)
        total += abs(row[col]);
    return total;
}
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
/** Compute function without overlap, where correlation metrics with 2 matrices are calculated.
 *
 * For each shift position (one per blockIdx.y) this computes the Pearson
 * correlation coefficient r between the shifted window of mat1 and the
 * fixed window of mat2, and stores the metric |1 - r| in list_metric[i].
 *
 * NOTE(review): launch assumptions inferred from the indexing -- one
 * block per shift position, blockDim.x threads striding over rows, and
 * external shared memory of at least 3 * blockDim.x floats (needed by
 * sum_reduction_n<3>).  Confirm against the host-side launch code.
 */
template <bool norm>
__device__ void _calc_metrics_no_overlap(const float *mat1, int mat1_nx,
                                         const float *mat2, int mat2_nx,
                                         int win_width, int rows, int side,
                                         float *list_metric)
{
    // rows of the matrix
    const int tid = threadIdx.x;
    // position in list_pos
    const int i = blockIdx.y;
    const int npos = gridDim.y;  // (unused -- kept for symmetry / debugging)

    const int pos = win_width / 2 + i;

    // offset matrices for position: mat2's window is taken from its left
    // or right edge depending on `side`; mat1's window slides with `pos`
    const float* mat2_roi = side == 1 ? mat2 : mat2 + mat2_nx - win_width;
    const float* mat1_roi = mat1 + (pos - win_width / 2);

    extern __shared__ float smem[];

    // we store our data for reductions here
    float v[3];

    ////////////////////////
    // 1. We need the mean of the 2 matrices (flattend)
    v[0] = 0.0f;
    v[1] = 0.0f;

    // grid-of-threads stride over rows; each thread accumulates partial sums
    for (int y = tid; y < rows; y += blockDim.x)
    {
        float norm_factor = 1.0f;
        if (norm) {
            // per-row intensity normalisation: scale mat1's row so its
            // absolute sum matches mat2's (compile-time branch via template)
            norm_factor = sum_abs_row(&mat2_roi[y * mat2_nx], win_width) /
                          sum_abs_row(&mat1_roi[y * mat1_nx], win_width);
        }
        for (int x = 0; x < win_width; ++x)
        {
            v[0] += mat1_roi[y * mat1_nx + x] * norm_factor;
            v[1] += mat2_roi[y * mat2_nx + x];
        }
    }

    // now reduce them to calc the mean
    sum_reduction_n<2>(smem, v);

    float mean_mat1 = v[0] / rows / win_width;
    float mean_mat2 = v[1] / rows / win_width;

    ///////////////////////////////////
    // 2. Calculate the sum of the dot and cross-products for the 3 matrices:
    v[0] = 0.0f; // dot(mat1, mat1)
    v[1] = 0.0f; // dot(mat2, mat2)
    v[2] = 0.0f; // dot(mat1, mat2)

    // second pass: centred products (values re-read and re-normalised,
    // so the norm_factor computation is repeated identically)
    for (int y = tid; y < rows; y += blockDim.x)
    {
        float norm_factor = 1.0f;
        if (norm) {
            norm_factor = sum_abs_row(&mat2_roi[y * mat2_nx], win_width) /
                          sum_abs_row(&mat1_roi[y * mat1_nx], win_width);
        }
        for (int x = 0; x < win_width; ++x)
        {
            float mat1_roi_val = mat1_roi[y * mat1_nx + x] * norm_factor;
            float mat2_roi_val = mat2_roi[y * mat2_nx + x];
            mat1_roi_val -= mean_mat1;
            mat2_roi_val -= mean_mat2;
            v[0] += mat1_roi_val * mat1_roi_val;
            v[1] += mat2_roi_val * mat2_roi_val;
            v[2] += mat1_roi_val * mat2_roi_val;
        }
    }

    // now reduce them to calc the mean
    sum_reduction_n<3>(smem, v);

    ////////////////////////////////
    // 3. Calculate the correlation coefficients from the covariance values
    // (single thread writes the final metric for this shift position)
    if (tid == 0)
    {
        // sample (co)variances: divide by (n - 1); the common factor
        // cancels in r, but matches the CPU reference implementation
        float mat1_mat1 = v[0] / (rows * win_width - 1);
        float mat2_mat2 = v[1] / (rows * win_width - 1);
        float mat1_mat2 = v[2] / (rows * win_width - 1);
        // now calculate the Pearson correlation coefficient
        float r = mat1_mat2 / sqrt(mat1_mat1 * mat2_mat2);
        r = clip(r, -1.0f, 1.0f);
        // metric: 0 means perfect correlation
        float metric = abs(1.0f - r);
        list_metric[i] = metric;
    }
}
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
/** Compute function with overlap, where correlation metrics with 3 matrices are calculated.
 *
 * Like _calc_metrics_no_overlap, but additionally forms a third,
 * "combined" matrix by linearly blending mat1 and mat2 with a ramp
 * across the window (direction depends on `side`), and averages the
 * three pairwise |1 - r| metrics into list_metric[i].
 *
 * NOTE(review): launch assumptions inferred from the indexing -- one
 * block per shift position, blockDim.x threads striding over rows, and
 * external shared memory of at least 6 * blockDim.x floats (needed by
 * sum_reduction_n<6>).  Confirm against the host-side launch code.
 */
template <bool norm>
__device__ void _calc_metrics_overlap(const float *mat1, int mat1_nx,
                                      const float *mat2, int mat2_nx,
                                      int win_width, int rows, int side,
                                      float *list_metric)
{
    // rows of the matrix
    const int tid = threadIdx.x;
    // position in list_pos
    const int i = blockIdx.y;
    const int npos = gridDim.y;  // (unused -- kept for symmetry / debugging)

    const int pos = win_width / 2 + i;

    // offset matrices for position
    const float* mat2_roi = side == 1 ? mat2 : mat2 + mat2_nx - win_width;
    const float* mat1_roi = mat1 + (pos - win_width / 2);

    extern __shared__ float smem[];

    // we need space for 6 sum reductions for calculating the correlation coefficient
    float v[6];

    // ramp step across the window: blend weight goes 1 -> 0 (or 0 -> 1)
    float d_ramp = 1.0f / (win_width - 1);

    ////////////////////////
    // 1. We need the mean of the 3 matrices (flattend)
    v[0] = 0.0f;
    v[1] = 0.0f;
    v[2] = 0.0f;
    for (int y = tid; y < rows; y += blockDim.x)
    {
        float norm_factor = 1.0f;
        if (norm) {
            // per-row intensity normalisation, as in the no-overlap variant
            norm_factor = sum_abs_row(&mat2_roi[y * mat2_nx], win_width) /
                          sum_abs_row(&mat1_roi[y * mat1_nx], win_width);
        }
        for (int x = 0; x < win_width; ++x)
        {
            float ramp_down = 1.0f - (x * d_ramp);
            float ramp_up = 1.0f - ramp_down;
            float mat1_roi_val = mat1_roi[y * mat1_nx + x] * norm_factor;
            float mat2_roi_val = mat2_roi[y * mat2_nx + x];
            // blended ("combined") value; ramp direction flips with `side`
            float mat_comb_val = side == 1 ?
                (mat1_roi_val * ramp_down + mat2_roi_val * ramp_up) :
                (mat1_roi_val * ramp_up + mat2_roi_val * ramp_down);

            v[0] += mat1_roi_val;
            v[1] += mat2_roi_val;
            v[2] += mat_comb_val;
        }
    }

    sum_reduction_n<3>(smem, v);

    float mean_mat1 = v[0] / rows / win_width;
    float mean_mat2 = v[1] / rows / win_width;
    float mean_mat3 = v[2] / rows / win_width;

    ///////////////////////////////////
    // 2. Calculate the sum of the dot and cross-products for the 3 matrices:
    v[0] = 0.0f; // dot(mat1, mat1)
    v[1] = 0.0f; // dot(mat2, mat2)
    v[2] = 0.0f; // dot(mat_comb, mat_comb)
    v[3] = 0.0f; // dot(mat1, mat2)
    v[4] = 0.0f; // dot(mat1, mat_comb)
    v[5] = 0.0f; // dot(mat2, mat_comb)

    for (int y = tid; y < rows; y += blockDim.x)
    {
        float norm_factor = 1.0f;
        if (norm) {
            norm_factor = sum_abs_row(&mat2_roi[y * mat2_nx], win_width) /
                          sum_abs_row(&mat1_roi[y * mat1_nx], win_width);
        }
        for (int x = 0; x < win_width; ++x)
        {
            float ramp_down = 1.0f - (x * d_ramp);
            float ramp_up = 1.0f - ramp_down;
            float mat1_roi_val = mat1_roi[y * mat1_nx + x] * norm_factor;
            float mat2_roi_val = mat2_roi[y * mat2_nx + x];
            float mat_comb_val = side == 1 ?
                (mat1_roi_val * ramp_down + mat2_roi_val * ramp_up) :
                (mat1_roi_val * ramp_up + mat2_roi_val * ramp_down);

            // for covariance matrix, we need to remove the mean first
            mat1_roi_val -= mean_mat1;
            mat2_roi_val -= mean_mat2;
            mat_comb_val -= mean_mat3;

            // now sum the products
            v[0] += mat1_roi_val * mat1_roi_val;
            v[1] += mat2_roi_val * mat2_roi_val;
            v[2] += mat_comb_val * mat_comb_val;
            v[3] += mat1_roi_val * mat2_roi_val;
            v[4] += mat1_roi_val * mat_comb_val;
            v[5] += mat2_roi_val * mat_comb_val;
        }

    }

    // 6 smem reductions
    sum_reduction_n<6>(smem, v);

    ////////////////////////////////
    // 3. Calculate the correlation coefficients from the covariance values
    if (tid == 0)
    {
        // sample (co)variances; the common (n - 1) factor cancels in r
        float mat1_mat1 = v[0] / (rows * win_width - 1);
        float mat2_mat2 = v[1] / (rows * win_width - 1);
        float mat3_mat3 = v[2] / (rows * win_width - 1);
        float mat1_mat2 = v[3] / (rows * win_width - 1);
        float mat1_mat3 = v[4] / (rows * win_width - 1);
        float mat2_mat3 = v[5] / (rows * win_width - 1);
        // normalise to get correlation coefficients
        float r12 = mat1_mat2 / sqrt(mat1_mat1 * mat2_mat2);
        float r13 = mat1_mat3 / sqrt(mat1_mat1 * mat3_mat3);
        float r23 = mat2_mat3 / sqrt(mat2_mat2 * mat3_mat3);
        // clip
        r12 = clip(r12, -1.0f, 1.0f);
        r13 = clip(r13, -1.0f, 1.0f);
        r23 = clip(r23, -1.0f, 1.0f);
        // metric
        float metric_1 = abs(1.0f - r12);
        float metric_2 = abs(1.0f - r23);
        float metric_3 = abs(1.0f - r13);
        // average and output
        list_metric[i] = (metric_1 + metric_2 + metric_3) / 3.0f;
    }
}
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
/** Main entry point - it calls one of the two variants above.
|
|
319
|
+
*
|
|
320
|
+
* We use a template here, so that one of the two branches gets completely eliminated by
|
|
321
|
+
* the compiler (rather than at runtime), which reduces the register count.
|
|
322
|
+
*/
|
|
323
|
+
template <bool norm, bool use_overlap>
|
|
324
|
+
__global__ void calc_metrics_kernel(const float *mat1, int mat1_nx,
|
|
325
|
+
const float *mat2, int mat2_nx,
|
|
326
|
+
int win_width, int rows, int side,
|
|
327
|
+
float *list_metric)
|
|
328
|
+
{
|
|
329
|
+
if (use_overlap) {
|
|
330
|
+
_calc_metrics_overlap<norm>(mat1, mat1_nx, mat2, mat2_nx, win_width, rows, side, list_metric);
|
|
331
|
+
} else {
|
|
332
|
+
_calc_metrics_no_overlap<norm>(mat1, mat1_nx, mat2, mat2_nx, win_width, rows, side, list_metric);
|
|
333
|
+
}
|
|
334
|
+
}
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
#include <cupy/complex.cuh>
|
|
2
|
+
|
|
3
|
+
/* Fill the 3D stack `mat` (one slice per candidate shift) with sinogram
 * data rolled horizontally by list_shift[zid].
 *
 * Integer shifts are applied fully here by direct indexing into sino2;
 * for fractional shifts only the wrapped-in margin is filled from sino3
 * and the sub-pixel interpolation is left to a later (scipy) step, as
 * noted below.
 *
 * Launch layout (inferred from the indexing): x covers columns (guarded),
 * blockIdx.y covers rows, blockIdx.z covers shift candidates.
 * nymat is the row stride of a slice in `mat`; may differ from gridDim.y.
 */
extern "C" __global__ void
shift_whole_shifts(const float *sino2, const float *sino3,
                   const float *__restrict__ list_shift, float *mat, int nx,
                   int nymat) {
  int xid = threadIdx.x + blockIdx.x * blockDim.x;
  int yid = blockIdx.y;
  int zid = blockIdx.z;
  int ny = gridDim.y;

  // guard: grid may be padded beyond the number of columns
  if (xid >= nx)
    return;

  // split this slice's shift into integer and fractional parts
  float shift_col = list_shift[zid];
  float int_part = 0.0;
  float frac_part = modf(shift_col, &int_part);
  if (abs(frac_part) > 1e-5f) {
    // we have a floating point shift, so we only roll in
    // sino3, but we leave the rest for later using scipy
    // round away from zero so the filled margin covers the full shift
    int shift_int =
        shift_col >= 0.0 ? int(ceil(shift_col)) : int(floor(shift_col));
    // positive shift: fill the left margin that the roll wraps over
    if (shift_int >= 0 && xid < shift_int) {
      mat[zid * nymat * nx + yid * nx + xid] = sino3[yid * nx + xid];
    }
    // negative shift: fill the right margin
    if (shift_int < 0 && xid >= nx + shift_int) {
      mat[zid * nymat * nx + yid * nx + xid] = sino3[yid * nx + xid];
    }
  } else {
    // we have an integer shift, so we can roll in directly
    // by indexing
    int shift_int = int(shift_col);
    if (shift_int >= 0) {
      // columns past the shift take sino2 moved right by shift_int ...
      if (xid >= shift_int) {
        mat[zid * nymat * nx + yid * nx + xid] =
            sino2[yid * nx + xid - shift_int];
      } else {
        // ... the uncovered left margin is taken from sino3
        mat[zid * nymat * nx + yid * nx + xid] = sino3[yid * nx + xid];
      }
    } else {
      // negative shift: sino2 moves left; right margin comes from sino3
      if (xid < nx + shift_int) {
        mat[zid * nymat * nx + yid * nx + xid] =
            sino2[yid * nx + xid - shift_int];
      } else {
        mat[zid * nymat * nx + yid * nx + xid] = sino3[yid * nx + xid];
      }
    }
  }
}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
/* Downsample a sinogram horizontally by averaging groups of 2^level
 * adjacent columns.
 *
 * NOTE(review): the shared-memory indexing (threadIdx.y * 8 + ...) and
 * the final 4-element sum hard-code blockDim.x == 8 and a bin size of 4
 * (i.e. level == 2).  For any other level the "i % 4" grouping and the
 * 4-term sum would not match `binsize` -- confirm against the host-side
 * launch configuration before reusing with other levels.
 */
extern "C" __global__ void downsample_sino(float *sino, int dx, int dz,
                                           int level, float *out) {
  // use shared memory to store the values used to "merge" columns of the
  // sinogram in the downsampling process
  extern __shared__ float downsampled_vals[];
  unsigned int binsize, i, j, k, orig_ind, out_ind, output_bin_no;
  i = blockDim.x * blockIdx.x + threadIdx.x;   // column index
  j = 0;                                       // (unused)
  k = blockDim.y * blockIdx.y + threadIdx.y;   // row (angle) index
  orig_ind = (k * dz) + i;
  binsize = 1 << level;                        // columns merged per output bin
  // dz / binsize and i / binsize via float intrinsics (round down)
  unsigned int dz_downsampled =
      __float2uint_rd(fdividef(__uint2float_rd(dz), __uint2float_rd(binsize)));
  unsigned int i_downsampled =
      __float2uint_rd(fdividef(__uint2float_rd(i), __uint2float_rd(binsize)));
  if (orig_ind < dx * dz) {
    output_bin_no =
        __float2uint_rd(fdividef(__uint2float_rd(i), __uint2float_rd(binsize)));  // (unused)
    out_ind = (k * dz_downsampled) + i_downsampled;
    // stage this thread's pre-scaled value (divide now so the later sum
    // is already the mean of the bin)
    downsampled_vals[threadIdx.y * 8 + threadIdx.x] =
        sino[orig_ind] / __uint2float_rd(binsize);
    // synchronise threads within thread-block so that it's guaranteed
    // that all the required values have been copied into shared memeory
    // to then sum and save in the downsampled output
    __syncthreads();
    // arbitrarily use the "beginning thread" in each "lot" of pixels
    // for downsampling to then save the desired value in the
    // downsampled output array
    if (i % 4 == 0) {
      out[out_ind] = downsampled_vals[threadIdx.y * 8 + threadIdx.x] +
                     downsampled_vals[threadIdx.y * 8 + threadIdx.x + 1] +
                     downsampled_vals[threadIdx.y * 8 + threadIdx.x + 2] +
                     downsampled_vals[threadIdx.y * 8 + threadIdx.x + 3];
    }
  }
}
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
/* Build a binary wedge mask over the right half of a 2D Fourier domain.
 *
 * Only ncol/2+1 output columns are produced because the spectrum comes
 * from a real-to-complex FFT (see comment below); `i` is remapped into
 * full-width coordinates to evaluate the wedge condition.  Rows within
 * `drop` of the centre row and the three columns around the centre
 * column are zeroed, mirroring the commented numpy slicing.
 */
extern "C" __global__ void generate_mask(const int ncol, const int nrow,
                                         const int cen_col, const int cen_row,
                                         const float du, const float dv,
                                         const float radius, const float drop,
                                         unsigned short *mask) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  int j = blockIdx.y;   // one block-row per mask row

  // guard: output has only ncol/2+1 columns
  if (i >= ncol/2+1)
    return;

  // we only need to look at the right half as we're using a real2complex FFT
  int outi = i;                 // column index in the half-width output
  i += ncol/2-1;                // full-width column for the wedge test

  // half-width of the wedge at this row, in column units (rounded up)
  int pos = __float2int_ru(((j - cen_row) * dv / radius) / du);
  int pos1 = -pos + cen_col;
  int pos2 = pos + cen_col;

  // order the interval endpoints and clamp them into [0, ncol-1]
  if (pos1 > pos2) {
    int temp = pos1;
    pos1 = pos2;
    pos2 = temp;
    if (pos1 >= ncol) {
      pos1 = ncol - 1;
    }
    if (pos2 < 0) {
      pos2 = 0;
    }
  } else {
    if (pos1 < 0) {
      pos1 = 0;
    }
    if (pos2 >= ncol) {
      pos2 = ncol - 1;
    }
  }

  // inside the wedge -> 1, outside -> 0
  short outval = (pos1 <= i && i <= pos2) ? 1 : 0;

  // mask[cen_row - drop: cen_row + drop + 1, :] = 0
  if (j >= cen_row - drop && j <= cen_row + drop) {
    outval = 0;
  }
  // mask[:, cen_col - 1: cen_col + 2] = 0
  if (i >= cen_col - 1 && i <= cen_col + 1) {
    outval = 0;
  }

  mask[j * (ncol/2+1) + outi] = outval;
}
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
/* 3D median filter (optionally dezingering) over a cubic neighbourhood
 * of `diameter` voxels per axis.
 *
 * One thread per output voxel; out-of-bounds neighbours are replaced by
 * the centre voxel's own coordinate (edge handling), so the window
 * always contains diameter^3 values.  When dif > 0 the centre value is
 * only replaced if it deviates from the median by at least `dif`
 * (dezingering); otherwise a plain median filter is applied.
 */
template <typename Type, int diameter>
__global__ void median_general_kernel3d(const Type *in, Type *out, float dif,
                                        int Z, int M, int N) {
  constexpr int radius = diameter / 2;
  constexpr int d3 = diameter * diameter * diameter;
  constexpr int midpoint = d3 / 2;   // index of the median after sorting

  Type ValVec[d3];   // per-thread window buffer (lives in registers/local mem)
  const long i = blockDim.x * blockIdx.x + threadIdx.x;   // fastest axis (N)
  const long j = blockDim.y * blockIdx.y + threadIdx.y;   // middle axis (M)
  const long k = blockDim.z * blockIdx.z + threadIdx.z;   // slowest axis (Z)

  if (i >= N || j >= M || k >= Z)
    return;

  long long index = static_cast<long long>(i) + N * static_cast<long long>(j) + N * M * static_cast<long long>(k);

  // gather the diameter^3 neighbourhood, clamping each axis to the
  // centre coordinate at the volume boundaries
  int counter = 0;
  for (int i_m = -radius; i_m <= radius; i_m++) {
    long long i1 = i + i_m; // using long long to avoid integer overflows
    if ((i1 < 0) || (i1 >= N))
      i1 = i;
    for (int j_m = -radius; j_m <= radius; j_m++) {
      long long j1 = j + j_m;
      if ((j1 < 0) || (j1 >= M))
        j1 = j;
      for (int k_m = -radius; k_m <= radius; k_m++) {
        long long k1 = k + k_m;
        if ((k1 < 0) || (k1 >= Z))
          k1 = k;
        ValVec[counter] = in[i1 + N * j1 + N * M * k1];
        counter++;
      }
    }
  }

  /* do bubble sort here (d3 is small and fixed, so the O(d3^2) sort is
     acceptable and branch-predictable) */
  for (int x = 0; x < d3 - 1; x++) {
    for (int y = 0; y < d3 - x - 1; y++) {
      if (ValVec[y] > ValVec[y + 1]) {
        Type temp = ValVec[y];
        ValVec[y] = ValVec[y + 1];
        ValVec[y + 1] = temp;
      }
    }
  }

  if (dif > 0.0f) {
    /* perform dezingering: replace only clear outliers */
    out[index] =
        fabsf(in[index] - ValVec[midpoint]) >= dif ? ValVec[midpoint] : in[index];
  }
  else out[index] = ValVec[midpoint]; /* median filtering */
}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
#include <cupy/complex.cuh>
|
|
2
|
+
|
|
3
|
+
#ifndef M_PI
|
|
4
|
+
#define M_PI 3.1415926535897932384626433832795f
|
|
5
|
+
#endif
|
|
6
|
+
|
|
7
|
+
/* Generate the complex frequency-domain Paganin phase-retrieval filter.
 *
 * One thread per frequency sample; each computes its (centred) spatial
 * frequency, evaluates the filter denominator, and writes the value at
 * the ifftshifted position so the output can be applied to an
 * unshifted FFT directly.
 */
extern "C" __global__ void
paganin_filter_gen(int width1, int height1, float resolution, float wavelength,
                   float distance, float ratio, complex<float> *filtercomplex) {
  int px = threadIdx.x + blockIdx.x * blockDim.x;
  int py = threadIdx.y + blockIdx.y * blockDim.y;
  // guards: grid may be padded beyond the filter dimensions
  if (px >= width1)
    return;
  if (py >= height1)
    return;

  // frequency-step per pixel in each direction
  float dpx = 1.0f / (width1 * resolution);
  float dpy = 1.0f / (height1 * resolution);
  // index of the zero-frequency sample in the centred spectrum
  int centerx = (width1 + 1) / 2 - 1;
  int centery = (height1 + 1) / 2 - 1;

  // centred spatial frequencies for this sample
  float pxx = (px - centerx) * dpx;
  float pyy = (py - centery) * dpy;
  // pi * lambda * distance * |freq|^2 -- the propagation term
  float pd = (pxx * pxx + pyy * pyy) * wavelength * distance * M_PI;
  ;
  float filter1 = 1.0f + ratio * pd;

  // NOTE(review): both real and imaginary parts of the denominator are
  // set to filter1; a purely real denominator (imag = 0) would be the
  // more common form of the Paganin filter -- confirm this is intended.
  complex<float> value = 1.0f / complex<float>(filter1, filter1);

  // ifftshifting positions
  int xshift = (width1 + 1) / 2;
  int yshift = (height1 + 1) / 2;
  int outX = (px + xshift) % width1;
  int outY = (py + yshift) % height1;

  filtercomplex[outY * width1 + outX] = value;
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# Central switch deciding whether the library runs with CuPy (GPU) or
# falls back to numpy / mocks (CPU-only).  Other modules obtain `cp` and
# `nvtx` from this module rather than importing cupy directly, so the
# fallback is applied uniformly.
#
# cupy_run is True only when cupy imports AND a CUDA device is actually
# reachable.
cupy_run = False
try:
    import cupy as cp
    import nvtx

    try:
        # Probe the first GPU; raises CUDARuntimeError when cupy is
        # installed but no usable CUDA driver/device is present.
        cp.cuda.Device(0).compute_capability
        cupy_run = True
    except cp.cuda.runtime.CUDARuntimeError:
        print("CuPy library is a major dependency for HTTomolibgpu, please install")
        # Rebind cp to numpy so array code keeps working on the CPU.
        import numpy as cp
except ImportError as e:
    # cupy (or nvtx) not installed at all: fall back to numpy and stub
    # out nvtx so decorator/annotation calls become no-ops.
    print(
        f"Failed to import module in {__file__} with error: {e}; defaulting to CPU-only mode"
    )
    from unittest.mock import Mock
    import numpy as cp

    nvtx = Mock()
|
|
File without changes
|