httomolibgpu 2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,14 @@
1
+ from httomolibgpu.misc.corr import median_filter, remove_outlier
2
+ from httomolibgpu.misc.morph import sino_360_to_180, data_resampler
3
+ from httomolibgpu.misc.rescale import rescale_to_int
4
+ from httomolibgpu.prep.alignment import distortion_correction_proj_discorpy
5
+ from httomolibgpu.prep.normalize import normalize
6
+ from httomolibgpu.prep.phase import paganin_filter_savu, paganin_filter_tomopy
7
+ from httomolibgpu.prep.stripe import (
8
+ remove_stripe_based_sorting,
9
+ remove_stripe_ti,
10
+ remove_all_stripe,
11
+ )
12
+
13
+ from httomolibgpu.recon.algorithm import FBP, LPRec, SIRT, CGLS
14
+ from httomolibgpu.recon.rotation import find_center_vo, find_center_360, find_center_pc
@@ -0,0 +1,29 @@
1
+ import os
2
+ from typing import List, Optional, Tuple
3
+
4
+ from httomolibgpu import cupywrapper
5
+
6
+ cp = cupywrapper.cp
7
+
8
+
9
def load_cuda_module(
    file: str,
    name_expressions: Optional[List[str]] = None,
    options: Tuple[str, ...] = tuple(),
) -> cp.RawModule:
    """Load a CUDA module file, i.e. a .cu file, from the file system,
    compile it, and return it as a CuPy RawModule for further processing.

    Parameters
    ----------
    file : str
        Name of the ``.cu`` file (without the extension), resolved relative
        to this module's directory.
    name_expressions : Optional[List[str]]
        Name expressions (e.g. template instantiations) to make retrievable
        from the compiled module via ``get_function``.
    options : Tuple[str, ...]
        Additional compiler options; ``-std=c++11`` is always prepended.

    Returns
    -------
    cp.RawModule
        The compiled CUDA module.
    """
    # renamed from `dir`/reused `file` to avoid shadowing the builtin and
    # mutating the parameter
    module_dir = os.path.dirname(os.path.abspath(__file__))
    cu_path = os.path.join(module_dir, file + ".cu")
    # insert a preprocessor line directive to assist compiler errors (so line
    # numbers show correctly in output); backslashes (Windows paths) must be
    # escaped inside the quoted #line directive
    escaped = cu_path.replace("\\", "\\\\")
    code = '#line 1 "{}"\n'.format(escaped)
    with open(cu_path, "r") as f:
        code += f.read()

    return cp.RawModule(
        options=("-std=c++11", *options), code=code, name_expressions=name_expressions
    )
@@ -0,0 +1,334 @@
1
+ /*********************************************************************
2
+ * Calculate correlation-based metrics for the find_center_360 method
3
+ *********************************************************************
4
+ *
5
+ * The core of the find_center_360 method is calculating correlation coefficients
6
+ * between 2 or 3 shifted matrices, over many shifting positions.
7
+ *
8
+ * This file has cuda kernels for this purpose, which provide speedups of > 300x
9
+ * compared to using a straight numpy to cupy port.
10
+ *
11
+ * The key is the formula to calculate the Pearson correlation coefficient.
12
+ * This is calculated manually for every shifted matrix position in the same kernel.
13
+ *
14
+ * The correlation coefficient between two vectors (we flatten the matrices) is:
15
+ *
16
+ * m1_norm = m1 - mean(m1)
17
+ * m2_norm = m2 - mean(m2)
18
+ * m1_sqr = dot(m1_norm, m1_norm)
19
+ * m2_sqr = dot(m2_norm, m2_norm)
20
+ * m1_m2 = dot(m1_norm, m2_norm)
21
+ * r = m1_m2 / sqrt(m1_sqr * m2_sqr)
22
+ *
23
+ * The kernels in the following compute these directly pretty much, taking into
24
+ * consideration normalisation, overlaps, and position offsets. Also note that the
25
+ * version with overlap requires 3 correlation coefficients (between 3 matrices).
26
+ */
27
+
28
+
29
/** Function to perform a binary sum reduction for N-dimensional array storage.
 * Note that the shared_mem pointer must have space for N * BLOCK_DIM elements.
 *
 * On entry, each thread holds N partial sums in v. On exit, every thread's v
 * contains the block-wide totals (the reduced result is read back from
 * smem[i][0] by all threads).
 *
 * NOTE(review): assumes the kernel is launched with blockDim.x == BLOCK_DIM
 * (128 by default) -- confirm at the launch site.
 */
template <int N, int BLOCK_DIM=128>
__device__ inline
void sum_reduction_n(float* shared_mem, float v[N]) {
    int tid = threadIdx.x;

    // partition the shared buffer into N independent reduction arrays and
    // publish this thread's partial sums
    float *smem[N];
    #pragma unroll
    for (int i = 0; i < N; ++i) {
        smem[i] = shared_mem + i * BLOCK_DIM;
        smem[i][tid] = v[i];
    }

    __syncthreads();
    int nt = BLOCK_DIM;
    int c = nt;
    // folding reduction: thread tid accumulates the mirror element c-tid-1;
    // for an odd count the middle element is left untouched for the next pass
    while (c > 1)
    {
        int half = c / 2;
        if (tid < half)
        {
            #pragma unroll
            for (int i = 0; i < N; ++i) {
                smem[i][tid] += smem[i][c - tid - 1];
            }
        }
        __syncthreads();
        c = c - half;
    }

    // write back the N totals so every thread sees the reduced values
    #pragma unroll
    for (int i = 0; i < N; ++i) {
        v[i] = smem[i][0];
    }
}
67
+
68
/** Clamp x into the closed interval [min, max].
 *
 * Bug fix: the original implementation ignored its min/max arguments and
 * always clamped to [-1, 1]. All current call sites pass (-1.0f, 1.0f), so
 * behaviour is unchanged for existing callers.
 */
inline __device__
float clip(float x, float min, float max) {
    x = x < min ? min : x;
    x = x > max ? max : x;
    return x;
}
74
+
75
/** Return the sum of absolute values of the first win_width entries of row. */
__device__ inline
float sum_abs_row(const float* row, int win_width)
{
    float total = 0.0f;
    const float* end = row + win_width;
    for (const float* p = row; p != end; ++p) {
        total += fabsf(*p);
    }
    return total;
}
84
+
85
+
86
/** Compute function without overlap, where correlation metrics with 2 matrices are calculated.
 *
 * One thread block handles one candidate position (blockIdx.y); the threads
 * cooperate over the rows and reduce partial sums in shared memory. The
 * metric written to list_metric[i] is |1 - r|, where r is the Pearson
 * correlation coefficient between the two window ROIs (0 = perfectly
 * correlated).
 *
 * NOTE(review): smem must provide at least 3 * blockDim.x floats of dynamic
 * shared memory, and blockDim.x must match sum_reduction_n's BLOCK_DIM
 * default (128) -- confirm at the launch site.
 */
template <bool norm>
__device__ void _calc_metrics_no_overlap(const float *mat1, int mat1_nx,
                                         const float *mat2, int mat2_nx,
                                         int win_width, int rows, int side,
                                         float *list_metric)
{
    // rows of the matrix
    const int tid = threadIdx.x;
    // position in list_pos
    const int i = blockIdx.y;
    const int npos = gridDim.y;   // number of candidate positions (unused here)

    const int pos = win_width / 2 + i;

    // offset matrices for position: mat2's window is anchored at its left or
    // right edge depending on side; mat1's window slides with the position
    const float* mat2_roi = side == 1 ? mat2 : mat2 + mat2_nx - win_width;
    const float* mat1_roi = mat1 + (pos - win_width / 2);

    extern __shared__ float smem[];

    // we store our data for reductions here
    float v[3];

    ////////////////////////
    // 1. We need the mean of the 2 matrices (flattened)
    v[0] = 0.0f;
    v[1] = 0.0f;

    for (int y = tid; y < rows; y += blockDim.x)
    {
        float norm_factor = 1.0f;
        if (norm) {
            // per-row intensity normalisation: scale mat1's row so its total
            // absolute intensity matches the corresponding row of mat2
            norm_factor = sum_abs_row(&mat2_roi[y * mat2_nx], win_width) /
                sum_abs_row(&mat1_roi[y * mat1_nx], win_width);
        }
        for (int x = 0; x < win_width; ++x)
        {
            v[0] += mat1_roi[y * mat1_nx + x] * norm_factor;
            v[1] += mat2_roi[y * mat2_nx + x];
        }
    }

    // now reduce them to calc the mean
    sum_reduction_n<2>(smem, v);

    float mean_mat1 = v[0] / rows / win_width;
    float mean_mat2 = v[1] / rows / win_width;

    ///////////////////////////////////
    // 2. Calculate the sum of the dot and cross-products for the 2 matrices:
    v[0] = 0.0f; // dot(mat1, mat1)
    v[1] = 0.0f; // dot(mat2, mat2)
    v[2] = 0.0f; // dot(mat1, mat2)

    for (int y = tid; y < rows; y += blockDim.x)
    {
        float norm_factor = 1.0f;
        if (norm) {
            norm_factor = sum_abs_row(&mat2_roi[y * mat2_nx], win_width) /
                sum_abs_row(&mat1_roi[y * mat1_nx], win_width);
        }
        for (int x = 0; x < win_width; ++x)
        {
            // mean-centred values feed the covariance accumulators
            float mat1_roi_val = mat1_roi[y * mat1_nx + x] * norm_factor;
            float mat2_roi_val = mat2_roi[y * mat2_nx + x];
            mat1_roi_val -= mean_mat1;
            mat2_roi_val -= mean_mat2;
            v[0] += mat1_roi_val * mat1_roi_val;
            v[1] += mat2_roi_val * mat2_roi_val;
            v[2] += mat1_roi_val * mat2_roi_val;
        }
    }

    // now reduce the three accumulators across the block
    sum_reduction_n<3>(smem, v);

    ////////////////////////////////
    // 3. Calculate the correlation coefficients from the covariance values
    if (tid == 0)
    {
        // sample-style divisor (count - 1); the common scaling cancels in r
        float mat1_mat1 = v[0] / (rows * win_width - 1);
        float mat2_mat2 = v[1] / (rows * win_width - 1);
        float mat1_mat2 = v[2] / (rows * win_width - 1);
        // now calculate the correlation coefficient
        float r = mat1_mat2 / sqrt(mat1_mat1 * mat2_mat2);
        r = clip(r, -1.0f, 1.0f);
        // metric: 0 when perfectly correlated
        float metric = abs(1.0f - r);
        list_metric[i] = metric;
    }
}
179
+
180
+
181
/** Compute function with overlap, where correlation metrics with 3 matrices are calculated.
 *
 * In addition to the two shifted ROIs, a third "combined" matrix is formed on
 * the fly by linearly ramping between mat1 and mat2 across the window (the
 * ramp direction depends on side). The output metric is the average of
 * |1 - r| over the three pairwise Pearson correlation coefficients.
 *
 * NOTE(review): smem must provide at least 6 * blockDim.x floats of dynamic
 * shared memory, and blockDim.x must match sum_reduction_n's BLOCK_DIM
 * default (128) -- confirm at the launch site.
 */
template <bool norm>
__device__ void _calc_metrics_overlap(const float *mat1, int mat1_nx,
                                      const float *mat2, int mat2_nx,
                                      int win_width, int rows, int side,
                                      float *list_metric)
{
    // rows of the matrix
    const int tid = threadIdx.x;
    // position in list_pos
    const int i = blockIdx.y;
    const int npos = gridDim.y;   // number of candidate positions (unused here)

    const int pos = win_width / 2 + i;

    // offset matrices for position
    const float* mat2_roi = side == 1 ? mat2 : mat2 + mat2_nx - win_width;
    const float* mat1_roi = mat1 + (pos - win_width / 2);

    extern __shared__ float smem[];

    // we need the space for 6 sum reductions for calculating the correlation coefficients
    float v[6];

    // per-column step of the linear blending ramp across the window
    float d_ramp = 1.0f / (win_width - 1);

    ////////////////////////
    // 1. We need the mean of the 3 matrices (flattened)
    v[0] = 0.0f;
    v[1] = 0.0f;
    v[2] = 0.0f;
    for (int y = tid; y < rows; y += blockDim.x)
    {
        float norm_factor = 1.0f;
        if (norm) {
            // per-row intensity normalisation (see _calc_metrics_no_overlap)
            norm_factor = sum_abs_row(&mat2_roi[y * mat2_nx], win_width) /
                sum_abs_row(&mat1_roi[y * mat1_nx], win_width);
        }
        for (int x = 0; x < win_width; ++x)
        {
            float ramp_down = 1.0f - (x * d_ramp);
            float ramp_up = 1.0f - ramp_down;
            float mat1_roi_val = mat1_roi[y * mat1_nx + x] * norm_factor;
            float mat2_roi_val = mat2_roi[y * mat2_nx + x];
            // blended matrix: cross-fades from mat1 to mat2 (or vice versa)
            float mat_comb_val = side == 1 ?
                (mat1_roi_val * ramp_down + mat2_roi_val * ramp_up) :
                (mat1_roi_val * ramp_up + mat2_roi_val * ramp_down);

            v[0] += mat1_roi_val;
            v[1] += mat2_roi_val;
            v[2] += mat_comb_val;
        }
    }

    sum_reduction_n<3>(smem, v);

    float mean_mat1 = v[0] / rows / win_width;
    float mean_mat2 = v[1] / rows / win_width;
    float mean_mat3 = v[2] / rows / win_width;

    ///////////////////////////////////
    // 2. Calculate the sum of the dot and cross-products for the 3 matrices:
    v[0] = 0.0f; // dot(mat1, mat1)
    v[1] = 0.0f; // dot(mat2, mat2)
    v[2] = 0.0f; // dot(mat_comb, mat_comb)
    v[3] = 0.0f; // dot(mat1, mat2)
    v[4] = 0.0f; // dot(mat1, mat_comb)
    v[5] = 0.0f; // dot(mat2, mat_comb)

    for (int y = tid; y < rows; y += blockDim.x)
    {
        float norm_factor = 1.0f;
        if (norm) {
            norm_factor = sum_abs_row(&mat2_roi[y * mat2_nx], win_width) /
                sum_abs_row(&mat1_roi[y * mat1_nx], win_width);
        }
        for (int x = 0; x < win_width; ++x)
        {
            // recompute the blended value exactly as in pass 1
            float ramp_down = 1.0f - (x * d_ramp);
            float ramp_up = 1.0f - ramp_down;
            float mat1_roi_val = mat1_roi[y * mat1_nx + x] * norm_factor;
            float mat2_roi_val = mat2_roi[y * mat2_nx + x];
            float mat_comb_val = side == 1 ?
                (mat1_roi_val * ramp_down + mat2_roi_val * ramp_up) :
                (mat1_roi_val * ramp_up + mat2_roi_val * ramp_down);

            // for covariance matrix, we need to remove the mean first
            mat1_roi_val -= mean_mat1;
            mat2_roi_val -= mean_mat2;
            mat_comb_val -= mean_mat3;

            // now sum the products
            v[0] += mat1_roi_val * mat1_roi_val;
            v[1] += mat2_roi_val * mat2_roi_val;
            v[2] += mat_comb_val * mat_comb_val;
            v[3] += mat1_roi_val * mat2_roi_val;
            v[4] += mat1_roi_val * mat_comb_val;
            v[5] += mat2_roi_val * mat_comb_val;
        }

    }

    // 6 smem reductions
    sum_reduction_n<6>(smem, v);

    ////////////////////////////////
    // 3. Calculate the correlation coefficients from the covariance values
    if (tid == 0)
    {
        // sample-style divisor (count - 1); the common scaling cancels in r
        float mat1_mat1 = v[0] / (rows * win_width - 1);
        float mat2_mat2 = v[1] / (rows * win_width - 1);
        float mat3_mat3 = v[2] / (rows * win_width - 1);
        float mat1_mat2 = v[3] / (rows * win_width - 1);
        float mat1_mat3 = v[4] / (rows * win_width - 1);
        float mat2_mat3 = v[5] / (rows * win_width - 1);
        // normalise to get correlation coefficients
        float r12 = mat1_mat2 / sqrt(mat1_mat1 * mat2_mat2);
        float r13 = mat1_mat3 / sqrt(mat1_mat1 * mat3_mat3);
        float r23 = mat2_mat3 / sqrt(mat2_mat2 * mat3_mat3);
        // clip into the valid correlation range
        r12 = clip(r12, -1.0f, 1.0f);
        r13 = clip(r13, -1.0f, 1.0f);
        r23 = clip(r23, -1.0f, 1.0f);
        // metric per pair: 0 when perfectly correlated
        float metric_1 = abs(1.0f - r12);
        float metric_2 = abs(1.0f - r23);
        float metric_3 = abs(1.0f - r13);
        // average and output
        list_metric[i] = (metric_1 + metric_2 + metric_3) / 3.0f;
    }
}
313
+
314
+
315
+
316
+
317
+
318
/** Main entry point - it calls one of the two variants above.
 *
 * We use a template here, so that one of the two branches gets completely
 * eliminated by the compiler (rather than at runtime), which reduces the
 * register count.
 */
template <bool norm, bool use_overlap>
__global__ void calc_metrics_kernel(const float *mat1, int mat1_nx,
                                    const float *mat2, int mat2_nx,
                                    int win_width, int rows, int side,
                                    float *list_metric)
{
    if (!use_overlap) {
        // 2-matrix variant: correlation between the two shifted ROIs only
        _calc_metrics_no_overlap<norm>(mat1, mat1_nx, mat2, mat2_nx,
                                       win_width, rows, side, list_metric);
        return;
    }
    // 3-matrix variant: additionally correlates against a ramp-blended ROI
    _calc_metrics_overlap<norm>(mat1, mat1_nx, mat2, mat2_nx,
                                win_width, rows, side, list_metric);
}
@@ -0,0 +1,49 @@
1
+ #include <cupy/complex.cuh>
2
+
3
/** Build a stack of horizontally shifted sinograms, one per candidate shift.
 *
 * For every shift list_shift[zid], columns of mat[zid] are filled from sino2
 * rolled by that shift, with the uncovered columns taken from sino3.
 * Integer shifts are handled completely here; for fractional shifts only the
 * sino3 part is written, and the sub-pixel shift of sino2 is left to a later
 * scipy-based step on the host (per the comment below).
 *
 * Grid mapping: threadIdx.x/blockIdx.x -> column, blockIdx.y -> row,
 * blockIdx.z -> shift index; mat is laid out as [nshifts, nymat, nx].
 * NOTE(review): presumably part of the find_center_360 pipeline -- confirm
 * against the calling Python code.
 */
extern "C" __global__ void
shift_whole_shifts(const float *sino2, const float *sino3,
                   const float *__restrict__ list_shift, float *mat, int nx,
                   int nymat) {
  int xid = threadIdx.x + blockIdx.x * blockDim.x;
  int yid = blockIdx.y;
  int zid = blockIdx.z;
  int ny = gridDim.y;   // grid height; unused below

  if (xid >= nx)
    return;

  float shift_col = list_shift[zid];
  // split the shift into integer and fractional parts
  float int_part = 0.0;
  float frac_part = modf(shift_col, &int_part);
  if (abs(frac_part) > 1e-5f) {
    // we have a floating point shift, so we only roll in
    // sino3, but we leave the rest for later using scipy
    int shift_int =
        shift_col >= 0.0 ? int(ceil(shift_col)) : int(floor(shift_col));
    // fill only the columns that the shifted sino2 will not cover
    if (shift_int >= 0 && xid < shift_int) {
      mat[zid * nymat * nx + yid * nx + xid] = sino3[yid * nx + xid];
    }
    if (shift_int < 0 && xid >= nx + shift_int) {
      mat[zid * nymat * nx + yid * nx + xid] = sino3[yid * nx + xid];
    }
  } else {
    // we have an integer shift, so we can roll in directly
    // by indexing
    int shift_int = int(shift_col);
    if (shift_int >= 0) {
      // positive shift: sino2 moves right; left margin comes from sino3
      if (xid >= shift_int) {
        mat[zid * nymat * nx + yid * nx + xid] =
            sino2[yid * nx + xid - shift_int];
      } else {
        mat[zid * nymat * nx + yid * nx + xid] = sino3[yid * nx + xid];
      }
    } else {
      // negative shift: sino2 moves left; right margin comes from sino3
      if (xid < nx + shift_int) {
        mat[zid * nymat * nx + yid * nx + xid] =
            sino2[yid * nx + xid - shift_int];
      } else {
        mat[zid * nymat * nx + yid * nx + xid] = sino3[yid * nx + xid];
      }
    }
  }
}
@@ -0,0 +1,36 @@
1
/** Downsample a sinogram horizontally by merging bins of adjacent columns.
 *
 * Each thread loads one input value scaled by 1/binsize into shared memory;
 * the first thread of each bin then sums the bin's values into the output.
 *
 * NOTE(review): binsize is computed as 1 << level, but the reduction below
 * hardcodes 4 summands ("i % 4 == 0") and a shared-memory row stride of 8
 * (threadIdx.y * 8). That is only consistent for level == 2 with
 * blockDim.x == 8 -- confirm the launch configuration before using other
 * levels.
 */
extern "C" __global__ void downsample_sino(float *sino, int dx, int dz,
                                           int level, float *out) {
  // use shared memory to store the values used to "merge" columns of the
  // sinogram in the downsampling process
  extern __shared__ float downsampled_vals[];
  unsigned int binsize, i, j, k, orig_ind, out_ind, output_bin_no;
  i = blockDim.x * blockIdx.x + threadIdx.x;   // column index
  j = 0;                                        // unused
  k = blockDim.y * blockIdx.y + threadIdx.y;   // row index
  orig_ind = (k * dz) + i;                      // flat input index (dx x dz)
  binsize = 1 << level;   // number of input columns merged per output column
  unsigned int dz_downsampled =
      __float2uint_rd(fdividef(__uint2float_rd(dz), __uint2float_rd(binsize)));
  unsigned int i_downsampled =
      __float2uint_rd(fdividef(__uint2float_rd(i), __uint2float_rd(binsize)));
  if (orig_ind < dx * dz) {
    // bin number of this column (same as i_downsampled; unused below)
    output_bin_no =
        __float2uint_rd(fdividef(__uint2float_rd(i), __uint2float_rd(binsize)));
    out_ind = (k * dz_downsampled) + i_downsampled;
    // pre-scale by 1/binsize so the later sum is already the bin average-sum
    downsampled_vals[threadIdx.y * 8 + threadIdx.x] =
        sino[orig_ind] / __uint2float_rd(binsize);
    // synchronise threads within thread-block so that it's guaranteed
    // that all the required values have been copied into shared memory
    // to then sum and save in the downsampled output
    __syncthreads();
    // arbitrarily use the "beginning thread" in each "lot" of pixels
    // for downsampling to then save the desired value in the
    // downsampled output array
    if (i % 4 == 0) {
      out[out_ind] = downsampled_vals[threadIdx.y * 8 + threadIdx.x] +
                     downsampled_vals[threadIdx.y * 8 + threadIdx.x + 1] +
                     downsampled_vals[threadIdx.y * 8 + threadIdx.x + 2] +
                     downsampled_vals[threadIdx.y * 8 + threadIdx.x + 3];
    }
  }
}
@@ -0,0 +1,51 @@
1
/** Generate a 2D double-wedge frequency mask over the half-spectrum.
 *
 * Only ncol/2+1 columns per row are produced because the mask is applied to
 * the output of a real-to-complex FFT. Per row j the mask is 1 inside the
 * wedge interval [pos1, pos2] (in full-spectrum column coordinates) and 0
 * outside; rows within +/- drop of the centre row and the columns around the
 * centre column are forced to 0, mirroring the numpy slices quoted in the
 * comments below.
 * NOTE(review): presumably part of the rotation-centre search
 * (find_center_vo) -- confirm against the calling Python code.
 */
extern "C" __global__ void generate_mask(const int ncol, const int nrow,
                                         const int cen_col, const int cen_row,
                                         const float du, const float dv,
                                         const float radius, const float drop,
                                         unsigned short *mask) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  int j = blockIdx.y;

  if (i >= ncol/2+1)
    return;

  // we only need to look at the right half as we're using a real2complex FFT
  int outi = i;        // column in the half-spectrum output
  i += ncol/2-1;       // corresponding column in full-spectrum coordinates

  // wedge half-width for this row (rounded up)
  int pos = __float2int_ru(((j - cen_row) * dv / radius) / du);
  int pos1 = -pos + cen_col;
  int pos2 = pos + cen_col;

  // order pos1 <= pos2 and clamp to the valid column range
  // NOTE(review): the clamping differs between the swapped and non-swapped
  // branches (>= ncol / < 0 vs < 0 / >= ncol) -- looks like a direct port of
  // the original numpy code; confirm this asymmetry is intentional
  if (pos1 > pos2) {
    int temp = pos1;
    pos1 = pos2;
    pos2 = temp;
    if (pos1 >= ncol) {
      pos1 = ncol - 1;
    }
    if (pos2 < 0) {
      pos2 = 0;
    }
  } else {
    if (pos1 < 0) {
      pos1 = 0;
    }
    if (pos2 >= ncol) {
      pos2 = ncol - 1;
    }
  }

  // inside the wedge -> 1, outside -> 0
  short outval = (pos1 <= i && i <= pos2) ? 1 : 0;

  // mask[cen_row - drop: cen_row + drop + 1, :] = 0
  if (j >= cen_row - drop && j <= cen_row + drop) {
    outval = 0;
  }
  // mask[:, cen_col - 1: cen_col + 2] = 0
  if (i >= cen_col - 1 && i <= cen_col + 1) {
    outval = 0;
  }

  mask[j * (ncol/2+1) + outi] = outval;
}
@@ -0,0 +1,54 @@
1
/** 3D median filter (with optional dezingering) over a cubic neighbourhood.
 *
 * Template parameters:
 *   Type     -- voxel type
 *   diameter -- kernel edge length; diameter^3 values are gathered per voxel
 *
 * Arguments:
 *   in, out -- volumes of size Z x M x N (slowest to fastest varying axis)
 *   dif     -- dezinger threshold: when > 0, a voxel is replaced by the
 *              neighbourhood median only if it deviates from the median by
 *              >= dif; when <= 0, plain median filtering is applied
 *
 * Out-of-range neighbour coordinates are replaced by the centre voxel's own
 * coordinate along that axis (i.e. the centre value is re-sampled, not the
 * nearest edge value).
 */
template <typename Type, int diameter>
__global__ void median_general_kernel3d(const Type *in, Type *out, float dif,
                                        int Z, int M, int N) {
  constexpr int radius = diameter / 2;
  constexpr int d3 = diameter * diameter * diameter;
  constexpr int midpoint = d3 / 2;   // index of the median after sorting

  Type ValVec[d3];   // per-thread gather buffer for the neighbourhood
  const long i = blockDim.x * blockIdx.x + threadIdx.x;
  const long j = blockDim.y * blockIdx.y + threadIdx.y;
  const long k = blockDim.z * blockIdx.z + threadIdx.z;

  if (i >= N || j >= M || k >= Z)
    return;

  // flat index of the centre voxel; 64-bit to support large volumes
  long long index = static_cast<long long>(i) + N * static_cast<long long>(j) + N * M * static_cast<long long>(k);

  // gather the diameter^3 neighbourhood with the boundary policy above
  int counter = 0;
  for (int i_m = -radius; i_m <= radius; i_m++) {
    long long i1 = i + i_m; // using long long to avoid integer overflows
    if ((i1 < 0) || (i1 >= N))
      i1 = i;
    for (int j_m = -radius; j_m <= radius; j_m++) {
      long long j1 = j + j_m;
      if ((j1 < 0) || (j1 >= M))
        j1 = j;
      for (int k_m = -radius; k_m <= radius; k_m++) {
        long long k1 = k + k_m;
        if ((k1 < 0) || (k1 >= Z))
          k1 = k;
        ValVec[counter] = in[i1 + N * j1 + N * M * k1];
        counter++;
      }
    }
  }

  /* do bubble sort here (d3 is small, e.g. 27 for diameter 3) */
  for (int x = 0; x < d3 - 1; x++) {
    for (int y = 0; y < d3 - x - 1; y++) {
      if (ValVec[y] > ValVec[y + 1]) {
        Type temp = ValVec[y];
        ValVec[y] = ValVec[y + 1];
        ValVec[y + 1] = temp;
      }
    }
  }

  if (dif > 0.0f) {
    /* perform dezingering: keep the original value unless it is an outlier */
    out[index] =
        fabsf(in[index] - ValVec[midpoint]) >= dif ? ValVec[midpoint] : in[index];
  }
  else out[index] = ValVec[midpoint]; /* median filtering */
}
@@ -0,0 +1,37 @@
1
+ #include <cupy/complex.cuh>
2
+
3
+ #ifndef M_PI
4
+ #define M_PI 3.1415926535897932384626433832795f
5
+ #endif
6
+
7
/** Build the (ifft-shifted) complex Paganin filter in Fourier space.
 *
 * Each thread computes one element of the frequency grid:
 *   filter1 = 1 + ratio * pi * wavelength * distance * (u^2 + v^2)
 * and stores 1 / complex(filter1, filter1) at the ifftshifted location, so
 * the filter can be multiplied directly with an unshifted forward FFT.
 *
 * NOTE(review): using filter1 for both the real and imaginary parts of the
 * denominator is unusual for a Paganin-type filter (normally real-valued) --
 * confirm against the CPU reference implementation before changing.
 *
 * Fix vs. original: removed a stray empty statement (";") after the pd
 * computation.
 */
extern "C" __global__ void
paganin_filter_gen(int width1, int height1, float resolution, float wavelength,
                   float distance, float ratio, complex<float> *filtercomplex) {
  int px = threadIdx.x + blockIdx.x * blockDim.x;
  int py = threadIdx.y + blockIdx.y * blockDim.y;
  if (px >= width1)
    return;
  if (py >= height1)
    return;

  // frequency-step sizes of the grid
  float dpx = 1.0f / (width1 * resolution);
  float dpy = 1.0f / (height1 * resolution);
  int centerx = (width1 + 1) / 2 - 1;
  int centery = (height1 + 1) / 2 - 1;

  // centred frequency coordinates of this element
  float pxx = (px - centerx) * dpx;
  float pyy = (py - centery) * dpy;
  float pd = (pxx * pxx + pyy * pyy) * wavelength * distance * M_PI;
  float filter1 = 1.0f + ratio * pd;

  complex<float> value = 1.0f / complex<float>(filter1, filter1);

  // ifftshifting positions
  int xshift = (width1 + 1) / 2;
  int yshift = (height1 + 1) / 2;
  int outX = (px + xshift) % width1;
  int outY = (py + yshift) % height1;

  filtercomplex[outY * width1 + outX] = value;
}
@@ -0,0 +1,19 @@
1
# Flag telling the rest of the package whether GPU (CuPy) code paths can run.
cupy_run = False
try:
    import cupy as cp
    import nvtx

    try:
        # Probe for a usable CUDA device; this raises CUDARuntimeError when
        # no GPU/driver is available even though CuPy itself is installed.
        cp.cuda.Device(0).compute_capability
        cupy_run = True
    except cp.cuda.runtime.CUDARuntimeError:
        # Fixed message: CuPy *is* importable in this branch -- the failure
        # is the absence of an accessible CUDA device, not a missing install.
        print(
            "CuPy is installed but a CUDA GPU device could not be accessed; "
            "defaulting to CPU-only mode"
        )
        import numpy as cp
except ImportError as e:
    # CuPy (or nvtx) is not installed at all: fall back to NumPy and mock
    # out nvtx so decorator/annotation usage elsewhere stays a no-op.
    print(
        f"Failed to import module in {__file__} with error: {e}; defaulting to CPU-only mode"
    )
    from unittest.mock import Mock
    import numpy as cp

    nvtx = Mock()