boostrsa 0.0.1.dev5__tar.gz → 0.0.1.dev8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. {boostrsa-0.0.1.dev5/src/boostrsa.egg-info → boostrsa-0.0.1.dev8}/PKG-INFO +1 -1
  2. {boostrsa-0.0.1.dev5 → boostrsa-0.0.1.dev8}/setup.py +1 -3
  3. boostrsa-0.0.1.dev8/src/boostrsa/cores/cpgpu/stats.py +182 -0
  4. boostrsa-0.0.1.dev8/src/boostrsa/cores/cpu/mask.py +12 -0
  5. boostrsa-0.0.1.dev8/src/boostrsa/cores/cpu/matrix.py +66 -0
  6. boostrsa-0.0.1.dev8/src/boostrsa/cores/gpu/basic_operations.py +234 -0
  7. {boostrsa-0.0.1.dev5 → boostrsa-0.0.1.dev8}/src/boostrsa/cores/gpu/mask.py +11 -4
  8. boostrsa-0.0.1.dev8/src/boostrsa/cores/gpu/matrix.py +230 -0
  9. boostrsa-0.0.1.dev8/src/boostrsa/searchlight.py +929 -0
  10. {boostrsa-0.0.1.dev5 → boostrsa-0.0.1.dev8/src/boostrsa.egg-info}/PKG-INFO +1 -1
  11. {boostrsa-0.0.1.dev5 → boostrsa-0.0.1.dev8}/src/boostrsa.egg-info/SOURCES.txt +1 -0
  12. boostrsa-0.0.1.dev5/src/boostrsa/cores/cpgpu/stats.py +0 -130
  13. boostrsa-0.0.1.dev5/src/boostrsa/cores/cpu/matrix.py +0 -44
  14. boostrsa-0.0.1.dev5/src/boostrsa/cores/gpu/basic_operations.py +0 -61
  15. boostrsa-0.0.1.dev5/src/boostrsa/cores/gpu/matrix.py +0 -125
  16. boostrsa-0.0.1.dev5/src/boostrsa/searchlight.py +0 -233
  17. {boostrsa-0.0.1.dev5 → boostrsa-0.0.1.dev8}/LICENSE.txt +0 -0
  18. {boostrsa-0.0.1.dev5 → boostrsa-0.0.1.dev8}/README.md +0 -0
  19. {boostrsa-0.0.1.dev5 → boostrsa-0.0.1.dev8}/setup.cfg +0 -0
  20. {boostrsa-0.0.1.dev5 → boostrsa-0.0.1.dev8}/src/boostrsa/__init__.py +0 -0
  21. {boostrsa-0.0.1.dev5 → boostrsa-0.0.1.dev8}/src/boostrsa/boostrsa_types.py +0 -0
  22. {boostrsa-0.0.1.dev5 → boostrsa-0.0.1.dev8}/src/boostrsa/cores/__init__.py +0 -0
  23. {boostrsa-0.0.1.dev5 → boostrsa-0.0.1.dev8}/src/boostrsa/cores/cpgpu/__init__.py +0 -0
  24. {boostrsa-0.0.1.dev5 → boostrsa-0.0.1.dev8}/src/boostrsa/cores/cpu/__init__.py +0 -0
  25. {boostrsa-0.0.1.dev5 → boostrsa-0.0.1.dev8}/src/boostrsa/cores/gpu/__init__.py +0 -0
  26. {boostrsa-0.0.1.dev5 → boostrsa-0.0.1.dev8}/src/boostrsa.egg-info/dependency_links.txt +0 -0
  27. {boostrsa-0.0.1.dev5 → boostrsa-0.0.1.dev8}/src/boostrsa.egg-info/requires.txt +0 -0
  28. {boostrsa-0.0.1.dev5 → boostrsa-0.0.1.dev8}/src/boostrsa.egg-info/top_level.txt +0 -0
  29. {boostrsa-0.0.1.dev5 → boostrsa-0.0.1.dev8}/tests/test_module1.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: boostrsa
3
- Version: 0.0.1.dev5
3
+ Version: 0.0.1.dev8
4
4
  Summary: This is toolbox for boosting calculation speed using GPU
5
5
  Home-page: https://github.com/SeojinYoon/boostrsa.git
6
6
  Author: seojin
@@ -1,6 +1,4 @@
1
1
 
2
- import subprocess
3
- import re
4
2
  from setuptools import setup, find_packages
5
3
 
6
4
  with open("README.md", "r") as fh:
@@ -8,7 +6,7 @@ with open("README.md", "r") as fh:
8
6
 
9
7
  setup(
10
8
  name = "boostrsa",
11
- version = "0.0.1dev5",
9
+ version = "0.0.1dev8",
12
10
  author = "seojin",
13
11
  author_email = "pures1@hanyang.ac.kr",
14
12
  description = "This is toolbox for boosting calculation speed using GPU",
@@ -0,0 +1,182 @@
1
+
2
+ import os
3
+ import sys
4
+ import numpy as np
5
+ import cupy as cp
6
+ from numba import cuda, jit
7
+
8
+ if os.getenv("boostrsa_isRunSource"):
9
+ sys.path.append(os.getenv("boostrsa_source_home"))
10
+ from boostrsa_types import ShrinkageMethod
11
+ from cores.gpu.basic_operations import outer_sum_square, outer_sum
12
+ from cores.gpu.matrix import diag, eyes
13
+ from cores.gpu.basic_operations import scaling
14
+ else:
15
+ from boostrsa.boostrsa_types import ShrinkageMethod
16
+ from boostrsa.cores.gpu.basic_operations import outer_sum_square, outer_sum
17
+ from boostrsa.cores.gpu.matrix import diag, eyes
18
+ from boostrsa.cores.gpu.basic_operations import scaling
19
+
20
def _covariance_eye(residuals: np.ndarray,
                    threads_per_block = 1024,
                    dtype = np.float32):
    """
    Estimate one shrinkage covariance matrix per processing unit, shrinking
    the sample covariance towards a scaled identity matrix.
    **The residuals must be demeaned beforehand!

    Ledoit and Wolf (2004): "A well-conditioned estimator for large-dimensional covariance matrices"

    :param residuals: residual data after processing raw data, shape: (#run * #center, #point, #channel)
    :param threads_per_block: #thread per GPU block
    :param dtype: data type of the GPU accumulation buffers

    return (np.ndarray) - shrunk covariance matrices, shape: (#run * #center, #channel, #channel)
    """
    # Constants
    n_processing_unit = len(residuals)
    n_point = residuals.shape[1]
    n_channel = residuals.shape[2]

    # One GPU thread handles one processing unit
    n_block = int(np.ceil(n_processing_unit / threads_per_block))
    accum_shape = (n_processing_unit, n_channel, n_channel)

    # Sum over points of the outer product of each point with itself.
    # GPU memory: one (#run * #center, #channel, #channel) buffer
    sum_buffer = cuda.to_device(np.zeros(accum_shape, dtype = dtype))
    outer_sum[n_block, threads_per_block](residuals, sum_buffer)
    outer_sum_result = sum_buffer.copy_to_host()
    del sum_buffer
    cuda.synchronize()

    # Same accumulation, but every outer-product entry is squared first.
    # GPU memory: one (#run * #center, #channel, #channel) buffer
    square_buffer = cuda.to_device(np.zeros(accum_shape, dtype = dtype))
    outer_sum_square[n_block, threads_per_block](residuals, square_buffer)
    outer_sum_square_result = square_buffer.copy_to_host()
    del square_buffer
    cuda.synchronize()

    # Scalar estimators m, d^2, b^2 as in the Ledoit & Wolf paper,
    # each of shape (n_processing_unit)
    s = outer_sum_result / n_point
    s2 = outer_sum_square_result / n_point
    b2 = np.sum(s2 - s * s, axis = (1, 2)) / n_point

    # A single identity broadcast over the processing-unit axis
    identity = np.eye(n_channel)[np.newaxis, :, :]

    diag_s = np.diagonal(s, axis1 = 1, axis2 = 2)
    m = diag_s.sum(axis = 1) / n_channel
    deviation = s - m[:, None, None] * identity
    d2 = np.sum(deviation ** 2, axis = (1, 2))

    # b^2 may not exceed d^2
    b2 = np.minimum(d2, b2)

    # Shrink: weighted sum of the scaled identity and the sample covariance
    identity_weight = (b2 / d2 * m)[:, None, None]
    sample_weight = ((d2 - b2) / d2)[:, None, None]
    s_shrink = identity_weight * identity + sample_weight * s

    # Correction for degrees of freedom
    dof = n_point - 1
    return s_shrink * n_point / dof
90
+
91
def _covariance_diag(residuals: np.ndarray,
                     threads_per_block: int = 1024,
                     dtype = np.float32):
    """
    Estimate one shrinkage covariance matrix per processing unit by scaling
    the off-diagonal entries of the sample covariance.

    Schäfer, J., & Strimmer, K. (2005). "A Shrinkage Approach to Large-Scale Covariance Matrix Estimation and Implications for Functional Genomics."

    :param residuals: residual data after processing raw data, shape: (#run * #center, #point, #channel)
    :param threads_per_block: #thread per GPU block
    :param dtype: data type of the GPU accumulation buffers

    return - shrunk covariance matrices, shape: (#run * #center, #channel, #channel)
    """
    # Constants
    n_processing_unit = len(residuals)
    n_point = residuals.shape[1]
    n_channel = residuals.shape[2]
    dof = n_point - 1

    # One GPU thread handles one processing unit
    n_block = int(np.ceil(n_processing_unit / threads_per_block))
    accum_shape = (n_processing_unit, n_channel, n_channel)

    # Sum over points of the outer product of each point with itself.
    # GPU memory: one (#run * #center, #channel, #channel) buffer
    sum_buffer = cuda.to_device(np.zeros(accum_shape, dtype = dtype))
    outer_sum[n_block, threads_per_block](residuals, sum_buffer)
    outer_sum_result = sum_buffer.copy_to_host()
    del sum_buffer
    cuda.synchronize()

    # Same accumulation, but every outer-product entry is squared first.
    # GPU memory: one (#run * #center, #channel, #channel) buffer
    square_buffer = cuda.to_device(np.zeros(accum_shape, dtype = dtype))
    outer_sum_square[n_block, threads_per_block](residuals, square_buffer)
    outer_sum_square_result = square_buffer.copy_to_host()
    del square_buffer
    cuda.synchronize()

    # Sample covariance
    s = outer_sum_result / dof

    # Variance per channel, read from s by the `diag` kernel.
    # GPU memory: one (#run * #center, #channel) buffer
    var_buffer = cuda.to_device(np.zeros((n_processing_unit, n_channel)))
    diag[n_block, threads_per_block](s, var_buffer)
    stack_var = var_buffer.copy_to_host()
    del var_buffer
    cuda.synchronize()

    stack_std = np.sqrt(stack_var)

    # Accumulated sums divided by the standard deviations (resp. variances)
    # of both channels involved, then by the degrees of freedom
    stack_s_mean = outer_sum_result / stack_std[:, np.newaxis, :] / stack_std[:, :, np.newaxis] / dof
    stack_s2_mean = outer_sum_square_result / stack_var[:, np.newaxis, :] / stack_var[:, :, np.newaxis] / dof

    # var_hat
    stack_var_hat = n_point / dof ** 2 * (stack_s2_mean - stack_s_mean ** 2)

    # Only off-diagonal entries enter the shrinkage intensity
    mask = ~np.eye(n_channel, dtype = bool)

    # Scaling matrices, initialised by the `eyes` kernel.
    # GPU memory: one (#run * #center, #channel, #channel) buffer
    scaling_buffer = cuda.to_device(np.zeros(accum_shape, dtype = dtype))
    eyes[n_block, threads_per_block](scaling_buffer)
    cuda.synchronize()

    # Shrinkage intensity per processing unit, clamped to [0, 1]
    lamb_host = np.sum(stack_var_hat[:, mask], axis = 1) / np.sum(stack_s_mean[:, mask] ** 2, axis = 1)
    lamb_device = cp.maximum(cp.minimum(cp.array(lamb_host), 1), 0)
    scaling[n_block, threads_per_block](scaling_buffer, lamb_device)

    # NOTE(review): multiplies a host array by a device array; presumably this
    # relies on the device array converting itself to a host array - confirm.
    stack_s_shrink = s * scaling_buffer
    del lamb_device
    cuda.synchronize()

    return stack_s_shrink
181
+
182
+
@@ -0,0 +1,12 @@
1
+
2
+ import numpy as np
3
+
4
def set_mask_cpu(neighbors, brain_1d_indexes):
    """
    Build one 0/1 membership mask per center (iterates over all centers)

    :param neighbors(np.array - shape: (#center, #neighbor)): list of neighbor
    :param brain_1d_indexes(np.array - shape: (#channel)): 1d location index converted from 3D brain coordinate (x,y,z)

    return (np.array - shape: (#center, #channel)): 1 where the channel's index is among the center's neighbors, otherwise 0
    """
    mask_rows = []
    for target_neighbor in neighbors:
        # True for every channel whose 1d index belongs to this center's neighbors
        is_member = np.isin(brain_1d_indexes, target_neighbor)
        mask_rows.append(np.where(is_member, 1, 0))

    return np.array(mask_rows)
@@ -0,0 +1,66 @@
1
+
2
+ import numpy as np
3
+
4
def convert_1d_to_symmertic(a_1d, size, k = 0, dtype = np.float32):
    """
    Convert 1d array to symmetric matrix

    The values are written row by row into the upper triangle starting at
    diagonal offset k and then mirrored into the lower triangle.

    :param a_1d(1d array): upper-triangle values, one per index of np.triu_indices(size, k)
    :param size: matrix size
    :param k(int): diagonal offset of the upper triangle (0 includes the main diagonal,
                   1 starts above it and leaves the diagonal zero); expected to be >= 0
    :param dtype: data type of the result

    return (np.array) - shape: (size, size)
    """
    # Put the values back into the upper triangle; the offset k is honoured
    # here (it used to be ignored in favour of a fixed offset of 0).
    X = np.zeros((size, size), dtype = dtype)
    X[np.triu_indices(size, k = k)] = a_1d

    # Mirror to the lower triangle; the diagonal would otherwise be counted twice
    X = X + X.T - np.diag(np.diag(X))

    return X
22
+
23
def mean_fold_variance(variances, fold_info):
    """
    Average the variances of the two folds of every fold pair

    :param variances: variances indexed as variances[unit][fold]
    :param fold_info(2d array): fold information - [[fold1, fold2], ...]

    return (np.array) - one averaged entry per (unit, fold pair), ordered unit-major
    """
    averaged = [
        (unit_variances[fold1_i] + unit_variances[fold2_i]) / 2
        for unit_variances in variances
        for fold1_i, fold2_i in fold_info
    ]

    return np.array(averaged)
43
+
44
def reconstruct_sl_precisionMats(sl_precisions: np.ndarray, n_neighbor: int) -> np.ndarray:
    """
    Expand packed upper-triangle precision values (diagonal included) into full
    symmetric matrices of shape (n_neighbor, n_neighbor)

    :param sl_precisions(shape - #center, #source, #element): packed precision matrices
    :param n_neighbor: matrix size, i.e. the number of neighbors of a source

    return (shape - #center * #source, n_neighbor, n_neighbor)
    """
    n_center, n_source, n_element = sl_precisions.shape

    # Collapse (center, source) into a single batch axis
    flat = sl_precisions.reshape(n_center * n_source, n_element)

    # Upper-triangle positions, diagonal included
    rows, cols = np.triu_indices(n_neighbor, k = 0)

    full = np.zeros((flat.shape[0], n_neighbor, n_neighbor))
    full[:, rows, cols] = flat  # upper triangle
    full[:, cols, rows] = flat  # mirrored positions; the diagonal receives the same values again

    return full
@@ -0,0 +1,234 @@
1
+
2
+ from numba import cuda, jit
3
+ from numba.cuda.cudadrv.devicearray import DeviceNDArray
4
+
5
@cuda.jit
def calc_outerProduct(vec1, vec2, out):
    """
    Write the outer product of vec1 and vec2 into out.
    This is same as np.outer(vec1, vec2).

    Every launched thread writes the complete result; the work is not split
    across threads.

    :param vec1(np.array): vector1
    :param vec2(np.array): vector2
    :param out(cuda.cudadrv.devicearray.DeviceNDArray - shape: (#vec1_component, #vec2_component)): output array to store outer product result
    """
    n_row = vec1.shape[0]
    n_col = vec2.shape[0]

    for row in range(n_row):
        left = vec1[row]
        for col in range(n_col):
            out[row, col] = left * vec2[col]
20
+
21
@cuda.jit
def outer_sum(matrices, out):
    """
    Accumulate the outer product of every data row with itself

    Thread i handles matrices[i]: for each row it adds np.outer(row, row)
    onto out[i]. The result is added to whatever out already holds, so out
    is expected to be zero-initialised by the caller.

    :param matrices(np.array - shape: (#run, #data, #channel)): measurement matrices
    :param out(cuda.cudadrv.devicearray.DeviceNDArray - shape: (#run, #channel, #channel)): output array to store data after calculation
    """
    unit = cuda.grid(1)

    # Threads beyond the number of matrices have nothing to do
    if unit >= matrices.shape[0]:
        return

    n_data = matrices.shape[1]
    n_channel = matrices.shape[2]

    for data_i in range(n_data):
        for j in range(n_channel):
            left = matrices[unit, data_i, j]
            for k in range(n_channel):
                out[unit, j, k] += left * matrices[unit, data_i, k]
42
+
43
@cuda.jit
def outer_sum_square(matrices, out):
    """
    Accumulate the element-wise square of the outer product of every data row with itself

    Thread i handles matrices[i]: for each row it adds np.outer(row, row) ** 2
    onto out[i]. The result is added to whatever out already holds, so out
    is expected to be zero-initialised by the caller.

    :param matrices(np.array - shape: (#run, #data, #channel)): measurement matrices
    :param out(cuda.cudadrv.devicearray.DeviceNDArray - shape: (#run, #channel, #channel)): output array to store data after calculation
    """
    unit = cuda.grid(1)

    # Threads beyond the number of matrices have nothing to do
    if unit >= matrices.shape[0]:
        return

    n_data = matrices.shape[1]
    n_channel = matrices.shape[2]

    for data_i in range(n_data):
        for j in range(n_channel):
            left = matrices[unit, data_i, j]
            for k in range(n_channel):
                out[unit, j, k] += (left * matrices[unit, data_i, k]) ** 2
65
+
66
@cuda.jit
def scaling(out, lambs):
    """
    Set every off-diagonal entry of out[i] to (1 - lambs[i])

    Thread i handles out[i]; diagonal entries are left untouched.

    :param out(shape: (#matrix, #row, #column)): matrices modified in place
    :param lambs(shape: (#matrix)): one shrinkage intensity per matrix
    """
    i = cuda.grid(1)

    # The launch grid may hold more threads than matrices. Check the bound
    # before touching lambs so surplus threads never read past its end.
    if i >= out.shape[0]:
        return

    off_diagonal = 1 - lambs[i]

    nr = out.shape[1]
    nc = out.shape[2]

    for j in range(nr):
        for k in range(nc):
            if j != k:
                out[i, j, k] = off_diagonal
80
+
81
@cuda.jit(device=True, inline=True)
def matmul(a: DeviceNDArray,
           b: DeviceNDArray,
           out: DeviceNDArray):
    """
    Matrix multiplication a @ b, added onto out

    The products are accumulated into out, so out is expected to be
    zero-initialised by the caller.

    :param a(shape - 2d): 2d matrix
    :param b(shape - 2d): 2d matrix
    :param out(shape - 2d): output
    """
    n_row = a.shape[0]
    n_inner = a.shape[1]  # number of columns of a, used as the row count of b
    n_col = b.shape[1]

    for row in range(n_row):
        for col in range(n_col):
            for inner in range(n_inner):
                out[row, col] += a[row, inner] * b[inner, col]
99
+
100
def matmul2(a: DeviceNDArray,
            b: DeviceNDArray,
            out: DeviceNDArray):
    """
    Matrix multiplication - vector @ array(2d), added onto out

    The products are accumulated into out, so out is expected to be
    zero-initialised by the caller.

    :param a(shape: 1d): vector with one component per row of b
    :param b(shape: 2d): 2d array
    :param out(shape: 1d): output array with one component per column of b
    """
    n_row_B, n_col_B = b.shape

    # The result has one entry per COLUMN of b. Iterating over len(a) here
    # is only correct for square b: it raises IndexError when b has fewer
    # columns than rows and leaves entries unset when it has more.
    for col in range(n_col_B):
        for row in range(n_row_B):
            out[col] += a[row] * b[row, col]
116
+
117
@cuda.jit(device=True, inline=True)
def matmul_upperTmat(vector: DeviceNDArray,
                     upperTmat: DeviceNDArray,
                     mul_mapping: DeviceNDArray,
                     output: DeviceNDArray):
    """
    Multiply a vector with a matrix that is stored as a packed 1d array

    calculation: vector @ matrix, where matrix[row, col] is looked up as
    upperTmat[mul_mapping[row, col]]. The products are accumulated into
    output, so output is expected to be zero-initialised by the caller.

    :param vector(shape: #element): vector
    :param upperTmat(shape: #element * (#element + 1) / 2): packed upper triangle matrix including diagonal element
    :param mul_mapping(shape: (#element, #element)): position in upperTmat of every (row, column) entry
    :param output(shape: #element): output array
    """
    n_out = vector.shape[0]
    n_src = mul_mapping.shape[0]

    for out_i in range(n_out):
        for src_i in range(n_src):
            packed_i = mul_mapping[src_i, out_i]
            output[out_i] += vector[src_i] * upperTmat[packed_i]
135
+
136
@cuda.jit(device=True, inline=True)
def minus(a: DeviceNDArray,
          b: DeviceNDArray,
          out: DeviceNDArray):
    """
    Element-wise subtraction a - b

    :param a(shape - 1d): 1d array
    :param b(shape - 1d): 1d array, at least as long as a
    :param out(shape - 1d): output, overwritten with a - b
    """
    for idx in range(a.shape[0]):
        out[idx] = a[idx] - b[idx]
151
+
152
@cuda.jit(device=True, inline=True)
def dot(a1, a2, output, output_i):
    """
    Add the dot product of a1 and a2 onto output[output_i]

    The products are accumulated, so output[output_i] is expected to be
    zero-initialised by the caller.

    :param a1(shape - 1d): first vector
    :param a2(shape - 1d): second vector, at least as long as a1
    :param output: array receiving the result
    :param output_i: position in output to accumulate into
    """
    for idx in range(a1.shape[0]):
        output[output_i] += a1[idx] * a2[idx]
156
+
157
if __name__ == "__main__":
    # Manual smoke test for the functions above; requires a CUDA-capable device.
    # numpy is imported here because this module has no file-level numpy import.
    import numpy as np

    dummy_data = np.array([
        [
            [1,2,3],
            [4,5,6],
            [5,6,7],
        ],
        [
            [7,8,9],
            [0,1,2],
            [3,4,5],
        ],
    ])
    n_run, n_point, n_channel = dummy_data.shape

    # Outer product of two data rows (the output buffer must exist first)
    out = cuda.to_device(np.zeros((n_channel, n_channel)))
    calc_outerProduct[1,1](dummy_data[0][0], dummy_data[0][1], out)

    # Accumulated outer products - one thread per run so every run is processed
    out_sum_device = cuda.to_device(np.zeros((n_run, n_channel, n_channel)))
    outer_sum[1,n_run](dummy_data, out_sum_device)

    out_sum_device = cuda.to_device(np.zeros((n_run, n_channel, n_channel)))
    outer_sum_square[1,n_run](dummy_data, out_sum_device)

    # Matmul 1
    # matmul is a device function, so it has to be invoked from inside a kernel
    @cuda.jit
    def matmul_jit(a, b, out):
        matmul(a, b, out)

    input_ = cuda.to_device(np.array([[1,2,3]]))
    array = cuda.to_device(np.arange(1, 10).reshape(3,3))
    result = cuda.to_device(np.zeros((1,3)))  # (1, 3) @ (3, 3) -> (1, 3)
    matmul_jit[1,1](input_, array, result)

    # Matmul 2
    input_ = cuda.to_device(np.array([1,2,3]))
    array = cuda.to_device(np.arange(1, 10).reshape(3,3))
    result = cuda.to_device(np.zeros(3))
    matmul2(input_, array, result)

    # Matmul upper triangle mat
    vector = cuda.to_device(np.array([1,2,3]))
    n_element = vector.shape[0]
    r_, c_ = np.triu_indices(n_element, k = 0)
    upperTmat = np.array([
        [1,2,3],
        [2,4,5],
        [3,5,2],
    ])
    upperTmat = cuda.to_device(upperTmat[r_, c_])

    # mul_mapping[row, col] is the position of entry (row, col) in the packed array
    mul_mapping = np.zeros((n_element, n_element))
    mul_mapping[r_, c_] = np.arange(r_.shape[0])
    mul_mapping += mul_mapping.T
    idx = np.diag_indices(mul_mapping.shape[0])
    mul_mapping[idx] = mul_mapping[idx] / 2  # the diagonal was doubled by the transpose add
    mul_mapping = mul_mapping.astype(int)
    mul_mapping = cuda.to_device(mul_mapping)
    output = cuda.to_device(np.zeros(n_element))

    # matmul_upperTmat is a device function as well
    @cuda.jit
    def matmul_upperTmat_jit(vector, upperTmat, mul_mapping, output):
        matmul_upperTmat(vector, upperTmat, mul_mapping, output)

    matmul_upperTmat_jit[1,1](vector, upperTmat, mul_mapping, output)

    # Minus
    a = cuda.to_device(np.ones(3))
    b = cuda.to_device(np.ones(3) * 2)
    output = cuda.to_device(np.zeros(3))

    @cuda.jit
    def minus_jit(a, b, out):
        minus(a, b, out)
    minus_jit[1,1](a, b, output)

    # Dot product
    a1 = np.array([1,2,3])
    a2 = np.array([1,2,3])
    output = np.array([0])

    @cuda.jit
    def dot_test(a1, a2, output):
        dot(a1, a2, output, 0)
    dot_test[1,1](a1, a2, output)
@@ -2,12 +2,12 @@
2
2
  from numba import cuda, jit
3
3
 
4
4
  @cuda.jit
5
- def set_mask(neighbors, brain_1d_indexes, out):
5
+ def set_mask_gpu(neighbors, brain_1d_indexes, out):
6
6
  """
7
7
  Set neighbor mask(iterate over all neighbors)
8
8
 
9
9
  :param neighbors(np.array): list of neighbor , shape: (#center, #neighbor)
10
- :param brain_1d_indexes(np.array): , shape: #channel
10
+ :param brain_1d_indexes(np.array): 1d location converted from 3D brain coordinate (x,y,z) , shape: #channel
11
11
  :param out: masked_residual, output device memory , shape: (#center, #channel)
12
12
  """
13
13
  i = cuda.grid(1)
@@ -19,5 +19,12 @@ def set_mask(neighbors, brain_1d_indexes, out):
19
19
  for brain_i, brain_pos in enumerate(brain_1d_indexes):
20
20
  if brain_pos == neighbor_pos:
21
21
  out[i][brain_i] = 1
22
-
23
-
22
if __name__ == "__main__":
    # Manual smoke test for set_mask_gpu; requires a CUDA-capable device.
    # numpy is imported here so the script does not depend on a file-level import.
    import numpy as np

    n_split_data = 2024
    n_channel = 200000
    n_neighbor = 27
    n_thread_per_block = 1024

    # Synthetic inputs: every channel gets its own 1d index and every center
    # gets n_neighbor randomly drawn channel indexes.
    masking_indexes = np.arange(n_channel)
    neighbors = np.random.default_rng(0).integers(0, n_channel, size = (n_split_data, n_neighbor))

    n_block = int(np.ceil(n_split_data / n_thread_per_block))
    mask_out = cuda.to_device(np.zeros((n_split_data, n_channel)))
    target_neighbors = neighbors[0:0 + n_split_data, :]

    # The kernel is named set_mask_gpu; the former name set_mask no longer exists
    set_mask_gpu[n_block, n_thread_per_block](target_neighbors, masking_indexes, mask_out)