boostrsa 0.0.1.dev5__tar.gz → 0.0.1.dev8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {boostrsa-0.0.1.dev5/src/boostrsa.egg-info → boostrsa-0.0.1.dev8}/PKG-INFO +1 -1
- {boostrsa-0.0.1.dev5 → boostrsa-0.0.1.dev8}/setup.py +1 -3
- boostrsa-0.0.1.dev8/src/boostrsa/cores/cpgpu/stats.py +182 -0
- boostrsa-0.0.1.dev8/src/boostrsa/cores/cpu/mask.py +12 -0
- boostrsa-0.0.1.dev8/src/boostrsa/cores/cpu/matrix.py +66 -0
- boostrsa-0.0.1.dev8/src/boostrsa/cores/gpu/basic_operations.py +234 -0
- {boostrsa-0.0.1.dev5 → boostrsa-0.0.1.dev8}/src/boostrsa/cores/gpu/mask.py +11 -4
- boostrsa-0.0.1.dev8/src/boostrsa/cores/gpu/matrix.py +230 -0
- boostrsa-0.0.1.dev8/src/boostrsa/searchlight.py +929 -0
- {boostrsa-0.0.1.dev5 → boostrsa-0.0.1.dev8/src/boostrsa.egg-info}/PKG-INFO +1 -1
- {boostrsa-0.0.1.dev5 → boostrsa-0.0.1.dev8}/src/boostrsa.egg-info/SOURCES.txt +1 -0
- boostrsa-0.0.1.dev5/src/boostrsa/cores/cpgpu/stats.py +0 -130
- boostrsa-0.0.1.dev5/src/boostrsa/cores/cpu/matrix.py +0 -44
- boostrsa-0.0.1.dev5/src/boostrsa/cores/gpu/basic_operations.py +0 -61
- boostrsa-0.0.1.dev5/src/boostrsa/cores/gpu/matrix.py +0 -125
- boostrsa-0.0.1.dev5/src/boostrsa/searchlight.py +0 -233
- {boostrsa-0.0.1.dev5 → boostrsa-0.0.1.dev8}/LICENSE.txt +0 -0
- {boostrsa-0.0.1.dev5 → boostrsa-0.0.1.dev8}/README.md +0 -0
- {boostrsa-0.0.1.dev5 → boostrsa-0.0.1.dev8}/setup.cfg +0 -0
- {boostrsa-0.0.1.dev5 → boostrsa-0.0.1.dev8}/src/boostrsa/__init__.py +0 -0
- {boostrsa-0.0.1.dev5 → boostrsa-0.0.1.dev8}/src/boostrsa/boostrsa_types.py +0 -0
- {boostrsa-0.0.1.dev5 → boostrsa-0.0.1.dev8}/src/boostrsa/cores/__init__.py +0 -0
- {boostrsa-0.0.1.dev5 → boostrsa-0.0.1.dev8}/src/boostrsa/cores/cpgpu/__init__.py +0 -0
- {boostrsa-0.0.1.dev5 → boostrsa-0.0.1.dev8}/src/boostrsa/cores/cpu/__init__.py +0 -0
- {boostrsa-0.0.1.dev5 → boostrsa-0.0.1.dev8}/src/boostrsa/cores/gpu/__init__.py +0 -0
- {boostrsa-0.0.1.dev5 → boostrsa-0.0.1.dev8}/src/boostrsa.egg-info/dependency_links.txt +0 -0
- {boostrsa-0.0.1.dev5 → boostrsa-0.0.1.dev8}/src/boostrsa.egg-info/requires.txt +0 -0
- {boostrsa-0.0.1.dev5 → boostrsa-0.0.1.dev8}/src/boostrsa.egg-info/top_level.txt +0 -0
- {boostrsa-0.0.1.dev5 → boostrsa-0.0.1.dev8}/tests/test_module1.py +0 -0
|
@@ -1,6 +1,4 @@
|
|
|
1
1
|
|
|
2
|
-
import subprocess
|
|
3
|
-
import re
|
|
4
2
|
from setuptools import setup, find_packages
|
|
5
3
|
|
|
6
4
|
with open("README.md", "r") as fh:
|
|
@@ -8,7 +6,7 @@ with open("README.md", "r") as fh:
|
|
|
8
6
|
|
|
9
7
|
setup(
|
|
10
8
|
name = "boostrsa",
|
|
11
|
-
version = "0.0.
|
|
9
|
+
version = "0.0.1dev8",
|
|
12
10
|
author = "seojin",
|
|
13
11
|
author_email = "pures1@hanyang.ac.kr",
|
|
14
12
|
description = "This is toolbox for boosting calculation speed using GPU",
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
|
|
2
|
+
import os
|
|
3
|
+
import sys
|
|
4
|
+
import numpy as np
|
|
5
|
+
import cupy as cp
|
|
6
|
+
from numba import cuda, jit
|
|
7
|
+
|
|
8
|
+
if os.getenv("boostrsa_isRunSource"):
|
|
9
|
+
sys.path.append(os.getenv("boostrsa_source_home"))
|
|
10
|
+
from boostrsa_types import ShrinkageMethod
|
|
11
|
+
from cores.gpu.basic_operations import outer_sum_square, outer_sum
|
|
12
|
+
from cores.gpu.matrix import diag, eyes
|
|
13
|
+
from cores.gpu.basic_operations import scaling
|
|
14
|
+
else:
|
|
15
|
+
from boostrsa.boostrsa_types import ShrinkageMethod
|
|
16
|
+
from boostrsa.cores.gpu.basic_operations import outer_sum_square, outer_sum
|
|
17
|
+
from boostrsa.cores.gpu.matrix import diag, eyes
|
|
18
|
+
from boostrsa.cores.gpu.basic_operations import scaling
|
|
19
|
+
|
|
20
|
+
def _covariance_eye(residuals: np.ndarray,
                    threads_per_block: int = 1024,
                    dtype = np.float32):
    """
    Compute a shrinkage estimate of the sample covariance matrix, shrinking
    towards a scaled identity matrix.

    **The residuals must be demeaned beforehand.**

    Ledoit and Wolf (2004): "A well-conditioned estimator for large-dimensional covariance matrices"

    :param residuals: residual data after processing raw data, shape: (#run * #center, #point, #channel)
    :param threads_per_block: #thread per GPU block
    :param dtype: data type of the GPU accumulation buffers

    return (np.ndarray) - shrunk covariance matrices, shape: (#run * #center, #channel, #channel)
    """
    # Constant
    n_processing_unit = len(residuals)
    n_point = residuals.shape[1]
    n_channel = residuals.shape[2]

    # One GPU thread per processing unit
    n_block = int(np.ceil(n_processing_unit / threads_per_block))

    # Sum of outer products over all points.
    # GPU memory capacity - (#run * #center, #channel, #channel)
    out_sum_device = cuda.to_device(np.zeros((n_processing_unit, n_channel, n_channel), dtype = dtype))
    outer_sum[n_block, threads_per_block](residuals, out_sum_device)
    outer_sum_result = out_sum_device.copy_to_host()
    del out_sum_device
    cuda.synchronize()

    # Sum of squared outer products over all points.
    # GPU memory capacity - (#run * #center, #channel, #channel)
    out_sum_square_device = cuda.to_device(np.zeros((n_processing_unit, n_channel, n_channel), dtype = dtype))
    outer_sum_square[n_block, threads_per_block](residuals, out_sum_square_device)
    outer_sum_square_result = out_sum_square_device.copy_to_host()
    del out_sum_square_device
    cuda.synchronize()

    # b2
    s = outer_sum_result / n_point
    s2 = outer_sum_square_result / n_point
    b2 = np.sum(s2 - s * s, axis = (1, 2)) / n_point

    # calculate the scalar estimators to find the optimal shrinkage:
    # m, d^2, b^2 as in Ledoit & Wolf paper
    # m  - shape: (n_processing_unit)
    # d2 - shape: (n_processing_unit)
    # b2 - shape: (n_processing_unit)
    # A single identity matrix is broadcast over the processing units instead
    # of materialising one copy per unit.
    eye = np.eye(n_channel)[np.newaxis, :, :]

    diag_s = np.diagonal(s, axis1 = 1, axis2 = 2)
    m = np.sum(diag_s, axis = 1) / n_channel
    d2 = np.sum((s - m[:, None, None] * eye) ** 2, axis = (1, 2))

    b2 = np.minimum(d2, b2)

    # Shrinkage weights. When d2 == 0 the sample covariance already equals the
    # target m * I, so the weights are set to (0, 1) instead of producing 0 / 0 = NaN.
    has_spread = d2 > 0
    weight_target = np.divide(b2, d2, out = np.zeros_like(d2), where = has_spread)
    weight_sample = np.divide(d2 - b2, d2, out = np.ones_like(d2), where = has_spread)

    # shrink covariance matrix
    s_shrink = (weight_target * m)[:, None, None] * eye + weight_sample[:, None, None] * s

    # correction for degrees of freedom
    dof = n_point - 1
    s_shrink = s_shrink * n_point / dof

    return s_shrink
|
|
90
|
+
|
|
91
|
+
def _covariance_diag(residuals: np.ndarray,
                     threads_per_block: int = 1024,
                     dtype = np.float32):
    """
    Compute a shrinkage estimate of the covariance matrix in which only the
    off-diagonal elements are scaled (the diagonal is kept as is).

    Schäfer, J., & Strimmer, K. (2005). "A Shrinkage Approach to Large-Scale Covariance Matrix Estimation and Implications for Functional Genomics."

    :param residuals: residual data after processing raw data, shape: (#run * #center, #point, #channel)
    :param threads_per_block: #thread per GPU block
    :param dtype: data type of the GPU accumulation buffers

    return (np.ndarray) - shrunk covariance matrices, shape: (#run * #center, #channel, #channel)
    """
    # Constant
    n_processing_unit = len(residuals)
    n_point = residuals.shape[1]
    n_channel = residuals.shape[2]

    # One GPU thread per processing unit
    n_block = int(np.ceil(n_processing_unit / threads_per_block))

    # Sum of outer products over all points.
    # GPU memory capacity: (shape: #run * #center * #channel * #channel)
    out_sum_device = cuda.to_device(np.zeros((n_processing_unit, n_channel, n_channel), dtype = dtype))
    outer_sum[n_block, threads_per_block](residuals, out_sum_device)
    outer_sum_result = out_sum_device.copy_to_host()
    del out_sum_device
    cuda.synchronize()

    # Sum of squared outer products over all points.
    # GPU memory capacity: (shape: #run * #center * #channel * #channel)
    out_sum_square_device = cuda.to_device(np.zeros((n_processing_unit, n_channel, n_channel), dtype = dtype))
    outer_sum_square[n_block, threads_per_block](residuals, out_sum_square_device)
    outer_sum_square_result = out_sum_square_device.copy_to_host()
    del out_sum_square_device
    cuda.synchronize()

    # s
    dof = n_point - 1
    s = outer_sum_result / dof

    # Variance per channel and processing unit, produced by the `diag` kernel from s.
    # GPU memory capacity: (shape: #run * #center * #channel * dataType)
    stack_var_device = cuda.to_device(np.zeros((n_processing_unit, n_channel)))
    diag[n_block, threads_per_block](s, stack_var_device)
    stack_var = stack_var_device.copy_to_host()
    del stack_var_device
    cuda.synchronize()

    # std
    stack_std = np.sqrt(stack_var)

    # sum mean
    stack_s_mean = outer_sum_result / np.expand_dims(stack_std, 1) / np.expand_dims(stack_std, 2) / dof

    # s2 mean
    stack_s2_mean = outer_sum_square_result / np.expand_dims(stack_var, 1) / np.expand_dims(stack_var, 2) / dof

    # var_hat
    stack_var_hat = n_point / dof ** 2 * (stack_s2_mean - stack_s_mean ** 2)

    # mask selecting the off-diagonal elements
    mask = ~np.eye(n_channel, dtype = bool)

    # Scaling matrices, initialised by the `eyes` kernel.
    # GPU memory capacity: (shape: #run * #center * #channel * #channel * dataType)
    stack_scaling_mats_device = cuda.to_device(np.zeros((n_processing_unit, n_channel, n_channel), dtype = dtype))
    eyes[n_block, threads_per_block](stack_scaling_mats_device)
    cuda.synchronize()

    # Shrinkage intensity per processing unit, clamped to [0, 1]
    stack_lamb = np.sum(stack_var_hat[:, mask], axis = 1) / np.sum(stack_s_mean[:, mask] ** 2, axis = 1)
    stack_lamb_device = cp.maximum(cp.minimum(cp.array(stack_lamb), 1), 0)
    scaling[n_block, threads_per_block](stack_scaling_mats_device, stack_lamb_device)

    # Bring the scaling matrices back to the host explicitly before combining
    # them with the host array `s`, then release the device buffers.
    stack_scaling_mats = stack_scaling_mats_device.copy_to_host()
    del stack_scaling_mats_device
    del stack_lamb_device
    cuda.synchronize()

    stack_s_shrink = s * stack_scaling_mats

    return stack_s_shrink
|
|
181
|
+
|
|
182
|
+
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
|
|
2
|
+
import numpy as np
|
|
3
|
+
|
|
4
|
+
def set_mask_cpu(neighbors, brain_1d_indexes):
    """
    Build one binary mask per searchlight center (iterates over all centers).

    :param neighbors(np.array - shape: (#center, #neighbor)): list of neighbor
    :param brain_1d_indexes(np.array - shape: (#channel)): 1d location index converted from 3D brain coordinate (x,y,z)

    return (np.array - shape: (#center, #channel)): 1 where the channel's index is one of the center's neighbors, 0 elsewhere
    """
    masks = []
    for center_neighbors in neighbors:
        is_neighbor = np.isin(brain_1d_indexes, center_neighbors)
        masks.append(np.where(is_neighbor, 1, 0))

    return np.array(masks)
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
|
|
2
|
+
import numpy as np
|
|
3
|
+
|
|
4
|
+
def convert_1d_to_symmertic(a_1d, size, k = 0, dtype = np.float32):
    """
    Convert 1d array to symmetric matrix

    The values are written into the upper triangle (starting at diagonal
    offset k) in row-major order and mirrored into the lower triangle.

    :param a_1d(1d array): packed upper-triangle values
    :param size: matrix size
    :param k(int): diagonal offset of the packed triangle (0 includes the main diagonal,
                   1 starts above it). Expected to be >= 0.
    :param dtype: data type of the returned matrix

    return (np.array) - shape: (size, size)
    """
    # put it back into a 2D symmetric array
    X = np.zeros((size, size), dtype = dtype)

    # Honour the requested offset (previously hard-coded to 0, which made k a no-op)
    X[np.triu_indices(size, k = k)] = a_1d

    # Mirror the upper triangle; subtract the diagonal once so it is not doubled
    X = X + X.T - np.diag(np.diag(X))

    return X
|
|
22
|
+
|
|
23
|
+
def mean_fold_variance(variances, fold_info):
    """
    Average the variances of the two members of every fold pair.

    :param variances: variances (#run, #cov.shape); each entry is indexed by fold index
    :param fold_info(2d array): fold information - [[fold1, fold2], ...]

    return (np.array) - one averaged entry per (variances entry, fold pair),
                        ordered entry-major and then in fold_info order
    """
    averaged = [
        (per_entry[first_fold] + per_entry[second_fold]) / 2
        for per_entry in variances
        for first_fold, second_fold in fold_info
    ]

    return np.array(averaged)
|
|
43
|
+
|
|
44
|
+
def reconstruct_sl_precisionMats(sl_precisions: np.ndarray, n_neighbor: int) -> np.ndarray:
    """
    Rebuild full symmetric precision matrices from their packed upper triangles.

    Each packed vector holds the upper triangle (diagonal included, row-major
    order as produced by np.triu_indices) of an n_neighbor x n_neighbor matrix.

    :param sl_precisions(shape - #center, #source, #element): array of packed precision matrices
    :param n_neighbor: a number of neighbor of source from precision matrix

    return (shape - #center * #source, n_neighbor, n_neighbor)
    """
    n_center, n_source, n_element = sl_precisions.shape
    n_matrix = n_center * n_source

    # Flatten the (center, source) axes so every row is one packed matrix
    flat = sl_precisions.reshape(n_matrix, n_element)

    # Upper-triangle coordinates, diagonal included
    rows, cols = np.triu_indices(n_neighbor, k = 0)

    full = np.zeros((n_matrix, n_neighbor, n_neighbor))
    full[:, rows, cols] = flat  # upper triangle
    full[:, cols, rows] = flat  # mirrored lower triangle (diagonal rewritten with the same values)

    return full
|
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
|
|
2
|
+
from numba import cuda, jit
|
|
3
|
+
from numba.cuda.cudadrv.devicearray import DeviceNDArray
|
|
4
|
+
|
|
5
|
+
@cuda.jit
def calc_outerProduct(vec1, vec2, out):
    """
    Write the outer product of vec1 and vec2 into out.
    The result matches np.outer(vec1, vec2).

    The thread index is not used: every launched thread writes the complete
    result, so a launch configuration of [1, 1] is sufficient.

    :param vec1(np.array): vector1
    :param vec2(np.array): vector2
    :param out(cuda.cudadrv.devicearray.DeviceNDArray - shape: (#vec1_component, #vec2_component)): output array to store outer product result
    """
    n_row = len(vec1)
    n_col = len(vec2)

    for row in range(n_row):
        for col in range(n_col):
            out[row, col] = vec1[row] * vec2[col]
|
|
20
|
+
|
|
21
|
+
@cuda.jit
def outer_sum(matrices, out):
    """
    Accumulate, per matrix, the outer product of every row with itself.

    For matrix i this adds sum_p np.outer(matrices[i][p], matrices[i][p])
    onto out[i]. One thread handles one matrix; out is accumulated into,
    so it must be zero-initialised by the caller.

    :param matrices(np.array - shape: (#run, #data, #channel)): measurement matrices
    :param out(cuda.cudadrv.devicearray.DeviceNDArray - shape: (#run, #channel, #channel)): output array to store data after calculation
    """
    unit = cuda.grid(1)

    # Surplus threads of the last block have nothing to do
    if unit >= matrices.shape[0]:
        return

    n_point = matrices.shape[1]
    n_channel = matrices.shape[2]

    for point in range(n_point):
        for j in range(n_channel):
            for k in range(n_channel):
                out[unit, j, k] += matrices[unit, point, j] * matrices[unit, point, k]
|
|
42
|
+
|
|
43
|
+
@cuda.jit
def outer_sum_square(matrices, out):
    """
    Accumulate, per matrix, the element-wise square of the outer product of
    every row with itself.

    For matrix i this adds sum_p np.outer(matrices[i][p], matrices[i][p]) ** 2
    onto out[i]. One thread handles one matrix; out is accumulated into,
    so it must be zero-initialised by the caller.

    :param matrices(np.array - shape: (#run, #data, #channel)): measurement matrices
    :param out(cuda.cudadrv.devicearray.DeviceNDArray - shape: (#run, #channel, #channel)): output array to store data after calculation
    """
    unit = cuda.grid(1)

    # Surplus threads of the last block have nothing to do
    if unit >= matrices.shape[0]:
        return

    n_point = matrices.shape[1]
    n_channel = matrices.shape[2]

    for point in range(n_point):
        for j in range(n_channel):
            for k in range(n_channel):
                out[unit, j, k] += (matrices[unit, point, j] * matrices[unit, point, k]) ** 2
|
|
65
|
+
|
|
66
|
+
@cuda.jit
def scaling(out, lambs):
    """
    Set every off-diagonal element of out[i] to (1 - lambs[i]).

    Diagonal elements are left untouched. One thread handles one matrix.

    :param out(cuda.cudadrv.devicearray.DeviceNDArray - shape: (#matrix, #row, #col)): matrices updated in place
    :param lambs(shape: (#matrix)): one shrinkage value per matrix
    """
    i = cuda.grid(1)

    # Check the bound before touching lambs: surplus threads of the last block
    # would otherwise read past the end of the array.
    if i < out.shape[0]:
        off_diagonal = 1 - lambs[i]

        nr = out.shape[1]
        nc = out.shape[2]

        for j in range(nr):
            for k in range(nc):
                if j != k:
                    out[i, j, k] = off_diagonal
|
|
80
|
+
|
|
81
|
+
@cuda.jit(device=True, inline=True)
def matmul(a: DeviceNDArray,
           b: DeviceNDArray,
           out: DeviceNDArray):
    """
    Device function: accumulate the matrix product a @ b into out.

    The product is added onto the existing content of out, so the caller
    must zero out beforehand to obtain the plain product.

    :param a(shape - 2d): 2d matrix
    :param b(shape - 2d): 2d matrix
    :param out(shape - 2d): output
    """
    n_row, n_inner = a.shape
    n_col = b.shape[1]

    for row in range(n_row):
        for col in range(n_col):
            for inner in range(n_inner):
                out[row, col] += a[row, inner] * b[inner, col]
|
|
99
|
+
|
|
100
|
+
def matmul2(a: "DeviceNDArray",
            b: "DeviceNDArray",
            out: "DeviceNDArray"):
    """
    Matrix multiplication - vector @ array(2d)

    The product is added onto the existing content of out, so the caller
    must zero out beforehand to obtain the plain product.

    NOTE(review): unlike the neighbouring helpers this function carries no
    @cuda.jit decorator and therefore runs as plain Python - confirm whether
    that is intended.

    :param a(shape: (#row of b)): vector
    :param b(shape: 2d): 2d array
    :param out(shape: (#column of b)): output array
    """
    n_row_B, n_col_B = b.shape

    # One output component per COLUMN of b. Bounding this loop by len(a)
    # is only correct when b happens to be square.
    for i in range(n_col_B):
        for j in range(n_row_B):
            out[i] += a[j] * b[j, i]
|
|
116
|
+
|
|
117
|
+
@cuda.jit(device=True, inline=True)
def matmul_upperTmat(vector: DeviceNDArray,
                     upperTmat: DeviceNDArray,
                     mul_mapping: DeviceNDArray,
                     output: DeviceNDArray):
    """
    Device function: multiply a vector with a symmetric matrix that is stored
    as a packed upper triangle.

    calculation: vector @ upperTmat

    The product is added onto the existing content of output, so the caller
    must zero output beforehand to obtain the plain product.

    :param vector(shape : #element): vector
    :param upperTmat(1d): packed upper triangle of the matrix, including the diagonal elements
    :param mul_mapping(shape: (#element, #element)): for every (row, column) position, the index into upperTmat holding that value
    :param output(shape: #element): output array
    """
    n_out = len(vector)
    n_term = len(mul_mapping)

    for col in range(n_out):
        for row in range(n_term):
            packed_index = mul_mapping[row, col]
            output[col] += vector[row] * upperTmat[packed_index]
|
|
135
|
+
|
|
136
|
+
@cuda.jit(device=True, inline=True)
def minus(a: DeviceNDArray,
          b: DeviceNDArray,
          out: DeviceNDArray):
    """
    Device function: element-wise subtraction, out = a - b.

    :param a(shape - 1d): minuend
    :param b(shape - 1d): subtrahend, read at the same indices as a
    :param out(shape - 1d): output, overwritten
    """
    for idx in range(len(a)):
        out[idx] = a[idx] - b[idx]
|
|
151
|
+
|
|
152
|
+
@cuda.jit(device=True, inline=True)
def dot(a1, a2, output, output_i):
    """
    Device function: add the dot product of a1 and a2 onto output[output_i].

    The slot is accumulated into, so the caller must zero it beforehand to
    obtain the plain dot product.

    :param a1(shape - 1d): first vector
    :param a2(shape - 1d): second vector, read at the same indices as a1
    :param output: array receiving the result
    :param output_i: index of the slot in output to accumulate into
    """
    n_term = len(a1)

    for idx in range(n_term):
        output[output_i] += a1[idx] * a2[idx]
|
|
156
|
+
|
|
157
|
+
if __name__ == "__main__":
    # Manual smoke test of the helpers above; requires a CUDA-capable GPU.
    # numpy is imported here because this module does not import it at the top level.
    import numpy as np

    dummy_data = np.array([
        [
            [1, 2, 3],
            [4, 5, 6],
            [5, 6, 7],
        ],
        [
            [7, 8, 9],
            [0, 1, 2],
            [3, 4, 5],
        ],
    ])
    n_run, n_point, n_channel = dummy_data.shape

    # Outer product (the output buffer was previously never allocated)
    out = cuda.to_device(np.zeros((n_channel, n_channel)))
    calc_outerProduct[1, 1](dummy_data[0][0], dummy_data[0][1], out)

    # Outer sum - one thread per run
    out_sum_device = cuda.to_device(np.zeros((n_run, n_channel, n_channel)))
    outer_sum[1, n_run](dummy_data, out_sum_device)

    # Outer sum square - one thread per run
    out_sum_device = cuda.to_device(np.zeros((n_run, n_channel, n_channel)))
    outer_sum_square[1, n_run](dummy_data, out_sum_device)

    # Matmul 1
    # matmul is a device function and cannot be called from the host,
    # so it is wrapped in a kernel.
    @cuda.jit
    def matmul_jit(a, b, out):
        matmul(a, b, out)

    input_ = cuda.to_device(np.array([[1, 2, 3]]))
    array = cuda.to_device(np.arange(1, 10).reshape(3, 3))
    result = cuda.to_device(np.zeros((1, 3)))
    matmul_jit[1, 1](input_, array, result)

    # Matmul 2
    # matmul2 is a plain Python function, so it is exercised with host arrays.
    input_ = np.array([1, 2, 3])
    array = np.arange(1, 10).reshape(3, 3)
    result = np.zeros(3)
    matmul2(input_, array, result)

    # Matmul upper triangle mat
    vector = cuda.to_device(np.array([1, 2, 3]))
    n_element = vector.shape[0]
    r_, c_ = np.triu_indices(n_element, k = 0)
    upperTmat = np.array([
        [1, 2, 3],
        [2, 4, 5],
        [3, 5, 2],
    ])
    upperTmat = cuda.to_device(upperTmat[r_, c_])

    # mul_mapping[r, c] is the index into the packed triangle holding element (r, c);
    # the diagonal is halved because adding the transpose doubles it.
    mul_mapping = np.zeros((n_element, n_element))
    mul_mapping[r_, c_] = np.arange(r_.shape[0])
    mul_mapping += mul_mapping.T
    idx = np.diag_indices(mul_mapping.shape[0])
    mul_mapping[idx] = mul_mapping[idx] / 2
    mul_mapping = mul_mapping.astype(int)
    mul_mapping = cuda.to_device(mul_mapping)
    output = cuda.to_device(np.zeros(n_element))

    # Device function, wrapped in a kernel for the same reason as matmul
    @cuda.jit
    def matmul_upperTmat_jit(vector, upperTmat, mul_mapping, output):
        matmul_upperTmat(vector, upperTmat, mul_mapping, output)

    matmul_upperTmat_jit[1, 1](vector, upperTmat, mul_mapping, output)

    # Minus
    a = cuda.to_device(np.ones(3))
    b = cuda.to_device(np.ones(3) * 2)
    output = cuda.to_device(np.zeros(3))

    @cuda.jit
    def minus_jit(a, b, out):
        minus(a, b, out)

    minus_jit[1, 1](a, b, output)

    # Dot product
    a1 = np.array([1, 2, 3])
    a2 = np.array([1, 2, 3])
    output = np.array([0])

    @cuda.jit
    def dot_test(a1, a2, output):
        dot(a1, a2, output, 0)

    dot_test[1, 1](a1, a2, output)
|
|
@@ -2,12 +2,12 @@
|
|
|
2
2
|
from numba import cuda, jit
|
|
3
3
|
|
|
4
4
|
@cuda.jit
|
|
5
|
-
def
|
|
5
|
+
def set_mask_gpu(neighbors, brain_1d_indexes, out):
|
|
6
6
|
"""
|
|
7
7
|
Set neighbor mask(iterate over all neighbors)
|
|
8
8
|
|
|
9
9
|
:param neighbors(np.array): list of neighbor , shape: (#center, #neighbor)
|
|
10
|
-
:param brain_1d_indexes(np.array): , shape: #channel
|
|
10
|
+
:param brain_1d_indexes(np.array): 1d location converted from 3D brain coordinate (x,y,z) , shape: #channel
|
|
11
11
|
:param out: masked_residual, output device memory , shape: (#center, #channel)
|
|
12
12
|
"""
|
|
13
13
|
i = cuda.grid(1)
|
|
@@ -19,5 +19,12 @@ def set_mask(neighbors, brain_1d_indexes, out):
|
|
|
19
19
|
for brain_i, brain_pos in enumerate(brain_1d_indexes):
|
|
20
20
|
if brain_pos == neighbor_pos:
|
|
21
21
|
out[i][brain_i] = 1
|
|
22
|
-
|
|
23
|
-
|
|
22
|
+
if __name__ == "__main__":
|
|
23
|
+
n_split_data = 2024
|
|
24
|
+
n_channel = 200000
|
|
25
|
+
n_thread_per_block = 1024
|
|
26
|
+
|
|
27
|
+
n_block = int(np.ceil(n_split_data / n_thread_per_block))
|
|
28
|
+
mask_out = cuda.to_device(np.zeros((n_split_data, n_channel)))
|
|
29
|
+
target_neighbors = neighbors[0:0 + n_split_data, :]
|
|
30
|
+
set_mask[n_block, n_thread_per_block](target_neighbors, masking_indexes, mask_out)
|