ipss 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ipss/__init__.py +1 -0
- ipss/ipss.py +343 -0
- ipss-0.3.0.dist-info/METADATA +103 -0
- ipss-0.3.0.dist-info/RECORD +8 -0
- ipss-0.3.0.dist-info/WHEEL +5 -0
- ipss-0.3.0.dist-info/top_level.txt +1 -0
- src/__init__.py +0 -0
- src/ipss.py +343 -0
ipss/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .ipss import ipss
|
ipss/ipss.py
ADDED
|
@@ -0,0 +1,343 @@
|
|
|
1
|
+
# Integrated path stability selection (IPSS)
|
|
2
|
+
|
|
3
|
+
import warnings
|
|
4
|
+
|
|
5
|
+
from joblib import Parallel, delayed
|
|
6
|
+
import numpy as np
|
|
7
|
+
from scipy.sparse import csr_matrix
|
|
8
|
+
from sklearn.linear_model import lars_path, Lasso, lasso_path, LogisticRegression
|
|
9
|
+
from sklearn.preprocessing import StandardScaler
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
#--------------------------------
|
|
13
|
+
# IPSS regression
|
|
14
|
+
#--------------------------------
|
|
15
|
+
'''
|
|
16
|
+
Inputs:
|
|
17
|
+
X: n-by-p data matrix (n = number of samples, p = number of features)
|
|
18
|
+
y: n-by-1 response vector (binary or continuous)
|
|
19
|
+
EFP: Target value for expected number of false positives
|
|
20
|
+
cutoff: Positive scalar C that, together with EFP, determines IPSS threshold
|
|
21
|
+
B: Number of subsampling steps
|
|
22
|
+
n_alphas: Number of values in grid of regularization parameters
|
|
23
|
+
q_max: Maximum number of features selected
|
|
24
|
+
Z_sparse: n_alphas-by-B-by-p tensor of subsamples, Z, is output as sparse if 'True'
|
|
25
|
+
lars: Uses least angle regression (LARS) for linear regression if 'True' or lasso if 'False'
|
|
26
|
+
selection_function: Function to apply to the estimated selection probabilities. If equal to
|
|
27
|
+
an integer, m, then function is h_m(x) = (2x - 1)**m if x >= 0.5 and 0 if x < 0.5
|
|
28
|
+
with_stability: Uses stability measure if 'True'
|
|
29
|
+
delta: Scalar value that determines scaling of regularization interval. delta = 1 corresponds
|
|
30
|
+
to log scale, delta = 0 corresponds to linear scale
|
|
31
|
+
'''
|
|
32
|
+
def ipss(X, y,
		EFP=1,
		cutoff=0.05,
		B=50,
		n_alphas=25,
		q_max=None,
		Z_sparse=False,
		lars=False,
		selection_function=None,
		with_stability=False,
		delta=1,
		standardize_X=True,
		center_y=True
		):
	"""Integrated path stability selection (IPSS) for l1-regularized regression.

	Automatically detects whether y is binary (logistic regression) or
	continuous (linear regression), estimates selection probabilities over a
	grid of regularization strengths via repeated half-sample subsampling, and
	selects the features whose IPSS scores exceed the threshold implied by
	EFP and cutoff.

	Args:
		X: n-by-p data matrix (n samples, p features).
		y: Response array of shape (n,) or (n, 1), binary or continuous.
		EFP: Target expected number of false positives.
		cutoff: Positive scalar that, together with EFP, sets the threshold.
		B: Number of subsampling steps (each step yields two half-samples).
		n_alphas: Number of values in the regularization grid.
		q_max: Maximum number of features selected; defaults to p / 2.
		Z_sparse: If True, convert each alpha-slice of the subsample tensor Z
			to a scipy CSR matrix before scoring.
		lars: If True, use least angle regression for the linear case;
			otherwise use the lasso path.
		selection_function: Function applied to selection probabilities, or an
			int/float m meaning h_m(x) = (2x - 1)**m for x > 0.5 and 0 below.
			If None, defaults to 2 (binary y) or 3 (continuous y).
		with_stability: If True, weight scores by a stability measure.
		delta: Scaling of the regularization interval (1 = log, 0 = linear).
		standardize_X: If True, standardize the columns of X.
		center_y: If True, center a continuous y before fitting.

	Returns:
		dict with keys 'alphas', 'average_select', 'scores',
		'selected_features', 'stability_paths', 'stop_index', 'threshold'.

	Raises:
		ValueError: If y does not have shape (n,) or (n, 1).
	"""
	if len(y.shape) != 1:
		if y.shape[1] == 1:
			y = y.ravel()
		else:
			raise ValueError("Error: Response y must be a numpy array with shape (n,) or (n,1)")

	if standardize_X:
		X = StandardScaler().fit_transform(X)

	n, p = X.shape
	n_split = int(n/2)

	# check if response is binary
	binary_response = (len(np.unique(y)) == 2)

	# maximum number of features
	if q_max is None:
		q_max = p / 2

	# compute alphas
	alphas = compute_alphas(X, y, n_alphas, q_max, binary_response)

	# linear regression
	if not binary_response:

		if center_y:
			# BUG FIX: out-of-place subtraction. `y -= np.mean(y)` mutated the
			# caller's array in place and raises a casting error when y has an
			# integer dtype.
			y = y - np.mean(y)

		if lars:
			def process_b(b):
				# one subsampling step: split into two random halves and record
				# which features LARS selects nearest to each grid alpha
				indices = np.arange(n)
				np.random.shuffle(indices)

				z = np.empty((n_alphas, 2, p))

				for half in range(2):
					idx = indices[:n_split] if half == 0 else indices[n_split:]
					X_half, y_half = X[idx,:], y[idx]

					with warnings.catch_warnings():
						warnings.simplefilter('ignore')
						lars_alphas, _, coefs = lars_path(X_half, y_half, method='lasso')

					for i, alpha in enumerate(alphas):
						# nearest point on the LARS path to the requested alpha
						idx_alpha = np.abs(lars_alphas - alpha).argmin()
						coef = coefs[:, idx_alpha]

						z[i, half, :] = (coef != 0).astype(int)

				return z

		else:
			def process_b(b):
				# one subsampling step using the lasso path on each half
				indices = np.arange(n)
				np.random.shuffle(indices)

				z = np.empty((n_alphas, 2, p))

				for half in range(2):
					idx = indices[:n_split] if half == 0 else indices[n_split:]
					X_half, y_half = X[idx,:], y[idx]

					with warnings.catch_warnings():
						warnings.simplefilter('ignore')
						_, coef, _ = lasso_path(X_half, y_half, alphas=alphas)
					z[:, half, :] = (coef.T != 0).astype(int)

				return z

	# logistic regression
	else:
		def process_b(b):
			# one subsampling step fitting l1 logistic regression at each alpha
			indices = np.arange(n)
			np.random.shuffle(indices)

			z = np.empty((n_alphas, 2, p))

			# warm_start lets successive fits along the alpha path reuse the
			# previous solution
			model = LogisticRegression(penalty='l1', solver='saga', tol=1e-3, warm_start=True, class_weight='balanced')

			for half in range(2):
				idx = indices[:n_split] if half == 0 else indices[n_split:]
				X_half, y_half = X[idx,:], y[idx]

				for i, alpha in enumerate(alphas):

					model.set_params(C=1/alpha)
					# (the unused `record=True) as w` capture was removed)
					with warnings.catch_warnings():
						warnings.simplefilter('ignore')
						fit = model.fit(X_half, y_half.ravel())
					z[i, half, :] = (fit.coef_ != 0).astype(int)

			return z

	# parallelize subsampling across multiple CPU cores
	results = np.array(Parallel(n_jobs=-1)(delayed(process_b)(b) for b in range(B)))

	# aggregate results: Z[i] stacks the 2B half-sample indicator rows
	Z = np.zeros((n_alphas,2*B,p))
	for b in range(B):
		Z[:, 2*b:2*(b + 1), :] = results[b,:,:,:]

	# stop at max features, q_max
	stop_index = n_alphas
	average_select = np.empty(n_alphas)
	for i in range(n_alphas):
		z = Z[i,:,:]
		average_select[i] = np.mean(np.sum(z,axis=1))
		if average_select[i] > q_max:
			stop_index = i
			break

	Z = Z[:stop_index,:,:]
	alphas = alphas[:stop_index]
	average_select = average_select[:stop_index]

	if Z_sparse:
		Z_sparse = np.empty((stop_index,), dtype=object)
		for i in range(stop_index):
			Z_sparse[i] = csr_matrix(Z[i,:,:])
		Z = Z_sparse

	# default selection function: h_2 for binary y, h_3 for continuous y
	if selection_function is None:
		if binary_response:
			selection_function = 2
		else:
			selection_function = 3

	stability_paths, scores, integral, alphas, stop_index = ipss_results(Z, alphas, average_select, selection_function, with_stability, delta, cutoff)

	# IPSS threshold tau = (integral of the false-positive bound) / EFP
	threshold = integral / EFP
	selected_features = np.where(scores >= threshold)[0]

	return {'alphas':alphas, 'average_select':average_select, 'scores':scores, 'selected_features':selected_features,
		'stability_paths':stability_paths, 'stop_index':stop_index, 'threshold':threshold}
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
#--------------------------------
|
|
187
|
+
# IPSS scores
|
|
188
|
+
#--------------------------------
|
|
189
|
+
def ipss_results(Z, alphas, average_select, selection_function, with_stability, delta, cutoff):
	"""Compute stability paths, the false-positive-bound integral, and IPSS scores.

	Args:
		Z: Selection-indicator tensor; Z[i] is a (2B, p) 0/1 matrix for the
			i-th alpha (dense array, or object array of sparse matrices).
		alphas: Regularization grid matching Z's first dimension.
		average_select: Average number of features selected at each alpha.
		selection_function: Callable applied to selection probabilities, or a
			number m encoding h_m(x) = (2x - 1)**m for x > 0.5 and 0 otherwise.
		with_stability: If True, weight by the stability measure.
		delta: Interval-scaling parameter passed through to integrate().
		cutoff: Upper bound for the false-positive-bound integral.

	Returns:
		(stability_paths, scores, integral, alphas, stop_index)
	"""
	n_alphas = Z.shape[0]
	n_subsamples, p = Z[0].shape
	B = n_subsamples / 2

	# turn a numeric selection_function into the corresponding h_m
	if isinstance(selection_function, (int, float)):
		m = selection_function
		def selection_function(x, _m=m):
			return (2*x - 1)**_m if x > 0.5 else 0
	else:
		m = 'user_defined'

	# empirical selection probability of each feature at each alpha
	stability_paths = np.empty((n_alphas, p))
	for row in range(n_alphas):
		stability_paths[row] = Z[row].mean(axis=0)

	# optional stability weighting, normalized so its integral is 1
	if with_stability:
		stability_values = np.array([stability(Z[row]) for row in range(n_alphas)])
		normalizer, _ = integrate(stability_values, alphas, delta)
		stability_values /= normalizer
	else:
		stability_values = np.ones(n_alphas)

	# closed-form false-positive bounds exist for m in {1, 2, 3}; for a
	# user-defined function the integral is pinned to the cutoff instead
	if m == 1:
		bound = average_select**2 / p
	elif m == 2:
		bound = average_select**2 / (p * B) + (B - 1) * average_select**4 / (B * p**3)
	elif m == 3:
		bound = (average_select**2 / (p * B**2)
			+ (3 * (B - 1) * average_select**4) / (p**3 * B**2)
			+ ((B - 1) * (B - 2) * average_select**6) / (p**5 * B**2))
	else:
		bound = None

	if bound is None:
		integral, stop_index = cutoff, len(alphas)
	else:
		integral, stop_index = integrate(stability_values * bound, alphas, delta, cutoff=cutoff)

	# IPSS score: integral of the (weighted) transformed stability path
	truncated_alphas = alphas[:stop_index]
	scores = np.zeros(p)
	for feature in range(p):
		weighted = np.array([stability_values[j] * selection_function(stability_paths[j, feature])
			for j in range(stop_index)])
		scores[feature], _ = integrate(weighted, truncated_alphas, delta)

	return stability_paths, scores, integral, alphas, stop_index
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
#--------------------------------
|
|
246
|
+
# Helpers
|
|
247
|
+
#--------------------------------
|
|
248
|
+
def compute_alphas(X, y, n_alphas, q_max, binary_response=False):
	"""Build a log-spaced grid of n_alphas regularization strengths.

	The grid runs from alpha_max (a penalty large enough to select nothing)
	down to the first trial alpha at which the model selects at least q_max
	features.

	Args:
		X: n-by-p data matrix.
		y: Response vector.
		n_alphas: Number of grid points returned.
		q_max: Feature count at which the downward search stops.
		binary_response: If True, use l1 logistic regression; else the lasso.

	Returns:
		Array of shape (n_alphas,), decreasing from alpha_max to alpha_min.
	"""
	n, p = X.shape

	if binary_response:
		y_mean = np.mean(y)
		# NOTE(review): operator precedence makes this y - (y_mean*(1-y_mean));
		# if (y - y_mean) was intended in the gradient expression, parentheses
		# are missing — confirm against the IPSS paper.
		scaled_residuals = y - y_mean * (1 - y_mean)
		alpha_max = 5 / np.max(np.abs(np.dot(X.T, scaled_residuals) / n))
		model = LogisticRegression(penalty='l1', solver='saga', tol=1e-3, warm_start=True, class_weight='balanced')
	else:
		alpha_max = 2 * np.max(np.abs(np.dot(X.T,y))) / n
		model = Lasso(warm_start=True)

	alpha_min = alpha_max * 1e-10
	test_alphas = np.logspace(np.log10(alpha_max), np.log10(alpha_min), 100)

	# walk down a fine trial grid until q_max features are selected
	# (the unused enumerate index was removed)
	for alpha in test_alphas:
		if binary_response:
			model.set_params(C=1/alpha)
		else:
			model.set_params(alpha=alpha)
		with warnings.catch_warnings():
			warnings.simplefilter('ignore')
			model.fit(X,y)
		num_selected = np.sum(model.coef_ != 0)
		if num_selected >= q_max:
			alpha_min = alpha
			break

	alphas = np.logspace(np.log10(alpha_max), np.log10(alpha_min), n_alphas)

	return alphas
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def integrate(values, alphas, delta=1, cutoff=None):
	"""Numerically integrate `values` over the alpha grid.

	Uses a weighted left-endpoint rule: delta = 1 integrates on a log scale,
	any other delta on a power scale with weights alphas[i]**(1-delta). When
	`cutoff` is given, accumulation stops just before the running total would
	exceed it, and the index where that happened is returned.

	Args:
		values: Array of integrand values, one per alpha.
		alphas: Regularization grid; only min/max enter the normalization,
			alphas[i] enters the weights when delta != 1.
		delta: Interval-scaling parameter (1 = log scale).
		cutoff: Optional upper bound on the accumulated integral.

	Returns:
		(output, stop_index): the accumulated integral and the number of grid
		points actually integrated (len(alphas) when no cutoff is hit).
	"""
	n_alphas = len(alphas)
	a = min(alphas)
	b = max(alphas)

	if delta == 1:
		normalization = (1 - (a/b)**(1/n_alphas)) / np.log(b/a)
	else:
		normalization = (1 - delta) * (1 - (a/b)**(1/n_alphas)) / (b**(1-delta) - a**(1-delta))

	output = 0
	stop_index = n_alphas
	# (dead assignment `before = stop_index` removed; it was never read)

	if cutoff is None:
		for i in range(1,n_alphas):
			weight = 1 if delta == 1 else alphas[i]**(1-delta)
			output += normalization * weight * values[i-1]

	else:
		for i in range(1,n_alphas):
			weight = 1 if delta == 1 else alphas[i]**(1-delta)
			updated_output = output + normalization * weight * values[i-1]
			if updated_output > cutoff:
				# keep the pre-cutoff total and report where we stopped
				stop_index = i
				break
			else:
				output = updated_output

	return output, stop_index
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
def stability(z):
	"""Stability of a binary selection matrix across subsamples.

	Computes 1 - mean(prob*(1-prob)) / (k_hat*(1-k_hat)), where prob is each
	feature's selection frequency over the subsamples and k_hat is the mean
	frequency. The value is 1 when every frequency is exactly 0 or 1 (and
	k_hat is interior) and decreases as selections become inconsistent.
	Degenerate cases with k_hat at (or numerically near) 0 or 1 return 0.

	Args:
		z: Array-like of shape (n_subsamples, n_features) of 0/1 indicators
			(dense array or sparse matrix; the squeeze handles matrix output).

	Returns:
		Scalar stability value.
	"""
	# (unused unpacking `B, d = np.shape(z)` removed)
	prob = np.mean(z,axis=0)
	prob = np.squeeze(np.asarray(prob))
	k_hat = np.mean(prob)
	numerator = np.mean(prob * (1 - prob))
	denominator = k_hat * (1 - k_hat)
	if denominator > 1e-8:
		frac = numerator/denominator
	else:
		# guard against division by ~0 when (almost) all or no features
		# are selected
		frac = 1

	return 1 - frac
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: ipss
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: Python implementation of Integrated Path Stability Selection (IPSS)
|
|
5
|
+
Author: Omar Melikechi
|
|
6
|
+
Author-email: omar.melikechi@gmail.com
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Requires-Python: >=3.6
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
Requires-Dist: numpy
|
|
13
|
+
Requires-Dist: scipy
|
|
14
|
+
Requires-Dist: scikit-learn
|
|
15
|
+
Requires-Dist: matplotlib
|
|
16
|
+
Requires-Dist: joblib
|
|
17
|
+
Requires-Dist: seaborn
|
|
18
|
+
|
|
19
|
+
# Integrated path stability selection (IPSS)
|
|
20
|
+
|
|
21
|
+
Integrated path stability selection (IPSS) is a general method for improving feature selection algorithms that yields
|
|
22
|
+
more robust, accurate, and interpretable models. IPSS does this by allowing users to control the expected number of
|
|
23
|
+
falsely selected features, E(FP), while producing far more true positives than other versions of stability selection.
|
|
24
|
+
This Python implementation of IPSS applied to L1-regularized linear and logistic regression is intended for researchers
|
|
25
|
+
and practitioners alike, requiring only the X and y data and specification of E(FP).
|
|
26
|
+
|
|
27
|
+
## Associated paper
|
|
28
|
+
|
|
29
|
+
arXiv:
|
|
30
|
+
|
|
31
|
+
## Installation
|
|
32
|
+
### Dependencies
|
|
33
|
+
```
|
|
34
|
+
pip install joblib numpy scikit-learn scipy
|
|
35
|
+
```
|
|
36
|
+
### Installing IPSS
|
|
37
|
+
To install from PyPI:
|
|
38
|
+
```
|
|
39
|
+
pip install ipss
|
|
40
|
+
```
|
|
41
|
+
To clone from GitHub:
|
|
42
|
+
```
|
|
43
|
+
git clone git@github.com:omelikechi/ipss.git
|
|
44
|
+
```
|
|
45
|
+
Or clone from GitHub using HTTPS:
|
|
46
|
+
```
|
|
47
|
+
git clone https://github.com/omelikechi/ipss.git
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Usage
|
|
51
|
+
Given an n-by-p matrix of features, X (n = number of samples, p = number of features), an n-by-1 vector of responses, y, and a target number of expected false positives, EFP:
|
|
52
|
+
```python
|
|
53
|
+
from ipss import ipss
|
|
54
|
+
|
|
55
|
+
# Load data X and y
|
|
56
|
+
# Specify expected number of false positives (EFP)
|
|
57
|
+
# Run IPSS:
|
|
58
|
+
result = ipss(X, y, EFP)
|
|
59
|
+
|
|
60
|
+
# Result analysis
|
|
61
|
+
print(result['selected_features']) # features selected by IPSS
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### Results
|
|
65
|
+
`result` is a dictionary containing:
|
|
66
|
+
- `alphas`: Grid of regularization parameters (array of shape `(n_alphas,)`).
|
|
67
|
+
- `average_select`: Average number of features selected at each regularization (array of shape `(n_alphas,)`).
|
|
68
|
+
- `scores`: IPSS score for each feature (array of shape `(p,)`).
|
|
69
|
+
- `selected_features`: Indices of features selected by IPSS (list of ints).
|
|
70
|
+
- `stability_paths`: Estimated selection probabilities at each regularization (array of shape `(n_alphas, p)`)
|
|
71
|
+
- `stop_index`: Index of regularization value at which IPSS threshold is passed (int).
|
|
72
|
+
- `threshold`: The calculated threshold value tau = Integral value / EFP (scalar).
|
|
73
|
+
|
|
74
|
+
### Full list of arguments
|
|
75
|
+
`ipss` takes the following arguments (only `X` and `y` are required, and typically only `EFP` is specified):
|
|
76
|
+
- `X`: Features (array of shape `(n,p)`).
|
|
77
|
+
- `y`: Responses (array of shape `(n,)` or `(n, 1)`). IPSS automatically detects if `y` is continuous or binary.
|
|
78
|
+
- `EFP`: Target expected number of false positives (positive scalar; default is `1`).
|
|
79
|
+
- `cutoff`: Together with `EFP`, determines IPSS threshold (positive scalar; default is `0.05`).
|
|
80
|
+
- `B`: Number of subsampling steps (int; default is `50`).
|
|
81
|
+
- `n_alphas`: Number of values in regularization grid (int; default is `25`).
|
|
82
|
+
- `q_max`: Max number of features selected (int; default is `None`, in which case `q_max = p/2`).
|
|
83
|
+
- `Z_sparse`: If `True`, tensor of subsamples, `Z`, is sparse (default is `False`).
|
|
84
|
+
- `lars`: Implements least angle regression (LARS) for linear regression if `True`, lasso otherwise (default is `False`).
|
|
85
|
+
- `selection_function`: Function to apply to the stability paths. If a positive int, `m`, function is `h_m(x) = (2x - 1)**m` if `x >= 0.5` and `0` if `x < 0.5` (int, callable, or `None`; default is `None`, in which case function is `h_2` if y is binary, or `h_3` if continuous).
|
|
86
|
+
- `with_stability`: If `True`, uses a stability measure in selection process (default is `False`).
|
|
87
|
+
- `delta`: Determines scaling of regularization interval (scalar; default is `1`).
|
|
88
|
+
- `standardize_X`: If `True`, standardizes all features (default is `True`).
|
|
89
|
+
- `center_y`: If `True`, centers `y` when it is continuous (default is `True`).
|
|
90
|
+
|
|
91
|
+
## Examples
|
|
92
|
+
Examples are available in the `examples` folder. These include
|
|
93
|
+
- A simple example in which features are simulated independently from a standard normal distribution.
|
|
94
|
+
- An example using prostate cancer data, as detailed in the associated paper.
|
|
95
|
+
- An example using colon cancer data, as detailed in the associated paper.
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
ipss/__init__.py,sha256=KCWzcd-7A4MYOz412le3nAdSFu0SQnWCJenuTZh5SCs,22
|
|
2
|
+
ipss/ipss.py,sha256=iiTVuzPJIs_CwQdiuwMupu3zPFKRSKc_qXd_aivNHwg,9341
|
|
3
|
+
src/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
+
src/ipss.py,sha256=iiTVuzPJIs_CwQdiuwMupu3zPFKRSKc_qXd_aivNHwg,9341
|
|
5
|
+
ipss-0.3.0.dist-info/METADATA,sha256=7SZSiuHA2T2baPIBp25s5pbaOl4hOC6YoGkedARi_HY,4439
|
|
6
|
+
ipss-0.3.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
7
|
+
ipss-0.3.0.dist-info/top_level.txt,sha256=5MkzHNB1kaiW99M-cpQ4F7iPYaJmgF7_IyvcwOic9WI,5
|
|
8
|
+
ipss-0.3.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
ipss
|
src/__init__.py
ADDED
|
File without changes
|
src/ipss.py
ADDED
|
@@ -0,0 +1,343 @@
|
|
|
1
|
+
# Integrated path stability selection (IPSS)
|
|
2
|
+
|
|
3
|
+
import warnings
|
|
4
|
+
|
|
5
|
+
from joblib import Parallel, delayed
|
|
6
|
+
import numpy as np
|
|
7
|
+
from scipy.sparse import csr_matrix
|
|
8
|
+
from sklearn.linear_model import lars_path, Lasso, lasso_path, LogisticRegression
|
|
9
|
+
from sklearn.preprocessing import StandardScaler
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
#--------------------------------
|
|
13
|
+
# IPSS regression
|
|
14
|
+
#--------------------------------
|
|
15
|
+
'''
|
|
16
|
+
Inputs:
|
|
17
|
+
X: n-by-p data matrix (n = number of samples, p = number of features)
|
|
18
|
+
y: n-by-1 response vector (binary or continuous)
|
|
19
|
+
EFP: Target value for expected number of false positives
|
|
20
|
+
cutoff: Positive scalar C that, together with EFP, determines IPSS threshold
|
|
21
|
+
B: Number of subsampling steps
|
|
22
|
+
n_alphas: Number of values in grid of regularization parameters
|
|
23
|
+
q_max: Maximum number of features selected
|
|
24
|
+
Z_sparse: n_alphas-by-B-by-p tensor of subsamples, Z, is output as sparse if 'True'
|
|
25
|
+
lars: Uses least angle regression (LARS) for linear regression if 'True' or lasso if 'False'
|
|
26
|
+
selection_function: Function to apply to the estimated selection probabilities. If equal to
|
|
27
|
+
an integer, m, then function is h_m(x) = (2x - 1)**m if x >= 0.5 and 0 if x < 0.5
|
|
28
|
+
with_stability: Uses stability measure if 'True'
|
|
29
|
+
delta: Scalar value that determines scaling of regularization interval. delta = 1 corresponds
|
|
30
|
+
to log scale, delta = 0 corresponds to linear scale
|
|
31
|
+
'''
|
|
32
|
+
def ipss(X, y,
		EFP=1,
		cutoff=0.05,
		B=50,
		n_alphas=25,
		q_max=None,
		Z_sparse=False,
		lars=False,
		selection_function=None,
		with_stability=False,
		delta=1,
		standardize_X=True,
		center_y=True
		):
	"""Integrated path stability selection (IPSS) for l1-regularized regression.

	Automatically detects whether y is binary (logistic regression) or
	continuous (linear regression), estimates selection probabilities over a
	grid of regularization strengths via repeated half-sample subsampling, and
	selects the features whose IPSS scores exceed the threshold implied by
	EFP and cutoff.

	Args:
		X: n-by-p data matrix (n samples, p features).
		y: Response array of shape (n,) or (n, 1), binary or continuous.
		EFP: Target expected number of false positives.
		cutoff: Positive scalar that, together with EFP, sets the threshold.
		B: Number of subsampling steps (each step yields two half-samples).
		n_alphas: Number of values in the regularization grid.
		q_max: Maximum number of features selected; defaults to p / 2.
		Z_sparse: If True, convert each alpha-slice of the subsample tensor Z
			to a scipy CSR matrix before scoring.
		lars: If True, use least angle regression for the linear case;
			otherwise use the lasso path.
		selection_function: Function applied to selection probabilities, or an
			int/float m meaning h_m(x) = (2x - 1)**m for x > 0.5 and 0 below.
			If None, defaults to 2 (binary y) or 3 (continuous y).
		with_stability: If True, weight scores by a stability measure.
		delta: Scaling of the regularization interval (1 = log, 0 = linear).
		standardize_X: If True, standardize the columns of X.
		center_y: If True, center a continuous y before fitting.

	Returns:
		dict with keys 'alphas', 'average_select', 'scores',
		'selected_features', 'stability_paths', 'stop_index', 'threshold'.

	Raises:
		ValueError: If y does not have shape (n,) or (n, 1).
	"""
	if len(y.shape) != 1:
		if y.shape[1] == 1:
			y = y.ravel()
		else:
			raise ValueError("Error: Response y must be a numpy array with shape (n,) or (n,1)")

	if standardize_X:
		X = StandardScaler().fit_transform(X)

	n, p = X.shape
	n_split = int(n/2)

	# check if response is binary
	binary_response = (len(np.unique(y)) == 2)

	# maximum number of features
	if q_max is None:
		q_max = p / 2

	# compute alphas
	alphas = compute_alphas(X, y, n_alphas, q_max, binary_response)

	# linear regression
	if not binary_response:

		if center_y:
			# BUG FIX: out-of-place subtraction. `y -= np.mean(y)` mutated the
			# caller's array in place and raises a casting error when y has an
			# integer dtype.
			y = y - np.mean(y)

		if lars:
			def process_b(b):
				# one subsampling step: split into two random halves and record
				# which features LARS selects nearest to each grid alpha
				indices = np.arange(n)
				np.random.shuffle(indices)

				z = np.empty((n_alphas, 2, p))

				for half in range(2):
					idx = indices[:n_split] if half == 0 else indices[n_split:]
					X_half, y_half = X[idx,:], y[idx]

					with warnings.catch_warnings():
						warnings.simplefilter('ignore')
						lars_alphas, _, coefs = lars_path(X_half, y_half, method='lasso')

					for i, alpha in enumerate(alphas):
						# nearest point on the LARS path to the requested alpha
						idx_alpha = np.abs(lars_alphas - alpha).argmin()
						coef = coefs[:, idx_alpha]

						z[i, half, :] = (coef != 0).astype(int)

				return z

		else:
			def process_b(b):
				# one subsampling step using the lasso path on each half
				indices = np.arange(n)
				np.random.shuffle(indices)

				z = np.empty((n_alphas, 2, p))

				for half in range(2):
					idx = indices[:n_split] if half == 0 else indices[n_split:]
					X_half, y_half = X[idx,:], y[idx]

					with warnings.catch_warnings():
						warnings.simplefilter('ignore')
						_, coef, _ = lasso_path(X_half, y_half, alphas=alphas)
					z[:, half, :] = (coef.T != 0).astype(int)

				return z

	# logistic regression
	else:
		def process_b(b):
			# one subsampling step fitting l1 logistic regression at each alpha
			indices = np.arange(n)
			np.random.shuffle(indices)

			z = np.empty((n_alphas, 2, p))

			# warm_start lets successive fits along the alpha path reuse the
			# previous solution
			model = LogisticRegression(penalty='l1', solver='saga', tol=1e-3, warm_start=True, class_weight='balanced')

			for half in range(2):
				idx = indices[:n_split] if half == 0 else indices[n_split:]
				X_half, y_half = X[idx,:], y[idx]

				for i, alpha in enumerate(alphas):

					model.set_params(C=1/alpha)
					# (the unused `record=True) as w` capture was removed)
					with warnings.catch_warnings():
						warnings.simplefilter('ignore')
						fit = model.fit(X_half, y_half.ravel())
					z[i, half, :] = (fit.coef_ != 0).astype(int)

			return z

	# parallelize subsampling across multiple CPU cores
	results = np.array(Parallel(n_jobs=-1)(delayed(process_b)(b) for b in range(B)))

	# aggregate results: Z[i] stacks the 2B half-sample indicator rows
	Z = np.zeros((n_alphas,2*B,p))
	for b in range(B):
		Z[:, 2*b:2*(b + 1), :] = results[b,:,:,:]

	# stop at max features, q_max
	stop_index = n_alphas
	average_select = np.empty(n_alphas)
	for i in range(n_alphas):
		z = Z[i,:,:]
		average_select[i] = np.mean(np.sum(z,axis=1))
		if average_select[i] > q_max:
			stop_index = i
			break

	Z = Z[:stop_index,:,:]
	alphas = alphas[:stop_index]
	average_select = average_select[:stop_index]

	if Z_sparse:
		Z_sparse = np.empty((stop_index,), dtype=object)
		for i in range(stop_index):
			Z_sparse[i] = csr_matrix(Z[i,:,:])
		Z = Z_sparse

	# default selection function: h_2 for binary y, h_3 for continuous y
	if selection_function is None:
		if binary_response:
			selection_function = 2
		else:
			selection_function = 3

	stability_paths, scores, integral, alphas, stop_index = ipss_results(Z, alphas, average_select, selection_function, with_stability, delta, cutoff)

	# IPSS threshold tau = (integral of the false-positive bound) / EFP
	threshold = integral / EFP
	selected_features = np.where(scores >= threshold)[0]

	return {'alphas':alphas, 'average_select':average_select, 'scores':scores, 'selected_features':selected_features,
		'stability_paths':stability_paths, 'stop_index':stop_index, 'threshold':threshold}
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
#--------------------------------
|
|
187
|
+
# IPSS scores
|
|
188
|
+
#--------------------------------
|
|
189
|
+
def ipss_results(Z, alphas, average_select, selection_function, with_stability, delta, cutoff):
	"""Compute stability paths, the false-positive-bound integral, and IPSS scores.

	Args:
		Z: Selection-indicator tensor; Z[i] is a (2B, p) 0/1 matrix for the
			i-th alpha (dense array, or object array of sparse matrices).
		alphas: Regularization grid matching Z's first dimension.
		average_select: Average number of features selected at each alpha.
		selection_function: Callable applied to selection probabilities, or a
			number m encoding h_m(x) = (2x - 1)**m for x > 0.5 and 0 otherwise.
		with_stability: If True, weight by the stability measure.
		delta: Interval-scaling parameter passed through to integrate().
		cutoff: Upper bound for the false-positive-bound integral.

	Returns:
		(stability_paths, scores, integral, alphas, stop_index)
	"""
	n_alphas = Z.shape[0]
	n_subsamples, p = Z[0].shape
	B = n_subsamples / 2

	# turn a numeric selection_function into the corresponding h_m
	if isinstance(selection_function, (int, float)):
		m = selection_function
		def selection_function(x, _m=m):
			return (2*x - 1)**_m if x > 0.5 else 0
	else:
		m = 'user_defined'

	# empirical selection probability of each feature at each alpha
	stability_paths = np.empty((n_alphas, p))
	for row in range(n_alphas):
		stability_paths[row] = Z[row].mean(axis=0)

	# optional stability weighting, normalized so its integral is 1
	if with_stability:
		stability_values = np.array([stability(Z[row]) for row in range(n_alphas)])
		normalizer, _ = integrate(stability_values, alphas, delta)
		stability_values /= normalizer
	else:
		stability_values = np.ones(n_alphas)

	# closed-form false-positive bounds exist for m in {1, 2, 3}; for a
	# user-defined function the integral is pinned to the cutoff instead
	if m == 1:
		bound = average_select**2 / p
	elif m == 2:
		bound = average_select**2 / (p * B) + (B - 1) * average_select**4 / (B * p**3)
	elif m == 3:
		bound = (average_select**2 / (p * B**2)
			+ (3 * (B - 1) * average_select**4) / (p**3 * B**2)
			+ ((B - 1) * (B - 2) * average_select**6) / (p**5 * B**2))
	else:
		bound = None

	if bound is None:
		integral, stop_index = cutoff, len(alphas)
	else:
		integral, stop_index = integrate(stability_values * bound, alphas, delta, cutoff=cutoff)

	# IPSS score: integral of the (weighted) transformed stability path
	truncated_alphas = alphas[:stop_index]
	scores = np.zeros(p)
	for feature in range(p):
		weighted = np.array([stability_values[j] * selection_function(stability_paths[j, feature])
			for j in range(stop_index)])
		scores[feature], _ = integrate(weighted, truncated_alphas, delta)

	return stability_paths, scores, integral, alphas, stop_index
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
#--------------------------------
|
|
246
|
+
# Helpers
|
|
247
|
+
#--------------------------------
|
|
248
|
+
def compute_alphas(X, y, n_alphas, q_max, binary_response=False):
	"""Build a log-spaced grid of n_alphas regularization strengths.

	The grid runs from alpha_max (a penalty large enough to select nothing)
	down to the first trial alpha at which the model selects at least q_max
	features.

	Args:
		X: n-by-p data matrix.
		y: Response vector.
		n_alphas: Number of grid points returned.
		q_max: Feature count at which the downward search stops.
		binary_response: If True, use l1 logistic regression; else the lasso.

	Returns:
		Array of shape (n_alphas,), decreasing from alpha_max to alpha_min.
	"""
	n, p = X.shape

	if binary_response:
		y_mean = np.mean(y)
		# NOTE(review): operator precedence makes this y - (y_mean*(1-y_mean));
		# if (y - y_mean) was intended in the gradient expression, parentheses
		# are missing — confirm against the IPSS paper.
		scaled_residuals = y - y_mean * (1 - y_mean)
		alpha_max = 5 / np.max(np.abs(np.dot(X.T, scaled_residuals) / n))
		model = LogisticRegression(penalty='l1', solver='saga', tol=1e-3, warm_start=True, class_weight='balanced')
	else:
		alpha_max = 2 * np.max(np.abs(np.dot(X.T,y))) / n
		model = Lasso(warm_start=True)

	alpha_min = alpha_max * 1e-10
	test_alphas = np.logspace(np.log10(alpha_max), np.log10(alpha_min), 100)

	# walk down a fine trial grid until q_max features are selected
	# (the unused enumerate index was removed)
	for alpha in test_alphas:
		if binary_response:
			model.set_params(C=1/alpha)
		else:
			model.set_params(alpha=alpha)
		with warnings.catch_warnings():
			warnings.simplefilter('ignore')
			model.fit(X,y)
		num_selected = np.sum(model.coef_ != 0)
		if num_selected >= q_max:
			alpha_min = alpha
			break

	alphas = np.logspace(np.log10(alpha_max), np.log10(alpha_min), n_alphas)

	return alphas
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def integrate(values, alphas, delta=1, cutoff=None):
	"""Numerically integrate `values` over the alpha grid.

	Uses a weighted left-endpoint rule: delta = 1 integrates on a log scale,
	any other delta on a power scale with weights alphas[i]**(1-delta). When
	`cutoff` is given, accumulation stops just before the running total would
	exceed it, and the index where that happened is returned.

	Args:
		values: Array of integrand values, one per alpha.
		alphas: Regularization grid; only min/max enter the normalization,
			alphas[i] enters the weights when delta != 1.
		delta: Interval-scaling parameter (1 = log scale).
		cutoff: Optional upper bound on the accumulated integral.

	Returns:
		(output, stop_index): the accumulated integral and the number of grid
		points actually integrated (len(alphas) when no cutoff is hit).
	"""
	n_alphas = len(alphas)
	a = min(alphas)
	b = max(alphas)

	if delta == 1:
		normalization = (1 - (a/b)**(1/n_alphas)) / np.log(b/a)
	else:
		normalization = (1 - delta) * (1 - (a/b)**(1/n_alphas)) / (b**(1-delta) - a**(1-delta))

	output = 0
	stop_index = n_alphas
	# (dead assignment `before = stop_index` removed; it was never read)

	if cutoff is None:
		for i in range(1,n_alphas):
			weight = 1 if delta == 1 else alphas[i]**(1-delta)
			output += normalization * weight * values[i-1]

	else:
		for i in range(1,n_alphas):
			weight = 1 if delta == 1 else alphas[i]**(1-delta)
			updated_output = output + normalization * weight * values[i-1]
			if updated_output > cutoff:
				# keep the pre-cutoff total and report where we stopped
				stop_index = i
				break
			else:
				output = updated_output

	return output, stop_index
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
def stability(z):
	"""Stability of a binary selection matrix across subsamples.

	Computes 1 - mean(prob*(1-prob)) / (k_hat*(1-k_hat)), where prob is each
	feature's selection frequency over the subsamples and k_hat is the mean
	frequency. The value is 1 when every frequency is exactly 0 or 1 (and
	k_hat is interior) and decreases as selections become inconsistent.
	Degenerate cases with k_hat at (or numerically near) 0 or 1 return 0.

	Args:
		z: Array-like of shape (n_subsamples, n_features) of 0/1 indicators
			(dense array or sparse matrix; the squeeze handles matrix output).

	Returns:
		Scalar stability value.
	"""
	# (unused unpacking `B, d = np.shape(z)` removed)
	prob = np.mean(z,axis=0)
	prob = np.squeeze(np.asarray(prob))
	k_hat = np.mean(prob)
	numerator = np.mean(prob * (1 - prob))
	denominator = k_hat * (1 - k_hat)
	if denominator > 1e-8:
		frac = numerator/denominator
	else:
		# guard against division by ~0 when (almost) all or no features
		# are selected
		frac = 1

	return 1 - frac
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
|