ipss 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ipss/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .ipss import ipss
ipss/ipss.py ADDED
@@ -0,0 +1,343 @@
1
+ # Integrated path stability selection (IPSS)
2
+
3
+ import warnings
4
+
5
+ from joblib import Parallel, delayed
6
+ import numpy as np
7
+ from scipy.sparse import csr_matrix
8
+ from sklearn.linear_model import lars_path, Lasso, lasso_path, LogisticRegression
9
+ from sklearn.preprocessing import StandardScaler
10
+
11
+
12
+ #--------------------------------
13
+ # IPSS regression
14
+ #--------------------------------
15
+ '''
16
+ Inputs:
17
+ X: n-by-p data matrix (n = number of samples, p = number of features)
18
+ y: n-by-1 response vector (binary or continuous)
19
+ EFP: Target value for expected number of false positives
20
+ cutoff: Positive scalar C that, together with EFP, determines IPSS threshold
21
+ B: Number of subsampling steps
22
+ n_alphas: Number of values in grid of regularization parameters
23
+ q_max: Maximum number of features selected
24
+ Z_sparse: n_alphas-by-B-by-p tensor of subsamples, Z, is output as sparse if 'True'
25
+ lars: Uses least angle regression (LARS) for linear regression if 'True' or lasso if 'False'
26
+ selection_function: Function to apply to the estimated selection probabilities. If equal to
27
+ an integer, m, then function is h_m(x) = (2x - 1)**m if x >= 0.5 and 0 if x < 0.5
28
+ with_stability: Uses stability measure if 'True'
29
+ delta: Scalar value that determines scaling of regularization interval. delta = 1 corresponds
30
+ to log scale, delta = 0 corresponds to linear scale
31
+ '''
32
+ def ipss(X, y,
33
+ EFP=1,
34
+ cutoff=0.05,
35
+ B=50,
36
+ n_alphas=25,
37
+ q_max=None,
38
+ Z_sparse=False,
39
+ lars=False,
40
+ selection_function=None,
41
+ with_stability=False,
42
+ delta=1,
43
+ standardize_X=True,
44
+ center_y=True
45
+ ):
46
+
47
+ if len(y.shape) != 1:
48
+ if y.shape[1] == 1:
49
+ y = y.ravel()
50
+ else:
51
+ raise ValueError("Error: Response y must be a numpy array with shape (n,) or (n,1)")
52
+
53
+ if standardize_X:
54
+ X = StandardScaler().fit_transform(X)
55
+
56
+ n, p = X.shape
57
+ n_split = int(n/2)
58
+
59
+ # check if response is binary
60
+ binary_response = (len(np.unique(y)) == 2)
61
+
62
+ # maximum number of features
63
+ if q_max is None:
64
+ q_max = p / 2
65
+
66
+ # compute alphas
67
+ alphas = compute_alphas(X, y, n_alphas, q_max, binary_response)
68
+
69
+ # linear regression
70
+ if not binary_response:
71
+
72
+ if center_y:
73
+ y -= np.mean(y)
74
+
75
+ if lars:
76
+ def process_b(b):
77
+ indices = np.arange(n)
78
+ np.random.shuffle(indices)
79
+
80
+ z = np.empty((n_alphas, 2, p))
81
+
82
+ for half in range(2):
83
+ idx = indices[:n_split] if half == 0 else indices[n_split:]
84
+ X_half, y_half = X[idx,:], y[idx]
85
+
86
+ with warnings.catch_warnings():
87
+ warnings.simplefilter('ignore')
88
+ lars_alphas, _, coefs = lars_path(X_half, y_half, method='lasso')
89
+
90
+ for i, alpha in enumerate(alphas):
91
+ idx_alpha = np.abs(lars_alphas - alpha).argmin()
92
+ coef = coefs[:, idx_alpha]
93
+
94
+ z[i, half, :] = (coef != 0).astype(int)
95
+
96
+ return z
97
+
98
+ else:
99
+ def process_b(b):
100
+ indices = np.arange(n)
101
+ np.random.shuffle(indices)
102
+
103
+ z = np.empty((n_alphas, 2, p))
104
+
105
+ for half in range(2):
106
+ idx = indices[:n_split] if half == 0 else indices[n_split:]
107
+ X_half, y_half = X[idx,:], y[idx]
108
+
109
+ with warnings.catch_warnings():
110
+ warnings.simplefilter('ignore')
111
+ _, coef, _ = lasso_path(X_half, y_half, alphas=alphas)
112
+ z[:, half, :] = (coef.T != 0).astype(int)
113
+
114
+ return z
115
+
116
+ # logistic regression
117
+ else:
118
+ def process_b(b):
119
+ indices = np.arange(n)
120
+ np.random.shuffle(indices)
121
+
122
+ z = np.empty((n_alphas, 2, p))
123
+
124
+ model = LogisticRegression(penalty='l1', solver='saga', tol=1e-3, warm_start=True, class_weight='balanced')
125
+ # model = LogisticRegression(penalty='l1', max_iter=int(1e6), solver='liblinear', class_weight='balanced')
126
+
127
+ for half in range(2):
128
+ idx = indices[:n_split] if half == 0 else indices[n_split:]
129
+ X_half, y_half = X[idx,:], y[idx]
130
+
131
+ for i, alpha in enumerate(alphas):
132
+
133
+ model.set_params(C=1/alpha)
134
+ with warnings.catch_warnings(record=True) as w:
135
+ warnings.simplefilter('ignore')
136
+ fit = model.fit(X_half, y_half.ravel())
137
+ z[i, half, :] = (fit.coef_ != 0).astype(int)
138
+
139
+ return z
140
+
141
+ # parallelize subsampling across multiple CPU cores
142
+ results = np.array(Parallel(n_jobs=-1)(delayed(process_b)(b) for b in range(B)))
143
+
144
+ # aggregate results
145
+ Z = np.zeros((n_alphas,2*B,p))
146
+ for b in range(B):
147
+ Z[:, 2*b:2*(b + 1), :] = results[b,:,:,:]
148
+
149
+ # stop at max features, q_max
150
+ stop_index = n_alphas
151
+ average_select = np.empty(n_alphas)
152
+ for i in range(n_alphas):
153
+ z = Z[i,:,:]
154
+ average_select[i] = np.mean(np.sum(z,axis=1))
155
+ if average_select[i] > q_max:
156
+ stop_index = i
157
+ break
158
+
159
+ Z = Z[:stop_index,:,:]
160
+ alphas = alphas[:stop_index]
161
+ average_select = average_select[:stop_index]
162
+
163
+ if Z_sparse:
164
+ Z_sparse = np.empty((stop_index,), dtype=object)
165
+ for i in range(stop_index):
166
+ Z_sparse[i] = csr_matrix(Z[i,:,:])
167
+ Z = Z_sparse
168
+
169
+ # ipss
170
+ if selection_function is None:
171
+ if binary_response:
172
+ selection_function = 2
173
+ else:
174
+ selection_function = 3
175
+
176
+ stability_paths, scores, integral, alphas, stop_index = ipss_results(Z, alphas, average_select, selection_function, with_stability, delta, cutoff)
177
+
178
+ threshold = integral / EFP
179
+ selected_features = np.where(scores >= threshold)[0]
180
+
181
+
182
+ return {'alphas':alphas, 'average_select':average_select, 'scores':scores, 'selected_features':selected_features,
183
+ 'stability_paths':stability_paths, 'stop_index':stop_index, 'threshold':threshold}
184
+
185
+
186
+ #--------------------------------
187
+ # IPSS scores
188
+ #--------------------------------
189
+ def ipss_results(Z, alphas, average_select, selection_function, with_stability, delta, cutoff):
190
+
191
+ n_alphas = Z.shape[0]
192
+ B, p = Z[0].shape
193
+ B /= 2
194
+
195
+ # function
196
+ if isinstance(selection_function, (int, float)):
197
+ m = selection_function
198
+ def selection_function(x):
199
+ return 0 if x <= 0.5 else (2*x - 1)**m
200
+ else:
201
+ m = 'user_defined'
202
+
203
+ # stability paths
204
+ stability_paths = np.empty((n_alphas,p))
205
+ for i in range(n_alphas):
206
+ stability_paths[i] = Z[i].mean(axis=0)
207
+
208
+ # stability measure
209
+ if with_stability:
210
+ stability_values = np.array([stability(Z[i]) for i in range(n_alphas)])
211
+ normalizer, _ = integrate(stability_values, alphas, delta)
212
+ stability_values /= normalizer
213
+ else:
214
+ stability_values = np.ones(n_alphas)
215
+
216
+ # evaluate ipss bounds for specific functions
217
+ if m == 1:
218
+ integral, stop_index = integrate(stability_values * average_select**2 / p, alphas, delta, cutoff=cutoff)
219
+ elif m == 2:
220
+ term1 = average_select**2 / (p * B)
221
+ term2 = (B-1) * average_select**4 / (B * p**3)
222
+ integral, stop_index = integrate(stability_values * (term1 + term2), alphas, delta, cutoff=cutoff)
223
+ elif m == 3:
224
+ term1 = average_select**2 / (p * B**2)
225
+ term2 = (3 * (B-1) * average_select**4) / (p**3 * B**2)
226
+ term3 = ((B-1) * (B-2) * average_select**6) / (p**5 * B**2)
227
+ integral, stop_index = integrate(stability_values * (term1 + term2 + term3), alphas, delta, cutoff=cutoff)
228
+ else:
229
+ integral = cutoff
230
+ stop_index = len(alphas)
231
+
232
+ # compute ipss scores
233
+ alphas_stop = alphas[:stop_index]
234
+ scores = np.zeros(p)
235
+ for i in range(p):
236
+ values = np.empty(stop_index)
237
+ for j in range(stop_index):
238
+ values[j] = stability_values[j] * selection_function(stability_paths[j,i])
239
+ scores[i], _ = integrate(values, alphas_stop, delta)
240
+
241
+
242
+ return stability_paths, scores, integral, alphas, stop_index
243
+
244
+
245
+ #--------------------------------
246
+ # Helpers
247
+ #--------------------------------
248
+ def compute_alphas(X, y, n_alphas, q_max, binary_response=False):
249
+ n, p = X.shape
250
+
251
+ if binary_response:
252
+ y_mean = np.mean(y)
253
+ scaled_residuals = y - y_mean * (1 - y_mean)
254
+ alpha_max = 5 / np.max(np.abs(np.dot(X.T, scaled_residuals) / n))
255
+ model = LogisticRegression(penalty='l1', solver='saga', tol=1e-3, warm_start=True, class_weight='balanced')
256
+ # model = LogisticRegression(penalty='l1', max_iter=int(1e6), solver='liblinear', class_weight='balanced')
257
+ else:
258
+ alpha_max = 2 * np.max(np.abs(np.dot(X.T,y))) / n
259
+ model = Lasso(warm_start=True)
260
+
261
+ alpha_min = alpha_max * 1e-10
262
+ test_alphas = np.logspace(np.log10(alpha_max), np.log10(alpha_min), 100)
263
+
264
+ for i, alpha in enumerate(test_alphas):
265
+ if binary_response:
266
+ model.set_params(C=1/alpha)
267
+ else:
268
+ model.set_params(alpha=alpha)
269
+ with warnings.catch_warnings():
270
+ warnings.simplefilter('ignore')
271
+ model.fit(X,y)
272
+ num_selected = np.sum(model.coef_ != 0)
273
+ if num_selected >= q_max:
274
+ alpha_min = alpha
275
+ break
276
+
277
+ alphas = np.logspace(np.log10(alpha_max), np.log10(alpha_min), n_alphas)
278
+
279
+ return alphas
280
+
281
+
282
+ def integrate(values, alphas, delta=1, cutoff=None):
283
+
284
+ n_alphas = len(alphas)
285
+ a = min(alphas)
286
+ b = max(alphas)
287
+
288
+ if delta == 1:
289
+ normalization = (1 - (a/b)**(1/n_alphas)) / np.log(b/a)
290
+ else:
291
+ normalization = (1 - delta) * (1 - (a/b)**(1/n_alphas)) / (b**(1-delta) - a**(1-delta))
292
+
293
+ output = 0
294
+ stop_index = n_alphas
295
+ before = stop_index
296
+
297
+ if cutoff is None:
298
+ for i in range(1,n_alphas):
299
+ weight = 1 if delta == 1 else alphas[i]**(1-delta)
300
+ output += normalization * weight * values[i-1]
301
+
302
+ else:
303
+ for i in range(1,n_alphas):
304
+ weight = 1 if delta == 1 else alphas[i]**(1-delta)
305
+ updated_output = output + normalization * weight * values[i-1]
306
+ if updated_output > cutoff:
307
+ stop_index = i
308
+ break
309
+ else:
310
+ output = updated_output
311
+
312
+ return output, stop_index
313
+
314
+
315
+ def stability(z):
316
+ B, d = np.shape(z)
317
+ prob = np.mean(z,axis=0)
318
+ prob = np.squeeze(np.asarray(prob))
319
+ k_hat = np.mean(prob)
320
+ numerator = np.mean(prob * (1 - prob))
321
+ denominator = k_hat * (1 - k_hat)
322
+ if denominator > 1e-8:
323
+ frac = numerator/denominator
324
+ else:
325
+ frac = 1
326
+
327
+ return 1 - frac
328
+
329
+
330
+
331
+
332
+
333
+
334
+
335
+
336
+
337
+
338
+
339
+
340
+
341
+
342
+
343
+
@@ -0,0 +1,103 @@
1
+ Metadata-Version: 2.1
2
+ Name: ipss
3
+ Version: 0.3.0
4
+ Summary: Python implementation of Integrated Path Stability Selection (IPSS)
5
+ Author: Omar Melikechi
6
+ Author-email: omar.melikechi@gmail.com
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: License :: OSI Approved :: MIT License
9
+ Classifier: Operating System :: OS Independent
10
+ Requires-Python: >=3.6
11
+ Description-Content-Type: text/markdown
12
+ Requires-Dist: numpy
13
+ Requires-Dist: scipy
14
+ Requires-Dist: scikit-learn
15
+ Requires-Dist: matplotlib
16
+ Requires-Dist: joblib
17
+ Requires-Dist: seaborn
18
+
19
+ # Integrated path stability selection (IPSS)
20
+
21
+ Integrated path stability selection (IPSS) is a general method for improving feature selection algorithms that yields
22
+ more robust, accurate, and interpretable models. IPSS does this by allowing users to control the expected number of
23
+ falsely selected features, E(FP), while producing far more true positives than other versions of stability selection.
24
+ This Python implementation of IPSS applied to L1-regularized linear and logistic regression is intended for researchers
25
+ and practitioners alike, requiring only the X and y data and specification of E(FP).
26
+
27
+ ## Associated paper
28
+
29
+ arXiv:
30
+
31
+ ## Installation
32
+ ### Dependencies
33
+ ```
34
+ pip install joblib numpy scikit-learn scipy
35
+ ```
36
+ ### Installing IPSS
37
+ To install from PyPI:
38
+ ```
39
+ pip install ipss
40
+ ```
41
+ To clone from GitHub:
42
+ ```
43
+ git clone git@github.com:omelikechi/ipss.git
44
+ ```
45
+ Or clone from GitHub using HTTPS:
46
+ ```
47
+ git clone https://github.com/omelikechi/ipss.git
48
+ ```
49
+
50
+ ## Usage
51
+ Given an n-by-p matrix of features, X (n = number of samples, p = number of features), an n-by-1 vector of responses, y, and a target number of expected false positives, EFP:
52
+ ```python
53
+ from ipss import ipss
54
+
55
+ # Load data X and y
56
+ # Specify expected number of false positives (EFP)
57
+ # Run IPSS:
58
+ result = ipss(X, y, EFP)
59
+
60
+ # Result analysis
61
+ print(result['selected_features']) # features selected by IPSS
62
+ ```
63
+
64
+ ### Results
65
+ `result` is a dictionary containing:
66
+ - `alphas`: Grid of regularization parameters (array of shape `(n_alphas,)`).
67
+ - `average_select`: Average number of features selected at each regularization (array of shape `(n_alphas,)`).
68
+ - `scores`: IPSS score for each feature (array of shape `(p,)`).
69
+ - `selected_features`: Indices of features selected by IPSS (list of ints).
70
+ - `stability_paths`: Estimated selection probabilities at each regularization (array of shape `(n_alphas, p)`)
71
+ - `stop_index`: Index of regularization value at which IPSS threshold is passed (int).
72
+ - `threshold`: The calculated threshold value tau = Integral value / EFP (scalar).
73
+
74
+ ### Full list of arguments
75
+ `ipss` takes the following arguments (only `X` and `y` are required, and typically only `EFP` is specified):
76
+ - `X`: Features (array of shape `(n,p)`).
77
+ - `y`: Responses (array of shape `(n,)` or `(n, 1)`). IPSS automatically detects if `y` is continuous or binary.
78
+ - `EFP`: Target expected number of false positives (positive scalar; default is `1`).
79
+ - `cutoff`: Together with `EFP`, determines IPSS threshold (positive scalar; default is `0.05`).
80
+ - `B`: Number of subsampling steps (int; default is `50`).
81
+ - `n_alphas`: Number of values in regularization grid (int; default is `25`).
82
+ - `q_max`: Max number of features selected (int; default is `None`, in which case `q_max = p/2`).
83
+ - `Z_sparse`: If `True`, tensor of subsamples, `Z`, is sparse (default is `False`).
84
+ - `lars`: Implements least angle regression (LARS) for linear regression if `True`, lasso otherwise (default is `False`).
85
+ - `selection_function`: Function to apply to the stability paths. If a positive int, `m`, function is `h_m(x) = (2x - 1)**m` if `x >= 0.5` and `0` if `x < 0.5` (int, callable, or `None`; default is `None`, in which case function is `h_2` if y is binary, or `h_3` if continuous).
86
+ - `with_stability`: If `True`, uses a stability measure in selection process (default is `False`).
87
+ - `delta`: Determines scaling of regularization interval (scalar; default is `1`).
88
+ - `standardize_X`: If `True`, standardizes all features (default is `True`).
89
+ - `center_y`: If `True`, centers `y` when it is continuous (default is `True`).
90
+
91
+ ## Examples
92
+ Examples are available in the `examples` folder. These include
93
+ - A simple example in which features are simulated independently from a standard normal distribution.
94
+ - An example using prostate cancer data, as detailed in the associated paper.
95
+ - An example using colon cancer data, as detailed in the associated paper.
96
+
97
+
98
+
99
+
100
+
101
+
102
+
103
+
@@ -0,0 +1,8 @@
1
+ ipss/__init__.py,sha256=KCWzcd-7A4MYOz412le3nAdSFu0SQnWCJenuTZh5SCs,22
2
+ ipss/ipss.py,sha256=iiTVuzPJIs_CwQdiuwMupu3zPFKRSKc_qXd_aivNHwg,9341
3
+ src/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ src/ipss.py,sha256=iiTVuzPJIs_CwQdiuwMupu3zPFKRSKc_qXd_aivNHwg,9341
5
+ ipss-0.3.0.dist-info/METADATA,sha256=7SZSiuHA2T2baPIBp25s5pbaOl4hOC6YoGkedARi_HY,4439
6
+ ipss-0.3.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
7
+ ipss-0.3.0.dist-info/top_level.txt,sha256=5MkzHNB1kaiW99M-cpQ4F7iPYaJmgF7_IyvcwOic9WI,5
8
+ ipss-0.3.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: bdist_wheel (0.43.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ ipss
src/__init__.py ADDED
File without changes
src/ipss.py ADDED
@@ -0,0 +1,343 @@
1
+ # Integrated path stability selection (IPSS)
2
+
3
+ import warnings
4
+
5
+ from joblib import Parallel, delayed
6
+ import numpy as np
7
+ from scipy.sparse import csr_matrix
8
+ from sklearn.linear_model import lars_path, Lasso, lasso_path, LogisticRegression
9
+ from sklearn.preprocessing import StandardScaler
10
+
11
+
12
+ #--------------------------------
13
+ # IPSS regression
14
+ #--------------------------------
15
+ '''
16
+ Inputs:
17
+ X: n-by-p data matrix (n = number of samples, p = number of features)
18
+ y: n-by-1 response vector (binary or continuous)
19
+ EFP: Target value for expected number of false positives
20
+ cutoff: Positive scalar C that, together with EFP, determines IPSS threshold
21
+ B: Number of subsampling steps
22
+ n_alphas: Number of values in grid of regularization parameters
23
+ q_max: Maximum number of features selected
24
+ Z_sparse: n_alphas-by-B-by-p tensor of subsamples, Z, is output as sparse if 'True'
25
+ lars: Uses least angle regression (LARS) for linear regression if 'True' or lasso if 'False'
26
+ selection_function: Function to apply to the estimated selection probabilities. If equal to
27
+ an integer, m, then function is h_m(x) = (2x - 1)**m if x >= 0.5 and 0 if x < 0.5
28
+ with_stability: Uses stability measure if 'True'
29
+ delta: Scalar value that determines scaling of regularization interval. delta = 1 corresponds
30
+ to log scale, delta = 0 corresponds to linear scale
31
+ '''
32
+ def ipss(X, y,
33
+ EFP=1,
34
+ cutoff=0.05,
35
+ B=50,
36
+ n_alphas=25,
37
+ q_max=None,
38
+ Z_sparse=False,
39
+ lars=False,
40
+ selection_function=None,
41
+ with_stability=False,
42
+ delta=1,
43
+ standardize_X=True,
44
+ center_y=True
45
+ ):
46
+
47
+ if len(y.shape) != 1:
48
+ if y.shape[1] == 1:
49
+ y = y.ravel()
50
+ else:
51
+ raise ValueError("Error: Response y must be a numpy array with shape (n,) or (n,1)")
52
+
53
+ if standardize_X:
54
+ X = StandardScaler().fit_transform(X)
55
+
56
+ n, p = X.shape
57
+ n_split = int(n/2)
58
+
59
+ # check if response is binary
60
+ binary_response = (len(np.unique(y)) == 2)
61
+
62
+ # maximum number of features
63
+ if q_max is None:
64
+ q_max = p / 2
65
+
66
+ # compute alphas
67
+ alphas = compute_alphas(X, y, n_alphas, q_max, binary_response)
68
+
69
+ # linear regression
70
+ if not binary_response:
71
+
72
+ if center_y:
73
+ y -= np.mean(y)
74
+
75
+ if lars:
76
+ def process_b(b):
77
+ indices = np.arange(n)
78
+ np.random.shuffle(indices)
79
+
80
+ z = np.empty((n_alphas, 2, p))
81
+
82
+ for half in range(2):
83
+ idx = indices[:n_split] if half == 0 else indices[n_split:]
84
+ X_half, y_half = X[idx,:], y[idx]
85
+
86
+ with warnings.catch_warnings():
87
+ warnings.simplefilter('ignore')
88
+ lars_alphas, _, coefs = lars_path(X_half, y_half, method='lasso')
89
+
90
+ for i, alpha in enumerate(alphas):
91
+ idx_alpha = np.abs(lars_alphas - alpha).argmin()
92
+ coef = coefs[:, idx_alpha]
93
+
94
+ z[i, half, :] = (coef != 0).astype(int)
95
+
96
+ return z
97
+
98
+ else:
99
+ def process_b(b):
100
+ indices = np.arange(n)
101
+ np.random.shuffle(indices)
102
+
103
+ z = np.empty((n_alphas, 2, p))
104
+
105
+ for half in range(2):
106
+ idx = indices[:n_split] if half == 0 else indices[n_split:]
107
+ X_half, y_half = X[idx,:], y[idx]
108
+
109
+ with warnings.catch_warnings():
110
+ warnings.simplefilter('ignore')
111
+ _, coef, _ = lasso_path(X_half, y_half, alphas=alphas)
112
+ z[:, half, :] = (coef.T != 0).astype(int)
113
+
114
+ return z
115
+
116
+ # logistic regression
117
+ else:
118
+ def process_b(b):
119
+ indices = np.arange(n)
120
+ np.random.shuffle(indices)
121
+
122
+ z = np.empty((n_alphas, 2, p))
123
+
124
+ model = LogisticRegression(penalty='l1', solver='saga', tol=1e-3, warm_start=True, class_weight='balanced')
125
+ # model = LogisticRegression(penalty='l1', max_iter=int(1e6), solver='liblinear', class_weight='balanced')
126
+
127
+ for half in range(2):
128
+ idx = indices[:n_split] if half == 0 else indices[n_split:]
129
+ X_half, y_half = X[idx,:], y[idx]
130
+
131
+ for i, alpha in enumerate(alphas):
132
+
133
+ model.set_params(C=1/alpha)
134
+ with warnings.catch_warnings(record=True) as w:
135
+ warnings.simplefilter('ignore')
136
+ fit = model.fit(X_half, y_half.ravel())
137
+ z[i, half, :] = (fit.coef_ != 0).astype(int)
138
+
139
+ return z
140
+
141
+ # parallelize subsampling across multiple CPU cores
142
+ results = np.array(Parallel(n_jobs=-1)(delayed(process_b)(b) for b in range(B)))
143
+
144
+ # aggregate results
145
+ Z = np.zeros((n_alphas,2*B,p))
146
+ for b in range(B):
147
+ Z[:, 2*b:2*(b + 1), :] = results[b,:,:,:]
148
+
149
+ # stop at max features, q_max
150
+ stop_index = n_alphas
151
+ average_select = np.empty(n_alphas)
152
+ for i in range(n_alphas):
153
+ z = Z[i,:,:]
154
+ average_select[i] = np.mean(np.sum(z,axis=1))
155
+ if average_select[i] > q_max:
156
+ stop_index = i
157
+ break
158
+
159
+ Z = Z[:stop_index,:,:]
160
+ alphas = alphas[:stop_index]
161
+ average_select = average_select[:stop_index]
162
+
163
+ if Z_sparse:
164
+ Z_sparse = np.empty((stop_index,), dtype=object)
165
+ for i in range(stop_index):
166
+ Z_sparse[i] = csr_matrix(Z[i,:,:])
167
+ Z = Z_sparse
168
+
169
+ # ipss
170
+ if selection_function is None:
171
+ if binary_response:
172
+ selection_function = 2
173
+ else:
174
+ selection_function = 3
175
+
176
+ stability_paths, scores, integral, alphas, stop_index = ipss_results(Z, alphas, average_select, selection_function, with_stability, delta, cutoff)
177
+
178
+ threshold = integral / EFP
179
+ selected_features = np.where(scores >= threshold)[0]
180
+
181
+
182
+ return {'alphas':alphas, 'average_select':average_select, 'scores':scores, 'selected_features':selected_features,
183
+ 'stability_paths':stability_paths, 'stop_index':stop_index, 'threshold':threshold}
184
+
185
+
186
+ #--------------------------------
187
+ # IPSS scores
188
+ #--------------------------------
189
+ def ipss_results(Z, alphas, average_select, selection_function, with_stability, delta, cutoff):
190
+
191
+ n_alphas = Z.shape[0]
192
+ B, p = Z[0].shape
193
+ B /= 2
194
+
195
+ # function
196
+ if isinstance(selection_function, (int, float)):
197
+ m = selection_function
198
+ def selection_function(x):
199
+ return 0 if x <= 0.5 else (2*x - 1)**m
200
+ else:
201
+ m = 'user_defined'
202
+
203
+ # stability paths
204
+ stability_paths = np.empty((n_alphas,p))
205
+ for i in range(n_alphas):
206
+ stability_paths[i] = Z[i].mean(axis=0)
207
+
208
+ # stability measure
209
+ if with_stability:
210
+ stability_values = np.array([stability(Z[i]) for i in range(n_alphas)])
211
+ normalizer, _ = integrate(stability_values, alphas, delta)
212
+ stability_values /= normalizer
213
+ else:
214
+ stability_values = np.ones(n_alphas)
215
+
216
+ # evaluate ipss bounds for specific functions
217
+ if m == 1:
218
+ integral, stop_index = integrate(stability_values * average_select**2 / p, alphas, delta, cutoff=cutoff)
219
+ elif m == 2:
220
+ term1 = average_select**2 / (p * B)
221
+ term2 = (B-1) * average_select**4 / (B * p**3)
222
+ integral, stop_index = integrate(stability_values * (term1 + term2), alphas, delta, cutoff=cutoff)
223
+ elif m == 3:
224
+ term1 = average_select**2 / (p * B**2)
225
+ term2 = (3 * (B-1) * average_select**4) / (p**3 * B**2)
226
+ term3 = ((B-1) * (B-2) * average_select**6) / (p**5 * B**2)
227
+ integral, stop_index = integrate(stability_values * (term1 + term2 + term3), alphas, delta, cutoff=cutoff)
228
+ else:
229
+ integral = cutoff
230
+ stop_index = len(alphas)
231
+
232
+ # compute ipss scores
233
+ alphas_stop = alphas[:stop_index]
234
+ scores = np.zeros(p)
235
+ for i in range(p):
236
+ values = np.empty(stop_index)
237
+ for j in range(stop_index):
238
+ values[j] = stability_values[j] * selection_function(stability_paths[j,i])
239
+ scores[i], _ = integrate(values, alphas_stop, delta)
240
+
241
+
242
+ return stability_paths, scores, integral, alphas, stop_index
243
+
244
+
245
+ #--------------------------------
246
+ # Helpers
247
+ #--------------------------------
248
+ def compute_alphas(X, y, n_alphas, q_max, binary_response=False):
249
+ n, p = X.shape
250
+
251
+ if binary_response:
252
+ y_mean = np.mean(y)
253
+ scaled_residuals = y - y_mean * (1 - y_mean)
254
+ alpha_max = 5 / np.max(np.abs(np.dot(X.T, scaled_residuals) / n))
255
+ model = LogisticRegression(penalty='l1', solver='saga', tol=1e-3, warm_start=True, class_weight='balanced')
256
+ # model = LogisticRegression(penalty='l1', max_iter=int(1e6), solver='liblinear', class_weight='balanced')
257
+ else:
258
+ alpha_max = 2 * np.max(np.abs(np.dot(X.T,y))) / n
259
+ model = Lasso(warm_start=True)
260
+
261
+ alpha_min = alpha_max * 1e-10
262
+ test_alphas = np.logspace(np.log10(alpha_max), np.log10(alpha_min), 100)
263
+
264
+ for i, alpha in enumerate(test_alphas):
265
+ if binary_response:
266
+ model.set_params(C=1/alpha)
267
+ else:
268
+ model.set_params(alpha=alpha)
269
+ with warnings.catch_warnings():
270
+ warnings.simplefilter('ignore')
271
+ model.fit(X,y)
272
+ num_selected = np.sum(model.coef_ != 0)
273
+ if num_selected >= q_max:
274
+ alpha_min = alpha
275
+ break
276
+
277
+ alphas = np.logspace(np.log10(alpha_max), np.log10(alpha_min), n_alphas)
278
+
279
+ return alphas
280
+
281
+
282
+ def integrate(values, alphas, delta=1, cutoff=None):
283
+
284
+ n_alphas = len(alphas)
285
+ a = min(alphas)
286
+ b = max(alphas)
287
+
288
+ if delta == 1:
289
+ normalization = (1 - (a/b)**(1/n_alphas)) / np.log(b/a)
290
+ else:
291
+ normalization = (1 - delta) * (1 - (a/b)**(1/n_alphas)) / (b**(1-delta) - a**(1-delta))
292
+
293
+ output = 0
294
+ stop_index = n_alphas
295
+ before = stop_index
296
+
297
+ if cutoff is None:
298
+ for i in range(1,n_alphas):
299
+ weight = 1 if delta == 1 else alphas[i]**(1-delta)
300
+ output += normalization * weight * values[i-1]
301
+
302
+ else:
303
+ for i in range(1,n_alphas):
304
+ weight = 1 if delta == 1 else alphas[i]**(1-delta)
305
+ updated_output = output + normalization * weight * values[i-1]
306
+ if updated_output > cutoff:
307
+ stop_index = i
308
+ break
309
+ else:
310
+ output = updated_output
311
+
312
+ return output, stop_index
313
+
314
+
315
+ def stability(z):
316
+ B, d = np.shape(z)
317
+ prob = np.mean(z,axis=0)
318
+ prob = np.squeeze(np.asarray(prob))
319
+ k_hat = np.mean(prob)
320
+ numerator = np.mean(prob * (1 - prob))
321
+ denominator = k_hat * (1 - k_hat)
322
+ if denominator > 1e-8:
323
+ frac = numerator/denominator
324
+ else:
325
+ frac = 1
326
+
327
+ return 1 - frac
328
+
329
+
330
+
331
+
332
+
333
+
334
+
335
+
336
+
337
+
338
+
339
+
340
+
341
+
342
+
343
+