py2ls 0.2.4.7__py3-none-any.whl → 0.2.4.9__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registries, and is provided for informational purposes only.
- py2ls/.git/index +0 -0
- py2ls/batman.py +32 -1
- py2ls/bio.py +3 -17
- py2ls/data/usages_sns.json +2 -1
- py2ls/ips.py +1694 -838
- py2ls/ml2ls.py +1877 -391
- py2ls/plot.py +500 -222
- {py2ls-0.2.4.7.dist-info → py2ls-0.2.4.9.dist-info}/METADATA +1 -1
- {py2ls-0.2.4.7.dist-info → py2ls-0.2.4.9.dist-info}/RECORD +10 -10
- {py2ls-0.2.4.7.dist-info → py2ls-0.2.4.9.dist-info}/WHEEL +1 -1
py2ls/ml2ls.py CHANGED
@@ -1,33 +1,59 @@
-from sklearn.ensemble import
+from sklearn.ensemble import (
+    RandomForestClassifier,
+    GradientBoostingClassifier,
+    AdaBoostClassifier,
+    BaggingClassifier,
+)
+from sklearn.svm import SVC,SVR
 from sklearn.calibration import CalibratedClassifierCV
-from sklearn.model_selection import GridSearchCV,StratifiedKFold
-from sklearn.linear_model import
+from sklearn.model_selection import GridSearchCV, StratifiedKFold
+from sklearn.linear_model import (
+    LassoCV,
+    LogisticRegression,LinearRegression,
+    Lasso,
+    Ridge,
+    RidgeClassifierCV,
+    ElasticNet,
+)
 from sklearn.feature_selection import RFE
 from sklearn.naive_bayes import GaussianNB
 from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
 import xgboost as xgb # Make sure you have xgboost installed

 from sklearn.model_selection import train_test_split, cross_val_score
-from sklearn.metrics import (
+from sklearn.metrics import (
+    accuracy_score,
+    precision_score,
+    recall_score,
+    f1_score,
+    roc_auc_score,
+    confusion_matrix,
+    matthews_corrcoef,
+    roc_curve,
+    auc,
+    balanced_accuracy_score,
+    precision_recall_curve,
+    average_precision_score,
+)
 from imblearn.over_sampling import SMOTE
 from sklearn.pipeline import Pipeline
 from collections import defaultdict
-from sklearn.preprocessing import StandardScaler
-from typing import Dict, Any, Optional,List
+from sklearn.preprocessing import StandardScaler, OneHotEncoder
+from typing import Dict, Any, Optional, List, Union
 import numpy as np
 import pandas as pd
-from . import ips
+from . import ips
 from . import plot
 import matplotlib.pyplot as plt
 import seaborn as sns
+
+plt.style.use("paper")
 import logging
 import warnings
+
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
 logger = logging.getLogger()

 # Ignore specific warnings (UserWarning in this case)
@@ -35,7 +61,10 @@ warnings.filterwarnings("ignore", category=UserWarning)
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.neighbors import KNeighborsClassifier

+
+def features_knn(
+    x_train: pd.DataFrame, y_train: pd.Series, knn_params: dict
+) -> pd.DataFrame:
     """
     A distance-based classifier that assigns labels based on the majority label of nearest neighbors.
     when to use:
@@ -46,76 +75,99 @@ def features_knn(X_train: pd.DataFrame, y_train: pd.Series, knn_params: dict) ->
     Fits KNeighborsClassifier and approximates feature influence using permutation importance.
     """
     knn = KNeighborsClassifier(**knn_params)
-    knn.fit(
-    importances = permutation_importance(
+    knn.fit(x_train, y_train)
+    importances = permutation_importance(
+        knn, x_train, y_train, n_repeats=30, random_state=1, scoring="accuracy"
+    )
+    return pd.DataFrame(
+        {"feature": x_train.columns, "importance": importances.importances_mean}
+    ).sort_values(by="importance", ascending=False)
+

 #! 1. Linear and Regularized Regression Methods
 # 1.1 Lasso
-def features_lasso(
+def features_lasso(
+    x_train: pd.DataFrame, y_train: pd.Series, lasso_params: dict
+) -> np.ndarray:
     """
-    Lasso (Least Absolute Shrinkage and Selection Operator):
-    A regularized linear regression method that uses L1 penalty to shrink coefficients, effectively
+    Lasso (Least Absolute Shrinkage and Selection Operator):
+    A regularized linear regression method that uses L1 penalty to shrink coefficients, effectively
     performing feature selection by zeroing out less important ones.
     """
     lasso = LassoCV(**lasso_params)
-    lasso.fit(
+    lasso.fit(x_train, y_train)
     # Get non-zero coefficients and their corresponding features
     coefficients = lasso.coef_
-    importance_df = pd.DataFrame(
-        "feature":
+    importance_df = pd.DataFrame(
+        {"feature": x_train.columns, "importance": np.abs(coefficients)}
+    )
+    return importance_df[importance_df["importance"] > 0].sort_values(
+        by="importance", ascending=False
+    )
+

 # 1.2 Ridge regression
-def features_ridge(
+def features_ridge(
+    x_train: pd.DataFrame, y_train: pd.Series, ridge_params: dict
+) -> np.ndarray:
     """
-    Ridge Regression: A linear regression technique that applies L2 regularization, reducing coefficient
+    Ridge Regression: A linear regression technique that applies L2 regularization, reducing coefficient
     magnitudes to avoid overfitting, especially with multicollinearity among features.
     """
     from sklearn.linear_model import RidgeCV
+
     ridge = RidgeCV(**ridge_params)
-    ridge.fit(
+    ridge.fit(x_train, y_train)
+
     # Get the coefficients
     coefficients = ridge.coef_
+
     # Create a DataFrame to hold feature importance
-    importance_df = pd.DataFrame(
-        "feature":
+    importance_df = pd.DataFrame(
+        {"feature": x_train.columns, "importance": np.abs(coefficients)}
+    )
+    return importance_df[importance_df["importance"] > 0].sort_values(
+        by="importance", ascending=False
+    )
+

 # 1.3 Elastic Net(Enet)
-def features_enet(
+def features_enet(
+    x_train: pd.DataFrame, y_train: pd.Series, enet_params: dict
+) -> np.ndarray:
     """
-    Elastic Net (Enet): Combines L1 and L2 penalties (lasso and ridge) in a linear model, beneficial
+    Elastic Net (Enet): Combines L1 and L2 penalties (lasso and ridge) in a linear model, beneficial
     when features are highly correlated or for datasets with more features than samples.
     """
     from sklearn.linear_model import ElasticNetCV
+
     enet = ElasticNetCV(**enet_params)
-    enet.fit(
+    enet.fit(x_train, y_train)
     # Get the coefficients
     coefficients = enet.coef_
     # Create a DataFrame to hold feature importance
-    importance_df = pd.DataFrame(
-        "feature":
+    importance_df = pd.DataFrame(
+        {"feature": x_train.columns, "importance": np.abs(coefficients)}
+    )
+    return importance_df[importance_df["importance"] > 0].sort_values(
+        by="importance", ascending=False
+    )
+
+
+# 1.4 Partial Least Squares Regression for Generalized Linear Models (plsRglm): Combines regression and
 # feature reduction, useful for high-dimensional data with correlated features, such as genomics.

 #! 2.Generalized Linear Models and Extensions
-# 2.1
+# 2.1
+

 #!3.Tree-Based and Ensemble Methods
 # 3.1 Random Forest(RF)
-def features_rf(
+def features_rf(
+    x_train: pd.DataFrame, y_train: pd.Series, rf_params: dict
+) -> np.ndarray:
     """
-    An ensemble of decision trees that combines predictions from multiple trees for classification or
+    An ensemble of decision trees that combines predictions from multiple trees for classification or
     regression, effective with high-dimensional, complex datasets.
     when to use:
     Handles high-dimensional data well.
@@ -125,36 +177,55 @@ def features_rf(X_train: pd.DataFrame, y_train: pd.Series, rf_params: dict) -> n
     Recommended Use: Great for classification problems, especially when you have many features (genes).
     """
     rf = RandomForestClassifier(**rf_params)
-    rf.fit(
-    return pd.DataFrame(
+    rf.fit(x_train, y_train)
+    return pd.DataFrame(
+        {"feature": x_train.columns, "importance": rf.featuress_}
+    ).sort_values(by="importance", ascending=False)
+
+
 # 3.2 Gradient Boosting Trees
-def features_gradient_boosting(
+def features_gradient_boosting(
+    x_train: pd.DataFrame, y_train: pd.Series, gb_params: dict
+) -> pd.DataFrame:
     """
-    An ensemble of decision trees that combines predictions from multiple trees for classification or regression, effective with
+    An ensemble of decision trees that combines predictions from multiple trees for classification or regression, effective with
     high-dimensional, complex datasets.
     Gradient Boosting
     Strengths:
     High predictive accuracy and works well for both classification and regression.
     Can handle a mixture of numerical and categorical features.
-    Recommended Use:
+    Recommended Use:
     Effective for complex relationships and when you need a powerful predictive model.
     Fit Gradient Boosting classifier and return sorted feature importances.
     Recommended Use: Effective for complex datasets with many features (genes).
     """
     gb = GradientBoostingClassifier(**gb_params)
-    gb.fit(
-    return pd.DataFrame(
+    gb.fit(x_train, y_train)
+    return pd.DataFrame(
+        {"feature": x_train.columns, "importance": gb.feature_importances_}
+    ).sort_values(by="importance", ascending=False)
+
+
 # 3.3 XGBoost
-def features_xgb(
+def features_xgb(
+    x_train: pd.DataFrame, y_train: pd.Series, xgb_params: dict
+) -> pd.DataFrame:
     """
     XGBoost: An advanced gradient boosting technique, faster and more efficient than GBM, with excellent predictive performance on structured data.
     """
     import xgboost as xgb
+
     xgb_model = xgb.XGBClassifier(**xgb_params)
-    xgb_model.fit(
-    return pd.DataFrame(
+    xgb_model.fit(x_train, y_train)
+    return pd.DataFrame(
+        {"feature": x_train.columns, "importance": xgb_model.feature_importances_}
+    ).sort_values(by="importance", ascending=False)
+
+
 # 3.4.decision tree
-def features_decision_tree(
+def features_decision_tree(
+    x_train: pd.DataFrame, y_train: pd.Series, dt_params: dict
+) -> pd.DataFrame:
     """
     A single decision tree classifier effective for identifying key decision boundaries in data.
     when to use:
@@ -162,58 +233,76 @@ def features_decision_tree(X_train: pd.DataFrame, y_train: pd.Series, dt_params:
     Provides feature importance scores for each feature, though it may overfit on small datasets.
     Efficient for low to medium-sized datasets, where interpretability of decisions is key.
     Recommended Use: Useful for interpretable feature importance analysis in smaller or balanced datasets.
+
     Fits DecisionTreeClassifier and returns sorted feature importances.
     """
     dt = DecisionTreeClassifier(**dt_params)
-    dt.fit(
-    return pd.DataFrame(
+    dt.fit(x_train, y_train)
+    return pd.DataFrame(
+        {"feature": x_train.columns, "importance": dt.feature_importances_}
+    ).sort_values(by="importance", ascending=False)
+
+
 # 3.5 bagging
-def features_bagging(
+def features_bagging(
+    x_train: pd.DataFrame, y_train: pd.Series, bagging_params: dict
+) -> pd.DataFrame:
     """
-    A bagging ensemble of
+    A bagging ensemble of models, often used with weak learners like decision trees, to reduce variance.
     when to use:
     Helps reduce overfitting, especially on high-variance models.
     Effective when the dataset has numerous features and may benefit from ensemble stability.
     Recommended Use: Beneficial for high-dimensional or noisy datasets needing ensemble stability.
+
     Fits BaggingClassifier and returns averaged feature importances from underlying estimators if available.
     """
     bagging = BaggingClassifier(**bagging_params)
-    bagging.fit(
+    bagging.fit(x_train, y_train)
+
     # Calculate feature importance by averaging importances across estimators, if feature_importances_ is available.
     if hasattr(bagging.estimators_[0], "feature_importances_"):
-        importances = np.mean(
+        importances = np.mean(
+            [estimator.feature_importances_ for estimator in bagging.estimators_],
+            axis=0,
+        )
+        return pd.DataFrame(
+            {"feature": x_train.columns, "importance": importances}
+        ).sort_values(by="importance", ascending=False)
     else:
         # If the base estimator does not support feature importances, fallback to permutation importance.
-        importances = permutation_importance(
+        importances = permutation_importance(
+            bagging, x_train, y_train, n_repeats=30, random_state=1, scoring="accuracy"
+        )
+        return pd.DataFrame(
+            {"feature": x_train.columns, "importance": importances.importances_mean}
+        ).sort_values(by="importance", ascending=False)
+

 #! 4.Support Vector Machines
-def features_svm(
+def features_svm(
+    x_train: pd.DataFrame, y_train: pd.Series, rfe_params: dict
+) -> np.ndarray:
     """
     Suitable for classification tasks where the number of features is much larger than the number of samples.
     1. Effective in high-dimensional spaces and with clear margin of separation.
     2. Works well for both linear and non-linear classification (using kernel functions).
-    Select features using RFE with SVM.When combined with SVM, RFE selects features that are most critical for the decision boundary,
+    Select features using RFE with SVM.When combined with SVM, RFE selects features that are most critical for the decision boundary,
     helping reduce the dataset to a more manageable size without losing much predictive power.
-    SVM (Support Vector Machines),supports various kernels (linear, rbf, poly, and sigmoid), is good at handling high-dimensional
+    SVM (Support Vector Machines),supports various kernels (linear, rbf, poly, and sigmoid), is good at handling high-dimensional
     data and finding an optimal decision boundary between classes, especially when using the right kernel.
     kernel: ["linear", "rbf", "poly", "sigmoid"]
-    'linear': simplest kernel that attempts to separate data by drawing a straight line (or hyperplane) between classes. It is effective
+    'linear': simplest kernel that attempts to separate data by drawing a straight line (or hyperplane) between classes. It is effective
     when the data is linearly separable, meaning the classes can be well divided by a straight boundary.
     Advantages:
     - Computationally efficient for large datasets.
-    - Works well when the number of features is high, which is common in genomic data where you may have thousands of genes
+    - Works well when the number of features is high, which is common in genomic data where you may have thousands of genes
     as features.
-    'rbf': a nonlinear kernel that maps the input data into a higher-dimensional space to find a decision boundary. It works well for
+    'rbf': a nonlinear kernel that maps the input data into a higher-dimensional space to find a decision boundary. It works well for
     data that is not linearly separable in its original space.
-    Advantages:
+    Advantages:
     - Handles nonlinear relationships between features and classes
     - Often better than a linear kernel when there is no clear linear decision boundary in the data.
-    'poly': Polynomial Kernel: computes similarity between data points based on polynomial functions of the input features. It can model
+    'poly': Polynomial Kernel: computes similarity between data points based on polynomial functions of the input features. It can model
     interactions between features to a certain degree, depending on the polynomial degree chosen.
     Advantages:
     - Allows modeling of feature interactions.
@@ -221,58 +310,80 @@ def features_svm(X_train: pd.DataFrame, y_train: pd.Series, rfe_params: dict) ->
     'sigmoid': similar to the activation function in neural networks, and it works well when the data follows an S-shaped decision boundary.
     Advantages:
     - Can approximate the behavior of neural networks.
-    - Use case: It’s not as widely used as the RBF or linear kernel but can be explored when there is some evidence of non-linear
+    - Use case: It’s not as widely used as the RBF or linear kernel but can be explored when there is some evidence of non-linear
     S-shaped relationships.
     """
     # SVM (Support Vector Machines)
-    svc = SVC(kernel=rfe_params["kernel"])
+    svc = SVC(kernel=rfe_params["kernel"]) # ["linear", "rbf", "poly", "sigmoid"]
     # RFE(Recursive Feature Elimination)
     selector = RFE(svc, n_features_to_select=rfe_params["n_features_to_select"])
-    selector.fit(
-    return
+    selector.fit(x_train, y_train)
+    return x_train.columns[selector.support_]
+
+
 #! 5.Bayesian and Probabilistic Methods
-def features_naive_bayes(
+def features_naive_bayes(x_train: pd.DataFrame, y_train: pd.Series) -> list:
     """
-    Naive Bayes: A probabilistic classifier based on Bayes' theorem, assuming independence between features, simple and fast, especially
+    Naive Bayes: A probabilistic classifier based on Bayes' theorem, assuming independence between features, simple and fast, especially
     effective for text classification and other high-dimensional data.
     """
     from sklearn.naive_bayes import GaussianNB
+
     nb = GaussianNB()
-    nb.fit(
-    probabilities = nb.predict_proba(
+    nb.fit(x_train, y_train)
+    probabilities = nb.predict_proba(x_train)
+    # Limit the number of features safely, choosing the lesser of half the features or all columns
+    n_features = min(x_train.shape[1] // 2, len(x_train.columns))
+
+    # Sort probabilities, then map to valid column indices
+    sorted_indices = np.argsort(probabilities.max(axis=1))[:n_features]
+
+    # Ensure indices are within the column bounds of x_train
+    valid_indices = sorted_indices[sorted_indices < len(x_train.columns)]
+
+    return x_train.columns[valid_indices]
+
+
 #! 6.Linear Discriminant Analysis (LDA)
-def features_lda(
+def features_lda(x_train: pd.DataFrame, y_train: pd.Series) -> list:
     """
-    Linear Discriminant Analysis (LDA): Projects data onto a lower-dimensional space to maximize class separability, often used as a dimensionality
+    Linear Discriminant Analysis (LDA): Projects data onto a lower-dimensional space to maximize class separability, often used as a dimensionality
     reduction technique before classification on high-dimensional data.
     """
     from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
+
     lda = LinearDiscriminantAnalysis()
-    lda.fit(
+    lda.fit(x_train, y_train)
     coef = lda.coef_.flatten()
     # Create a DataFrame to hold feature importance
-    importance_df = pd.DataFrame(
-        "feature":
-    })
-    return importance_df[importance_df["importance"] > 0].sort_values(by="importance", ascending=False)
+    importance_df = pd.DataFrame(
+        {"feature": x_train.columns, "importance": np.abs(coef)}
+    )

+    return importance_df[importance_df["importance"] > 0].sort_values(
+        by="importance", ascending=False
+    )
+
+
+def features_adaboost(
+    x_train: pd.DataFrame, y_train: pd.Series, adaboost_params: dict
+) -> pd.DataFrame:
     """
     AdaBoost
     Strengths:
     Combines multiple weak learners to create a strong classifier.
     Focuses on examples that are hard to classify, improving overall performance.
-    Recommended Use:
-    Can be effective for boosting weak
+    Recommended Use:
+    Can be effective for boosting weak models in a genomics context.
     Fit AdaBoost classifier and return sorted feature importances.
     Recommended Use: Great for classification problems with a large number of features (genes).
     """
     ada = AdaBoostClassifier(**adaboost_params)
-    ada.fit(
-    return pd.DataFrame(
+    ada.fit(x_train, y_train)
+    return pd.DataFrame(
+        {"feature": x_train.columns, "importance": ada.feature_importances_}
+    ).sort_values(by="importance", ascending=False)
+

 import torch
 import torch.nn as nn
@@ -280,32 +391,30 @@ import torch.optim as optim
 from torch.utils.data import DataLoader, TensorDataset
 from skorch import NeuralNetClassifier # sklearn compatible

+
 class DNNClassifier(nn.Module):
     def __init__(self, input_dim, hidden_dim=128, output_dim=2, dropout_rate=0.5):
         super(DNNClassifier, self).__init__()
+
         self.hidden_layer1 = nn.Sequential(
             nn.Linear(input_dim, hidden_dim),
             nn.ReLU(),
             nn.Dropout(dropout_rate),
             nn.Linear(hidden_dim, hidden_dim),
-            nn.ReLU()
+            nn.ReLU(),
         )
+
         self.hidden_layer2 = nn.Sequential(
-            nn.Linear(hidden_dim, hidden_dim),
-            nn.ReLU(),
-            nn.Dropout(dropout_rate)
+            nn.Linear(hidden_dim, hidden_dim), nn.ReLU(), nn.Dropout(dropout_rate)
         )
+
         # Adding a residual connection between hidden layers
         self.residual = nn.Linear(input_dim, hidden_dim)
+
         self.output_layer = nn.Sequential(
-            nn.Linear(hidden_dim, output_dim),
-            nn.Softmax(dim=1)
+            nn.Linear(hidden_dim, output_dim), nn.Softmax(dim=1)
         )
+
     def forward(self, x):
         residual = self.residual(x)
         x = self.hidden_layer1(x)
@@ -314,64 +423,77 @@ class DNNClassifier(nn.Module):
         x = self.output_layer(x)
         return x

+
+def validate_classifier(
+    clf,
+    x_train: pd.DataFrame,
+    y_train: pd.Series,
+    x_test: pd.DataFrame,
+    y_test: pd.Series,
+    metrics: list = ["accuracy", "precision", "recall", "f1", "roc_auc"],
+    cv_folds: int = 5,
+) -> dict:
     """
     Perform cross-validation for a given classifier and return average scores for specified metrics on training data.
     Then fit the best model on the full training data and evaluate it on the test set.
+
     Parameters:
     - clf: The classifier to be validated.
+    - x_train: Training features.
     - y_train: Training labels.
+    - x_test: Test features.
     - y_test: Test labels.
     - metrics: List of metrics to evaluate (e.g., ['accuracy', 'roc_auc']).
     - cv_folds: Number of cross-validation folds.
+
     Returns:
     - results: Dictionary containing average cv_train_scores and cv_test_scores.
     """
     cv_train_scores = {metric: [] for metric in metrics}
     skf = StratifiedKFold(n_splits=cv_folds)
-    # Perform cross-validation
+    # Perform cross-validation
     for metric in metrics:
         try:
             if metric == "roc_auc" and len(set(y_train)) == 2:
-                scores = cross_val_score(
+                scores = cross_val_score(
+                    clf, x_train, y_train, cv=skf, scoring="roc_auc"
+                )
+                cv_train_scores[metric] = (
+                    np.nanmean(scores) if not np.isnan(scores).all() else float("nan")
+                )
             else:
-                score = cross_val_score(clf,
+                score = cross_val_score(clf, x_train, y_train, cv=skf, scoring=metric)
                 cv_train_scores[metric] = score.mean()
         except Exception as e:
-            cv_train_scores[metric] = float(
-    clf.fit(
+            cv_train_scores[metric] = float("nan")
+    clf.fit(x_train, y_train)
+
     # Evaluate on the test set
     cv_test_scores = {}
     for metric in metrics:
         if metric == "roc_auc" and len(set(y_test)) == 2:
             try:
-                y_prob=clf.predict_proba(
-                cv_test_scores[metric] = roc_auc_score(y_test,y_prob)
+                y_prob = clf.predict_proba(x_test)[:, 1]
+                cv_test_scores[metric] = roc_auc_score(y_test, y_prob)
             except AttributeError:
-                cv_test_scores[metric]=float(
+                cv_test_scores[metric] = float("nan")
         else:
-            score_func = globals().get(
+            score_func = globals().get(
+                f"{metric}_score"
+            ) # Fetching the appropriate scoring function
             if score_func:
                 try:
-                    y_pred = clf.predict(
+                    y_pred = clf.predict(x_test)
                     cv_test_scores[metric] = score_func(y_test, y_pred)
                 except Exception as e:
-                    cv_test_scores[metric] = float(
+                    cv_test_scores[metric] = float("nan")

     # Combine results
-    results = {
-        'cv_train_scores': cv_train_scores,
-        'cv_test_scores': cv_test_scores
-    }
+    results = {"cv_train_scores": cv_train_scores, "cv_test_scores": cv_test_scores}
     return results

+
+def get_models(
     random_state=1,
     cls=[
         "lasso",
@@ -383,25 +505,36 @@ def get_classifiers(
         "Support Vector Machine(svm)",
         "naive bayes",
         "Linear Discriminant Analysis (lda)",
-        "adaboost",
+        "adaboost",
+        "DecisionTree",
+        "KNeighbors",
+        "Bagging",
     ],
 ):
     from sklearn.ensemble import (
         RandomForestClassifier,
         GradientBoostingClassifier,
         AdaBoostClassifier,
-        BaggingClassifier
+        BaggingClassifier,
     )
     from sklearn.svm import SVC
-    from sklearn.linear_model import
+    from sklearn.linear_model import (
+        LogisticRegression,
+        Lasso,
+        RidgeClassifierCV,
+        ElasticNet,
+    )
     from sklearn.naive_bayes import GaussianNB
     from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
     import xgboost as xgb
     from sklearn.tree import DecisionTreeClassifier
     from sklearn.neighbors import KNeighborsClassifier
+
     res_cls = {}
-        "Lasso": LogisticRegression(
+    model_all = {
+        "Lasso": LogisticRegression(
+            penalty="l1", solver="saga", random_state=random_state
+        ),
         "Ridge": RidgeClassifierCV(),
         "Elastic Net (Enet)": ElasticNet(random_state=random_state),
         "Gradient Boosting": GradientBoostingClassifier(random_state=random_state),
@@ -411,23 +544,25 @@ def get_classifiers(
         "Naive Bayes": GaussianNB(),
         "Linear Discriminant Analysis (LDA)": LinearDiscriminantAnalysis(),
         "AdaBoost": AdaBoostClassifier(random_state=random_state, algorithm="SAMME"),
-        "DecisionTree":DecisionTreeClassifier(),
+        "DecisionTree": DecisionTreeClassifier(),
         "KNeighbors": KNeighborsClassifier(n_neighbors=5),
         "Bagging": BaggingClassifier(),
     }
-    print("Using default
+    print("Using default models:")
     for cls_name in cls:
-        cls_name = ips.strcmp(cls_name, list(
-        res_cls[cls_name] =
+        cls_name = ips.strcmp(cls_name, list(model_all.keys()))[0]
+        res_cls[cls_name] = model_all[cls_name]
         print(f"- {cls_name}")
     return res_cls

+
 def get_features(
-    X: pd.DataFrame,
-    y: pd.Series,
+    X: Union[pd.DataFrame, np.ndarray], # n_samples X n_features
+    y: Union[pd.Series, np.ndarray, list], # n_samples X n_features
     test_size: float = 0.2,
     random_state: int = 1,
     n_features: int = 10,
+    fill_missing=True,
     rf_params: Optional[Dict] = None,
     rfe_params: Optional[Dict] = None,
     lasso_params: Optional[Dict] = None,
@@ -439,169 +574,338 @@ def get_features(
     dt_params: Optional[Dict] = None,
     bagging_params: Optional[Dict] = None,
     knn_params: Optional[Dict] = None,
-    cls: list=[
-        "lasso",
-        "
-        "Elastic Net(Enet)",
-        "gradient Boosting",
-        "Random forest (rf)",
-        "XGBoost (xgb)",
-        "Support Vector Machine(svm)",
-        "naive bayes",
-        "Linear Discriminant Analysis (lda)",
-        "adaboost","DecisionTree","KNeighbors","Bagging"
-    ],
+    cls: list = [
+        "lasso","ridge","Elastic Net(Enet)","gradient Boosting","Random forest (rf)","XGBoost (xgb)","Support Vector Machine(svm)",
+        "naive bayes","Linear Discriminant Analysis (lda)","adaboost","DecisionTree","KNeighbors","Bagging"],
     metrics: Optional[List[str]] = None,
     cv_folds: int = 5,
-    strict:bool=False,
-    n_shared:int=2,
+    strict: bool = False,
+    n_shared: int = 2, # 只要有两个方法有重合,就纳入common genes
     use_selected_features: bool = True,
+    plot_: bool = True,
+    dir_save:str="./") -> dict:
     """
-    Master function to perform feature selection and validate
+    Master function to perform feature selection and validate models.
     """
+    from sklearn.compose import ColumnTransformer
+    from sklearn.preprocessing import StandardScaler, OneHotEncoder
+
+    # Ensure X and y are DataFrames/Series for consistency
+    if isinstance(X, np.ndarray):
+        X = pd.DataFrame(X)
+    if isinstance(y, (np.ndarray, list)):
+        y = pd.Series(y)
+
+    # fill na
+    if fill_missing:
+        ips.df_fillna(data=X,method='knn',inplace=True,axis=0)
+    if isinstance(y, str) and y in X.columns:
+        y_col_name=y
+        y=X[y]
+        y=ips.df_encoder(pd.DataFrame(y),method='dummy')
+        X = X.drop(y_col_name,axis=1)
+    else:
+        y=ips.df_encoder(pd.DataFrame(y),method='dummy').values.ravel()
+    y = y.loc[X.index] # Align y with X after dropping rows with missing values in X
+    y = y.ravel() if isinstance(y, np.ndarray) else y.values.ravel()
+
+    if X.shape[0] != len(y):
+        raise ValueError("X and y must have the same number of samples (rows).")
+
+    # #! # Check for non-numeric columns in X and apply one-hot encoding if needed
+    # Check if any column in X is non-numeric
+    if any(not np.issubdtype(dtype, np.number) for dtype in X.dtypes):
+        X = pd.get_dummies(X, drop_first=True)
+    print(X.shape)
+
+    # #!alternative: # Identify categorical and numerical columns
+    # categorical_cols = X.select_dtypes(include=["object", "category"]).columns
+    # numerical_cols = X.select_dtypes(include=["number"]).columns
+
+    # # Define preprocessing pipeline
+    # preprocessor = ColumnTransformer(
+    #     transformers=[
+    #         ("num", StandardScaler(), numerical_cols),
+    #         ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), categorical_cols),
+    #     ]
+    # )
+    # # Preprocess the data
+    # X = preprocessor.fit_transform(X)
+
     # Split data into training and test sets
+    x_train, x_test, y_train, y_test = train_test_split(
         X, y, test_size=test_size, random_state=random_state
     )
     # Standardize features
     scaler = StandardScaler()
+    x_train_scaled = scaler.fit_transform(x_train)
+    x_test_scaled = scaler.transform(x_test)
+
     # Convert back to DataFrame for consistency
+    x_train = pd.DataFrame(x_train_scaled, columns=x_train.columns)
+    x_test = pd.DataFrame(x_test_scaled, columns=x_test.columns)

     rf_defaults = {"n_estimators": 100, "random_state": random_state}
     rfe_defaults = {"kernel": "linear", "n_features_to_select": n_features}
     lasso_defaults = {"alphas": np.logspace(-4, 4, 100), "cv": 10}
     ridge_defaults = {"alphas": np.logspace(-4, 4, 100), "cv": 10}
     enet_defaults = {"alphas": np.logspace(-4, 4, 100), "cv": 10}
-    xgb_defaults = {
+    xgb_defaults = {
+        "n_estimators": 100,
+        "use_label_encoder": False,
+        "eval_metric": "logloss",
+        "random_state": random_state,
+    }
     gb_defaults = {"n_estimators": 100, "random_state": random_state}
     adaboost_defaults = {"n_estimators": 50, "random_state": random_state}
     dt_defaults = {"max_depth": None, "random_state": random_state}
     bagging_defaults = {"n_estimators": 50, "random_state": random_state}
     knn_defaults = {"n_neighbors": 5}
     rf_params, rfe_params = rf_params or rf_defaults, rfe_params or rfe_defaults
-    lasso_params, ridge_params =
+    lasso_params, ridge_params = (
+        lasso_params or lasso_defaults,
+        ridge_params or ridge_defaults,
+    )
     enet_params, xgb_params = enet_params or enet_defaults, xgb_params or xgb_defaults
-    gb_params, adaboost_params =
+    gb_params, adaboost_params = (
+        gb_params or gb_defaults,
+        adaboost_params or adaboost_defaults,
+    )
     dt_params = dt_params or dt_defaults
     bagging_params = bagging_params or bagging_defaults
     knn_params = knn_params or knn_defaults

-    cls_
+    cls_ = [
+        "lasso",
+        "ridge",
+        "Elastic Net(Enet)",
+        "Gradient Boosting",
+        "Random Forest (rf)",
+        "XGBoost (xgb)",
+        "Support Vector Machine(svm)",
+        "Naive Bayes",
+        "Linear Discriminant Analysis (lda)",
+        "adaboost",
+    ]
+    cls = [ips.strcmp(i, cls_)[0] for i in cls]

     # Lasso Feature Selection
-    lasso_importances =
+    lasso_importances = (
+        features_lasso(x_train, y_train, lasso_params)
+        if "lasso" in cls
+        else pd.DataFrame()
+    )
+    lasso_selected_features = (
+        lasso_importances.head(n_features)["feature"].values if "lasso" in cls else []
+    )
+    # Ridge
+    ridge_importances = (
+        features_ridge(x_train, y_train, ridge_params)
+        if "ridge" in cls
+        else pd.DataFrame()
+    )
+    selected_ridge_features = (
+        ridge_importances.head(n_features)["feature"].values if "ridge" in cls else []
+    )
     # Elastic Net
-    enet_importances=
+    enet_importances = (
+        features_enet(x_train, y_train, enet_params)
+        if "Enet" in cls
+        else pd.DataFrame()
+    )
+    selected_enet_features = (
+        enet_importances.head(n_features)["feature"].values if "Enet" in cls else []
+    )
+    # Random Forest Feature Importance
+    rf_importances = (
+        features_rf(x_train, y_train, rf_params)
+        if "Random Forest" in cls
+        else pd.DataFrame()
+    )
+    top_rf_features = (
+        rf_importances.head(n_features)["feature"].values
+        if "Random Forest" in cls
+        else []
+    )
+    # Gradient Boosting Feature Importance
+    gb_importances = (
+        features_gradient_boosting(x_train, y_train, gb_params)
+        if "Gradient Boosting" in cls
+        else pd.DataFrame()
+    )
+    top_gb_features = (
+        gb_importances.head(n_features)["feature"].values
+        if "Gradient Boosting" in cls
+        else []
+    )
     # xgb
-    xgb_importances =
+    xgb_importances = (
+        features_xgb(x_train, y_train, xgb_params) if "xgb" in cls else pd.DataFrame()
+    )
+    top_xgb_features = (
+        xgb_importances.head(n_features)["feature"].values if "xgb" in cls else []
+    )
+
+    # SVM with RFE
+    selected_svm_features = (
+        features_svm(x_train, y_train, rfe_params) if "svm" in cls else []
+    )
     # Naive Bayes
-    selected_naive_bayes_features=
+    selected_naive_bayes_features = (
+        features_naive_bayes(x_train, y_train) if "Naive Bayes" in cls else []
+    )
     # lda: linear discriminant analysis
-    lda_importances=features_lda(
-    selected_lda_features=
+    lda_importances = features_lda(x_train, y_train) if "lda" in cls else pd.DataFrame()
+    selected_lda_features = (
+        lda_importances.head(n_features)["feature"].values if "lda" in cls else []
+    )
+    # AdaBoost Feature Importance
+    adaboost_importances = (
+        features_adaboost(x_train, y_train, adaboost_params)
+        if "AdaBoost" in cls
+        else pd.DataFrame()
+    )
+    top_adaboost_features = (
+        adaboost_importances.head(n_features)["feature"].values
+        if "AdaBoost" in cls
+        else []
+    )
     # Decision Tree Feature Importance
-    dt_importances =
+    dt_importances = (
+        features_decision_tree(x_train, y_train, dt_params)
+        if "Decision Tree" in cls
+        else pd.DataFrame()
+    )
+    top_dt_features = (
+        dt_importances.head(n_features)["feature"].values
+        if "Decision Tree" in cls
+        else []
+    )
     # Bagging Feature Importance
-    bagging_importances =
+    bagging_importances = (
+        features_bagging(x_train, y_train, bagging_params)
+        if "Bagging" in cls
+        else pd.DataFrame()
+    )
+    top_bagging_features = (
+        bagging_importances.head(n_features)["feature"].values
+        if "Bagging" in cls
+        else []
+    )
     # KNN Feature Importance via Permutation
-    knn_importances =
+    knn_importances = (
+        features_knn(x_train, y_train, knn_params) if "KNN" in cls else pd.DataFrame()
+    )
+    top_knn_features = (
+        knn_importances.head(n_features)["feature"].values if "KNN" in cls else []
+    )

     #! Find common features
-    common_features = ips.shared(
+    common_features = ips.shared(
+        lasso_selected_features,
+        selected_ridge_features,
+        selected_enet_features,
+        top_rf_features,
+        top_gb_features,
+        top_xgb_features,
+        selected_svm_features,
+        selected_naive_bayes_features,
+        selected_lda_features,
+        top_adaboost_features,
+        top_dt_features,
+        top_bagging_features,
+        top_knn_features,
+        strict=strict,
+        n_shared=n_shared,
+        verbose=False
+    )

     # Use selected features or all features for model validation
+    x_train_selected = (
+        x_train[list(common_features)] if use_selected_features else x_train
+    )
+    x_test_selected = x_test[list(common_features)] if use_selected_features else x_test

     if metrics is None:
-        metrics = ["accuracy", "precision", "recall", "f1", "roc_auc"]
+        metrics = ["accuracy", "precision", "recall", "f1", "roc_auc"]

     # Prepare results DataFrame for selected features
-    features_df = pd.DataFrame(
+    features_df = pd.DataFrame(
+        {
+            "type": ["Lasso"] * len(lasso_selected_features)
+            + ["Ridge"] * len(selected_ridge_features)
+            + ["Random Forest"] * len(top_rf_features)
+            + ["Gradient Boosting"] * len(top_gb_features)
+            + ["Enet"] * len(selected_enet_features)
+            + ["xgb"] * len(top_xgb_features)
+            + ["SVM"] * len(selected_svm_features)
+            + ["Naive Bayes"] * len(selected_naive_bayes_features)
+            + ["Linear Discriminant Analysis"] * len(selected_lda_features)
+            + ["AdaBoost"] * len(top_adaboost_features)
+            + ["Decision Tree"] * len(top_dt_features)
+            + ["Bagging"] * len(top_bagging_features)
+            + ["KNN"] * len(top_knn_features),
+            "feature": np.concatenate(
+                [
+                    lasso_selected_features,
+                    selected_ridge_features,
+                    top_rf_features,
+                    top_gb_features,
+                    selected_enet_features,
+                    top_xgb_features,
+                    selected_svm_features,
+                    selected_naive_bayes_features,
+                    selected_lda_features,
+                    top_adaboost_features,
+                    top_dt_features,
+                    top_bagging_features,
+                    top_knn_features,
+                ]
+            ),
+        }
+    )

     #! Validate trained each classifier
-    cv_train_results,cv_test_results = [],[]
-    for name, clf in
-        if not
-            cv_scores=validate_classifier(
+    models = get_models(random_state=random_state, cls=cls)
+    cv_train_results, cv_test_results = [], []
+    for name, clf in models.items():
+        if not x_train_selected.empty:
+            cv_scores = validate_classifier(
+                clf,
+                x_train_selected,
+                y_train,
+                x_test_selected,
+                y_test,
+                metrics=metrics,
+                cv_folds=cv_folds,
+            )

             cv_train_score_df = pd.DataFrame(cv_scores["cv_train_scores"], index=[name])
             cv_test_score_df = pd.DataFrame(cv_scores["cv_test_scores"], index=[name])
             cv_train_results.append(cv_train_score_df)
             cv_test_results.append(cv_test_score_df)
-    if all([cv_train_results,
-        cv_train_results_df =
+    if all([cv_train_results, cv_test_results]):
+        cv_train_results_df = (
+            pd.concat(cv_train_results)
+            .reset_index()
+            .rename(columns={"index": "Classifier"})
+        )
+        cv_test_results_df = (
+            pd.concat(cv_test_results)
+            .reset_index()
+            .rename(columns={"index": "Classifier"})
+        )
         #! Store results in the main results dictionary
         results = {
             "selected_features": features_df,
             "cv_train_scores": cv_train_results_df,
-            "cv_test_scores": cv_test_results_df,
+            "cv_test_scores": rank_models(cv_test_results_df,plot_=plot_),
             "common_features": list(common_features),
         }
+        if all([plot_,dir_save]):
+            from datetime import datetime
+            now_ = datetime.now().strftime("%y%m%d_%H%M%S")
+            ips.figsave(dir_save+f"features{now_}.pdf")
     else:
         results = {
             "selected_features": pd.DataFrame(),
@@ -611,71 +915,75 @@ def get_features(
         }
         print(f"Warning: 没有找到共同的genes, when n_shared={n_shared}")
     return results
+
+
 #! # usage:
 # # Get features and common features
 # results = get_features(X, y)
 # common_features = results["common_features"]
 def validate_features(
+    x_train: pd.DataFrame,
     y_train: pd.Series,
+    x_true: pd.DataFrame,
     y_true: pd.Series,
-    common_features:set=None,
+    common_features: set = None,
+    models: Optional[Dict[str, Any]] = None,
     metrics: Optional[list] = None,
     random_state: int = 1,
     smote: bool = False,
+    n_jobs:int = -1,
     plot_: bool = True,
     class_weight: str = "balanced",
 ) -> dict:
     """
-    Validate
+    Validate models using selected features on the validation dataset.

     Parameters:
+    - x_train (pd.DataFrame): Training feature dataset.
     - y_train (pd.Series): Training target variable.
+    - x_true (pd.DataFrame): Validation feature dataset.
     - y_true (pd.Series): Validation target variable.
     - common_features (set): Set of common features to use for validation.
+    - models (dict, optional): Dictionary of models to validate.
     - metrics (list, optional): List of metrics to compute.
     - random_state (int): Random state for reproducibility.
     - plot_ (bool): Option to plot metrics (to be implemented if needed).
     - class_weight (str or dict): Class weights to handle imbalance.

     """
+    from tqdm import tqdm
     # Ensure common features are selected
-    common_features = ips.shared(common_features,
-                                 X_train.columns,
-                                 X_true.columns,
-                                 strict=True)
+    common_features = ips.shared(common_features, x_train.columns, x_true.columns, strict=True,verbose=False)

     # Filter the training and validation datasets for the common features
+    x_train_selected = x_train[common_features]
+    x_true_selected = x_true[common_features]

-    if not
-        raise ValueError(
+    if not x_true_selected.index.equals(y_true.index):
+        raise ValueError(
+            "Index mismatch between validation features and target. Ensure data alignment."
+        )
+
+    y_true = y_true.loc[x_true_selected.index]

     # Handle class imbalance using SMOTE
     if smote:
-        if
+        if (
+            y_train.value_counts(normalize=True).max() < 0.8
+        ): # Threshold to decide if data is imbalanced
             smote = SMOTE(random_state=random_state)
+            x_train_resampled, y_train_resampled = smote.fit_resample(
+                x_train_selected, y_train
             )
         else:
             # skip SMOTE
+            x_train_resampled, y_train_resampled = x_train_selected, y_train
     else:
+        x_train_resampled, y_train_resampled = x_train_selected, y_train

-    # Default
-    if
+    # Default models if not provided
+    if models is None:
+        models = {
             "Random Forest": RandomForestClassifier(
                 class_weight=class_weight, random_state=random_state
             ),
@@ -684,86 +992,107 @@ def validate_features(
                 class_weight=class_weight, random_state=random_state
             ),
             "Gradient Boosting": GradientBoostingClassifier(random_state=random_state),
-            "AdaBoost": AdaBoostClassifier(
+            "AdaBoost": AdaBoostClassifier(
+                random_state=random_state, algorithm="SAMME"
+            ),
+            "Lasso": LogisticRegression(
+                penalty="l1", solver="saga", random_state=random_state
+            ),
+            "Ridge": LogisticRegression(
+                penalty="l2", solver="saga", random_state=random_state
+            ),
+            "Elastic Net": LogisticRegression(
+                penalty="elasticnet",
+                solver="saga",
+                l1_ratio=0.5,
+                random_state=random_state,
+            ),
+            "XGBoost": xgb.XGBClassifier(eval_metric="logloss"
+            ),
             "Naive Bayes": GaussianNB(),
-            "LDA": LinearDiscriminantAnalysis()
+            "LDA": LinearDiscriminantAnalysis(),
         }

-    # Hyperparameter grids for tuning
+    # Hyperparameter grids for tuning
     param_grids = {
         "Random Forest": {
+            "n_estimators": [100, 200, 300, 400, 500],
+            "max_depth": [None, 3, 5, 10, 20],
+            "min_samples_split": [2, 5, 10],
+            "min_samples_leaf": [1, 2, 4],
+            "class_weight": [None, "balanced"],
         },
         "SVM": {
+            "C": [0.01, 0.1, 1, 10, 100, 1000],
+            "gamma": [0.001, 0.01, 0.1, "scale", "auto"],
+            "kernel": ["linear", "rbf", "poly"],
         },
         "Logistic Regression": {
+            "C": [0.01, 0.1, 1, 10, 100],
+            "solver": ["liblinear", "saga", "newton-cg", "lbfgs"],
+            "penalty": ["l1", "l2"],
+            "max_iter": [100, 200, 300],
         },
         "Gradient Boosting": {
+            "n_estimators": [100, 200, 300, 400, 500],
+            "learning_rate": np.logspace(-3, 0, 4),
+            "max_depth": [3, 5, 7, 9],
+            "min_samples_split": [2, 5, 10],
         },
         "AdaBoost": {
-        },
-        "Lasso": {
-            'C': np.logspace(-3, 1, 10),
-            'max_iter': [100, 200, 300]
-        },
-        "Ridge": {
-            'C': np.logspace(-3, 1, 10),
-            'max_iter': [100, 200, 300]
+            "n_estimators": [50, 100, 200, 300, 500],
+            "learning_rate": np.logspace(-3, 0, 4),
         },
+        "Lasso": {"C": np.logspace(-3, 1, 10), "max_iter": [100, 200, 300]},
+        "Ridge": {"C": np.logspace(-3, 1, 10), "max_iter": [100, 200, 300]},
         "Elastic Net": {
+            "C": np.logspace(-3, 1, 10),
+            "l1_ratio": [0.1, 0.5, 0.9],
+            "max_iter": [100, 200, 300],
         },
         "XGBoost": {
+            "n_estimators": [100, 200],
+            "max_depth": [3, 5, 7],
+            "learning_rate": [0.01, 0.1, 0.2],
+            "subsample": [0.8, 1.0],
+            "colsample_bytree": [0.8, 1.0],
         },
         "Naive Bayes": {},
-        "LDA": {
-            'solver': ['svd', 'lsqr', 'eigen']
-        }
+        "LDA": {"solver": ["svd", "lsqr", "eigen"]},
     }
     # Default metrics if not provided
     if metrics is None:
-        metrics = [
+        metrics = [
+            "accuracy",
+            "precision",
+            "recall",
+            "f1",
+            "roc_auc",
+            "mcc",
+            "specificity",
+            "balanced_accuracy",
+            "pr_auc",
+        ]

     results = {}

     # Validate each classifier with GridSearchCV
-    for name, clf in
+    for name, clf in tqdm(
+        models.items(),
+        desc="for metric in metrics",
+        colour="green",
+        bar_format="{l_bar}{bar} {n_fmt}/{total_fmt}",
+    ):
         print(f"\nValidating {name} on the validation dataset:")

         # Check if `predict_proba` method exists; if not, use CalibratedClassifierCV
         # 没有predict_proba的分类器,使用 CalibratedClassifierCV 可以获得校准的概率估计。此外,为了使代码更灵活,我们可以在创建分类器
         # 时检查 predict_proba 方法是否存在,如果不存在且用户希望计算 roc_auc 或 pr_auc,则启用 CalibratedClassifierCV
         if not hasattr(clf, "predict_proba"):
-            print(
+            print(
+                f"Using CalibratedClassifierCV for {name} due to lack of probability estimates."
+            )
+            calibrated_clf = CalibratedClassifierCV(clf, method="sigmoid", cv="prefit")
         else:
             calibrated_clf = clf
         # Stratified K-Fold for cross-validation
@@ -771,28 +1100,30 @@ def validate_features(

         # Create GridSearchCV object
         gs = GridSearchCV(
-            estimator=
+            estimator=calibrated_clf,
             param_grid=param_grids[name],
             scoring="roc_auc", # Optimize for ROC AUC
             cv=skf, # Stratified K-Folds cross-validation
-            n_jobs
+            n_jobs=n_jobs,
             verbose=1,
         )

         # Fit the model using GridSearchCV
-        gs.fit(
+        gs.fit(x_train_resampled, y_train_resampled)
         # Best estimator from grid search
         best_clf = gs.best_estimator_
         # Make predictions on the validation set
-        y_pred = best_clf.predict(
+        y_pred = best_clf.predict(x_true_selected)
         # Calculate probabilities for ROC AUC if possible
         if hasattr(best_clf, "predict_proba"):
-            y_pred_proba = best_clf.predict_proba(
+            y_pred_proba = best_clf.predict_proba(x_true_selected)[:, 1]
         elif hasattr(best_clf, "decision_function"):
             # If predict_proba is not available, use decision_function (e.g., for SVM)
-            y_pred_proba = best_clf.decision_function(
+            y_pred_proba = best_clf.decision_function(x_true_selected)
             # Ensure y_pred_proba is within 0 and 1 bounds
-            y_pred_proba = (y_pred_proba - y_pred_proba.min()) / (
+            y_pred_proba = (y_pred_proba - y_pred_proba.min()) / (
+                y_pred_proba.max() - y_pred_proba.min()
+            )
         else:
             y_pred_proba = None # No probability output for certain models

@@ -802,11 +1133,15 @@ def validate_features(
            if metric == "accuracy":
                validation_scores[metric] = accuracy_score(y_true, y_pred)
            elif metric == "precision":
                validation_scores[metric] = precision_score(
                    y_true, y_pred, average="weighted"
                )
            elif metric == "recall":
                validation_scores[metric] = recall_score(
                    y_true, y_pred, average="weighted"
                )
            elif metric == "f1":
                validation_scores[metric] = f1_score(y_true, y_pred, average="weighted")
            elif metric == "roc_auc" and y_pred_proba is not None:
                validation_scores[metric] = roc_auc_score(y_true, y_pred_proba)
            elif metric == "mcc":
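# --- Editor's illustrative sketch (not part of the package diff): the metric branch above relies on
# scikit-learn's average="weighted" option; a tiny self-contained example of what that computes:
from sklearn.metrics import f1_score, precision_score, recall_score

y_t = [0, 1, 1, 0, 1]
y_p = [0, 1, 0, 0, 1]
print(precision_score(y_t, y_p, average="weighted"))  # per-class precision, weighted by class support
print(recall_score(y_t, y_p, average="weighted"))
print(f1_score(y_t, y_p, average="weighted"))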
@@ -816,32 +1151,35 @@ def validate_features(
                validation_scores[metric] = tn / (tn + fp)  # Specificity calculation
            elif metric == "balanced_accuracy":
                validation_scores[metric] = balanced_accuracy_score(y_true, y_pred)
            elif metric == "pr_auc" and y_pred_proba is not None:
                precision, recall, _ = precision_recall_curve(y_true, y_pred_proba)
                validation_scores[metric] = average_precision_score(
                    y_true, y_pred_proba
                )

        # Calculate ROC curve
        # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
        if y_pred_proba is not None:
            # fpr, tpr, roc_auc = dict(), dict(), dict()
            fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
            lower_ci, upper_ci = cal_auc_ci(y_true, y_pred_proba, verbose=False)
            roc_auc = auc(fpr, tpr)
            roc_info = {
                "fpr": fpr.tolist(),
                "tpr": tpr.tolist(),
                "auc": roc_auc,
                "ci95": (lower_ci, upper_ci),
            }
            # precision-recall curve
            precision_, recall_, _ = precision_recall_curve(y_true, y_pred_proba)
            avg_precision_ = average_precision_score(y_true, y_pred_proba)
            pr_info = {
                "precision": precision_,
                "recall": recall_,
                "avg_precision": avg_precision_,
            }
        else:
            roc_info, pr_info = None, None
        results[name] = {
            "best_params": gs.best_params_,
            "scores": validation_scores,
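# --- Editor's illustrative sketch (not part of the package diff): the roc_info / pr_info summaries
# assembled above, reproduced independently on synthetic scores with scikit-learn and NumPy only:
import numpy as np
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score

rng_demo = np.random.RandomState(0)
y_true_demo = rng_demo.randint(0, 2, 100)
y_score_demo = np.clip(y_true_demo * 0.6 + rng_demo.rand(100) * 0.5, 0, 1)
fpr_d, tpr_d, _ = roc_curve(y_true_demo, y_score_demo)
prec_d, rec_d, _ = precision_recall_curve(y_true_demo, y_score_demo)
roc_info_demo = {"fpr": fpr_d.tolist(), "tpr": tpr_d.tolist(), "auc": auc(fpr_d, tpr_d)}
pr_info_demo = {"precision": prec_d, "recall": rec_d,
                "avg_precision": average_precision_score(y_true_demo, y_score_demo)}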
@@ -849,24 +1187,93 @@ def validate_features(
            "pr_curve": pr_info,
            "confusion_matrix": confusion_matrix(y_true, y_pred),
        }

    df_results = pd.DataFrame.from_dict(results, orient="index")

    return df_results


#! usage validate_features()
# Validate models using the validation dataset (X_val, y_val)
# validation_results = validate_features(X, y, X_val, y_val, common_features)

# # If you want to access validation scores
# print(validation_results)
def plot_validate_features(res_val):
    """
    plot the results of 'validate_features()'
    """
    colors = plot.get_color(len(ips.flatten(res_val["pr_curve"].index)))
    if res_val.shape[0] > 5:
        alpha = 0
        figsize = [8, 10]
        subplot_layout = [1, 2]
        ncols = 2
        bbox_to_anchor = [1.5, 0.6]
    else:
        alpha = 0.03
        figsize = [10, 6]
        subplot_layout = [1, 1]
        ncols = 1
        bbox_to_anchor = [1, 1]
    nexttile = plot.subplot(figsize=figsize)
    ax = nexttile(subplot_layout[0], subplot_layout[1])
    for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
        fpr = res_val["roc_curve"][model_name]["fpr"]
        tpr = res_val["roc_curve"][model_name]["tpr"]
        (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
        mean_auc = res_val["roc_curve"][model_name]["auc"]
        plot_roc_curve(
            fpr, tpr, mean_auc, lower_ci, upper_ci, model_name=model_name,
            lw=1.5, color=colors[i], alpha=alpha, ax=ax)
    plot.figsets(sp=2, legend=dict(loc="upper right", ncols=ncols, fontsize=8, bbox_to_anchor=[1.5, 0.6], markerscale=0.8))
    # plot.split_legend(ax,n=2, loc=["upper left", "lower left"],bbox=[[1,0.5],[1,0.5]],ncols=2,labelcolor="k",fontsize=8)

    ax = nexttile(subplot_layout[0], subplot_layout[1])
    for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
        plot_pr_curve(
            recall=res_val["pr_curve"][model_name]["recall"],
            precision=res_val["pr_curve"][model_name]["precision"],
            avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
            model_name=model_name,
            color=colors[i], lw=1.5, alpha=alpha, ax=ax)
    plot.figsets(sp=2, legend=dict(loc="upper right", ncols=1, fontsize=8, bbox_to_anchor=[1.5, 0.5]))
    # plot.split_legend(ax,n=2, loc=["upper left", "lower left"],bbox=[[1,0.5],[1,0.5]],ncols=2,labelcolor="k",fontsize=8)


def plot_validate_features_single(res_val, figsize=None):
    if figsize is None:
        nexttile = plot.subplot(len(ips.flatten(res_val["pr_curve"].index)), 3)
    else:
        nexttile = plot.subplot(len(ips.flatten(res_val["pr_curve"].index)), 3, figsize=figsize)
    for model_name in ips.flatten(res_val["pr_curve"].index):
        fpr = res_val["roc_curve"][model_name]["fpr"]
        tpr = res_val["roc_curve"][model_name]["tpr"]
        (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
        mean_auc = res_val["roc_curve"][model_name]["auc"]

        # Plotting
        plot_roc_curve(fpr, tpr, mean_auc, lower_ci, upper_ci, ax=nexttile())
        plot.figsets(title=model_name, sp=2)

        plot_pr_curve(
            recall=res_val["pr_curve"][model_name]["recall"],
            precision=res_val["pr_curve"][model_name]["precision"],
            avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
            model_name=model_name,
            ax=nexttile(),
        )
        plot.figsets(title=model_name, sp=2)

        # plot cm
        plot_cm(res_val["confusion_matrix"][model_name], ax=nexttile(), normalize=False)
        plot.figsets(title=model_name, sp=2)

def cal_auc_ci(y_true, y_pred, n_bootstraps=1000, ci=0.95, random_state=1, verbose=True):
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    bootstrapped_scores = []
    if verbose:
        print("auroc score:", roc_auc_score(y_true, y_pred))
    rng = np.random.RandomState(random_state)
    for i in range(n_bootstraps):
        # bootstrap by sampling with replacement on the prediction indices
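# --- Editor's illustrative sketch (not part of the package diff): cal_auc_ci (started above and
# continued in the next hunk) computes a percentile-bootstrap confidence interval for the AUC.
# A standalone version of the same idea, assuming scikit-learn and NumPy only:
import numpy as np
from sklearn.metrics import roc_auc_score

def bootstrap_auc_ci(y_true, y_score, n_bootstraps=1000, ci=0.95, seed=1):
    rng = np.random.RandomState(seed)
    y_true, y_score = np.asarray(y_true), np.asarray(y_score)
    scores = []
    for _ in range(n_bootstraps):
        idx = rng.randint(0, len(y_score), len(y_score))  # resample with replacement
        if len(np.unique(y_true[idx])) < 2:
            continue  # AUC is undefined when a resample contains a single class
        scores.append(roc_auc_score(y_true[idx], y_score[idx]))
    scores = np.sort(scores)
    return scores[int((1 - ci) * len(scores))], scores[int(ci * len(scores))]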
@@ -887,21 +1294,24 @@ def cal_auc_ci(y_true, y_pred, n_bootstraps=1000, ci=0.95,random_state=1):
    # Computing the lower and upper bound of the 90% confidence interval
    # You can change the bounds percentiles to 0.025 and 0.975 to get
    # a 95% confidence interval instead.
    confidence_lower = sorted_scores[int((1 - ci) * len(sorted_scores))]
    confidence_upper = sorted_scores[int(ci * len(sorted_scores))]
    if verbose:
        print(
            "Confidence interval for the score: [{:0.3f} - {:0.3}]".format(
                confidence_lower, confidence_upper
            )
        )
    return confidence_lower, confidence_upper


def plot_roc_curve(
    fpr=None,
    tpr=None,
    mean_auc=None,
    lower_ci=None,
    upper_ci=None,
    model_name=None,
    color="#FF8F00",
    lw=2,
    alpha=0.1,
@@ -913,24 +1323,23 @@ def plot_roc_curve(
    diagonal_color="0.5",
    figsize=(5, 5),
    ax=None,
    **kwargs,
):
    if ax is None:
        fig, ax = plt.subplots(figsize=figsize)
    if mean_auc is not None:
        model_name = "ROC curve" if model_name is None else model_name
        if ci_display:
            label = f"{model_name} (AUC = {mean_auc:.3f})\n95% CI: {lower_ci:.3f} - {upper_ci:.3f}"
        else:
            label = f"{model_name} (AUC = {mean_auc:.3f})"
    else:
        label = None

    # Plot ROC curve and the diagonal reference line
    ax.fill_between(fpr, tpr, alpha=alpha, color=color)
    ax.plot([0, 1], [0, 1], color=diagonal_color, clip_on=False, linestyle="--")
    ax.plot(fpr, tpr, color=color, lw=lw, label=label, clip_on=False, **kwargs)
    # Setting plot limits, labels, and title
    ax.set_xlim([-0.01, 1.0])
    ax.set_ylim([0.0, 1.0])
@@ -939,7 +1348,9 @@ def plot_roc_curve(
    ax.set_title(title)
    ax.legend(loc=legend_loc)
    return ax


# * usage: ml2ls.plot_roc_curve(fpr, tpr, mean_auc, lower_ci, upper_ci)
# for model_name in flatten(validation_results["roc_curve"].keys())[2:]:
#     fpr = validation_results["roc_curve"][model_name]["fpr"]
#     tpr = validation_results["roc_curve"][model_name]["tpr"]
@@ -950,6 +1361,7 @@ def plot_roc_curve(
#     ml2ls.plot_roc_curve(fpr, tpr, mean_auc, lower_ci, upper_ci)
#     figsets(title=model_name)


def plot_pr_curve(
    recall=None,
    precision=None,
@@ -961,21 +1373,24 @@ def plot_pr_curve(
    xlabel="Recall",
    ylabel="Precision",
    alpha=0.1,
    color="#FF8F00",
    legend_loc="lower left",
    ax=None,
    **kwargs,
):
    if ax is None:
        fig, ax = plt.subplots(figsize=figsize)
    model_name = "PR curve" if model_name is None else model_name
    # Plot Precision-Recall curve
    ax.plot(
        recall,
        precision,
        lw=lw,
        color=color,
        label=(f"{model_name} (AUC={avg_precision:.2f})"),
        clip_on=False,
        **kwargs,
    )
    # Fill area under the curve
    ax.fill_between(recall, precision, alpha=alpha, color=color)

@@ -985,10 +1400,12 @@ def plot_pr_curve(
    ax.set_ylabel(ylabel)
    ax.set_xlim([-0.01, 1.0])
    ax.set_ylim([0.0, 1.0])
    ax.grid(False)
    ax.legend(loc=legend_loc)
    return ax


# * usage: ml2ls.plot_pr_curve()
# for md_name in flatten(validation_results["pr_curve"].keys()):
#     ml2ls.plot_pr_curve(
#         recall=validation_results["pr_curve"][md_name]["recall"],
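# --- Editor's illustrative sketch (not part of the package diff): the drawing that plot_roc_curve /
# plot_pr_curve perform, reduced to plain matplotlib on synthetic scores so it runs without py2ls:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

rng_plot = np.random.RandomState(1)
y_demo = rng_plot.randint(0, 2, 200)
score_demo = np.clip(y_demo * 0.5 + rng_plot.rand(200) * 0.6, 0, 1)
fpr_p, tpr_p, _ = roc_curve(y_demo, score_demo)
fig, ax = plt.subplots(figsize=(5, 5))
ax.plot([0, 1], [0, 1], color="0.5", linestyle="--")  # chance diagonal
ax.plot(fpr_p, tpr_p, lw=2, label=f"ROC curve (AUC = {auc(fpr_p, tpr_p):.3f})")
ax.fill_between(fpr_p, tpr_p, alpha=0.1)
ax.legend(loc="lower right")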
@@ -1000,6 +1417,7 @@ def plot_pr_curve(
#         color="r",
#     )


def plot_cm(
    cm,
    labels_name=None,
@@ -1016,7 +1434,9 @@ def plot_cm(
    if ax is None:
        fig, ax = plt.subplots(figsize=figsize)

    cm_normalized = np.round(
        cm.astype("float") / cm.sum(axis=1)[:, np.newaxis] * 100, 2
    )
    cm_value = cm_normalized if normalize else cm.astype("int")
    # Plot the heatmap
    cax = ax.imshow(cm_normalized, interpolation="nearest", cmap=cmap)
@@ -1026,14 +1446,13 @@ def plot_cm(
    # Define tick labels based on provided labels
    num_local = np.arange(len(labels_name)) if labels_name is not None else range(2)
    if axis_labels is None:
        axis_labels = labels_name if labels_name is not None else ["No", "Yes"]
    ax.set_xticks(num_local)
    ax.set_xticklabels(axis_labels)
    ax.set_yticks(num_local)
    ax.set_yticklabels(axis_labels)
    ax.set_ylabel(ylabel)
    ax.set_xlabel(xlabel)

    # Add TN, FP, FN, TP annotations specifically for binary classification (2x2 matrix)
    if labels_name is None or len(labels_name) == 2:
@@ -1050,29 +1469,53 @@ def plot_cm(
        tp_label = "TP"

        # Adjust positions slightly for TN, FP, FN, TP labels
        ax.text(
            0,
            0,
            (
                f"{tn_label}:{cm_normalized[0, 0]:.2f}%"
                if normalize
                else f"{tn_label}:{cm_value[0, 0]}"
            ),
            ha="center",
            va="center",
            color="white" if cm_normalized[0, 0] > thresh * 100 else "black",
            fontsize=fontsize,
        )
        ax.text(
            1,
            0,
            (
                f"{fp_label}:{cm_normalized[0, 1]:.2f}%"
                if normalize
                else f"{fp_label}:{cm_value[0, 1]}"
            ),
            ha="center",
            va="center",
            color="white" if cm_normalized[0, 1] > thresh * 100 else "black",
            fontsize=fontsize,
        )
        ax.text(
            0,
            1,
            (
                f"{fn_label}:{cm_normalized[1, 0]:.2f}%"
                if normalize
                else f"{fn_label}:{cm_value[1, 0]}"
            ),
            ha="center",
            va="center",
            color="white" if cm_normalized[1, 0] > thresh * 100 else "black",
            fontsize=fontsize,
        )
        ax.text(
            1,
            1,
            (
                f"{tp_label}:{cm_normalized[1, 1]:.2f}%"
                if normalize
                else f"{tp_label}:{cm_value[1, 1]}"
            ),
            ha="center",
            va="center",
            color="white" if cm_normalized[1, 1] > thresh * 100 else "black",
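# --- Editor's illustrative sketch (not part of the package diff): the row-percentage normalization
# that plot_cm applies before writing the TN/FP/FN/TP annotations, shown on a toy confusion matrix:
import numpy as np
from sklearn.metrics import confusion_matrix

cm_demo = confusion_matrix([0, 0, 1, 1, 1, 0], [0, 1, 1, 1, 0, 0])
cm_pct = np.round(cm_demo.astype(float) / cm_demo.sum(axis=1)[:, np.newaxis] * 100, 2)
print(cm_demo)  # raw counts
print(cm_pct)   # each row expressed as percentages of that row's total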
@@ -1084,11 +1527,1054 @@ def plot_cm(
|
|
1084
1527
|
for j in range(len(labels_name)):
|
1085
1528
|
val = cm_normalized[i, j]
|
1086
1529
|
color = "white" if val > thresh * 100 else "black"
|
1087
|
-
ax.text(
|
1530
|
+
ax.text(
|
1531
|
+
j,
|
1532
|
+
i,
|
1088
1533
|
f"{val:.2f}%",
|
1089
1534
|
ha="center",
|
1090
1535
|
va="center",
|
1091
1536
|
color=color,
|
1092
1537
|
fontsize=fontsize,
|
1093
1538
|
)
|
1539
|
+
|
1540
|
+
plot.figsets(ax=ax,
|
1541
|
+
boxloc="none"
|
1542
|
+
)
|
1094
1543
|
return ax
|
1544
|
+
|
1545
|
+
def rank_models(
|
1546
|
+
cv_test_scores,
|
1547
|
+
rm_outlier=False,
|
1548
|
+
metric_weights=None,
|
1549
|
+
plot_=True,
|
1550
|
+
):
|
1551
|
+
"""
|
1552
|
+
Selects the best model based on a multi-metric scoring approach, with outlier handling, optional visualization,
|
1553
|
+
and additional performance metrics.
|
1554
|
+
|
1555
|
+
Parameters:
|
1556
|
+
- cv_test_scores (pd.DataFrame): DataFrame with cross-validation results across multiple metrics.
|
1557
|
+
Assumes columns are 'Classifier', 'accuracy', 'precision', 'recall', 'f1', 'roc_auc'.
|
1558
|
+
- metric_weights (dict): Dictionary specifying weights for each metric (e.g., {'accuracy': 0.2, 'precision': 0.3, ...}).
|
1559
|
+
If None, default weights are applied equally across available metrics.
|
1560
|
+
a. equal_weights(standard approch): 所有的metrics同等重要
|
1561
|
+
e.g., {"accuracy": 0.2, "precision": 0.2, "recall": 0.2, "f1": 0.2, "roc_auc": 0.2}
|
1562
|
+
b. accuracy_focosed: classification correctness (e.g., in balanced datasets), accuracy might be weighted more heavily.
|
1563
|
+
e.g., {"accuracy": 0.4, "precision": 0.2, "recall": 0.2, "f1": 0.1, "roc_auc": 0.1}
|
1564
|
+
c. Precision and Recall Emphasis: In cases where false positives and false negatives are particularly important (such as
|
1565
|
+
in medical applications or fraud detection), precision and recall may be weighted more heavily.
|
1566
|
+
e.g., {"accuracy": 0.2, "precision": 0.3, "recall": 0.3, "f1": 0.1, "roc_auc": 0.1}
|
1567
|
+
d. F1-Focused: When balance between precision and recall is crucial (e.g., in imbalanced datasets)
|
1568
|
+
e.g., {"accuracy": 0.2, "precision": 0.2, "recall": 0.2, "f1": 0.3, "roc_auc": 0.1}
|
1569
|
+
e. ROC-AUC Emphasis: In some cases, ROC AUC may be prioritized, particularly in classification tasks where class imbalance
|
1570
|
+
is present, as ROC AUC accounts for the model's performance across all classification thresholds.
|
1571
|
+
e.g., {"accuracy": 0.1, "precision": 0.2, "recall": 0.2, "f1": 0.3, "roc_auc": 0.3}
|
1572
|
+
|
1573
|
+
- normalize (bool): Whether to normalize scores of each metric to range [0, 1].
|
1574
|
+
- visualize (bool): If True, generates visualizations (e.g., bar plot, radar chart).
|
1575
|
+
- outlier_threshold (float): The threshold to detect outliers using the IQR method. Default is 1.5.
|
1576
|
+
- cv_folds (int): The number of cross-validation folds used.
|
1577
|
+
|
1578
|
+
Returns:
|
1579
|
+
- best_model (str): Name of the best model based on the combined metric scores.
|
1580
|
+
- scored_df (pd.DataFrame): DataFrame with an added 'combined_score' column used for model selection.
|
1581
|
+
- visualizations (dict): A dictionary containing visualizations if `visualize=True`.
|
1582
|
+
"""
|
1583
|
+
from sklearn.preprocessing import MinMaxScaler
|
1584
|
+
import seaborn as sns
|
1585
|
+
import matplotlib.pyplot as plt
|
1586
|
+
from py2ls import plot
|
1587
|
+
|
1588
|
+
# Check for missing metrics and set default weights if not provided
|
1589
|
+
available_metrics = cv_test_scores.columns[1:] # Exclude 'Classifier' column
|
1590
|
+
if metric_weights is None:
|
1591
|
+
metric_weights = {
|
1592
|
+
metric: 1 / len(available_metrics) for metric in available_metrics
|
1593
|
+
} # Equal weight if not specified
|
1594
|
+
elif metric_weights == "a":
|
1595
|
+
metric_weights = {
|
1596
|
+
"accuracy": 0.2,
|
1597
|
+
"precision": 0.2,
|
1598
|
+
"recall": 0.2,
|
1599
|
+
"f1": 0.2,
|
1600
|
+
"roc_auc": 0.2,
|
1601
|
+
}
|
1602
|
+
elif metric_weights == "b":
|
1603
|
+
metric_weights = {
|
1604
|
+
"accuracy": 0.4,
|
1605
|
+
"precision": 0.2,
|
1606
|
+
"recall": 0.2,
|
1607
|
+
"f1": 0.1,
|
1608
|
+
"roc_auc": 0.1,
|
1609
|
+
}
|
1610
|
+
elif metric_weights == "c":
|
1611
|
+
metric_weights = {
|
1612
|
+
"accuracy": 0.2,
|
1613
|
+
"precision": 0.3,
|
1614
|
+
"recall": 0.3,
|
1615
|
+
"f1": 0.1,
|
1616
|
+
"roc_auc": 0.1,
|
1617
|
+
}
|
1618
|
+
elif metric_weights == "d":
|
1619
|
+
metric_weights = {
|
1620
|
+
"accuracy": 0.2,
|
1621
|
+
"precision": 0.2,
|
1622
|
+
"recall": 0.2,
|
1623
|
+
"f1": 0.3,
|
1624
|
+
"roc_auc": 0.1,
|
1625
|
+
}
|
1626
|
+
elif metric_weights == "e":
|
1627
|
+
metric_weights = {
|
1628
|
+
"accuracy": 0.1,
|
1629
|
+
"precision": 0.2,
|
1630
|
+
"recall": 0.2,
|
1631
|
+
"f1": 0.3,
|
1632
|
+
"roc_auc": 0.3,
|
1633
|
+
}
|
1634
|
+
else:
|
1635
|
+
metric_weights = {
|
1636
|
+
metric: 1 / len(available_metrics) for metric in available_metrics
|
1637
|
+
}
|
1638
|
+
|
1639
|
+
# Normalize weights if they don’t sum to 1
|
1640
|
+
total_weight = sum(metric_weights.values())
|
1641
|
+
metric_weights = {
|
1642
|
+
metric: weight / total_weight for metric, weight in metric_weights.items()
|
1643
|
+
}
|
1644
|
+
if rm_outlier:
|
1645
|
+
cv_test_scores_ = ips.df_outlier(cv_test_scores)
|
1646
|
+
else:
|
1647
|
+
cv_test_scores_=cv_test_scores
|
1648
|
+
|
1649
|
+
# Normalize the scores of metrics if normalize is True
|
1650
|
+
scaler = MinMaxScaler()
|
1651
|
+
normalized_scores = pd.DataFrame(
|
1652
|
+
scaler.fit_transform(cv_test_scores_[available_metrics]),
|
1653
|
+
columns=available_metrics,
|
1654
|
+
)
|
1655
|
+
cv_test_scores_ = pd.concat(
|
1656
|
+
[cv_test_scores_[["Classifier"]], normalized_scores], axis=1
|
1657
|
+
)
|
1658
|
+
|
1659
|
+
# Calculate weighted scores for each model
|
1660
|
+
cv_test_scores_["combined_score"] = sum(
|
1661
|
+
cv_test_scores_[metric] * weight for metric, weight in metric_weights.items()
|
1662
|
+
)
|
1663
|
+
top_models = cv_test_scores_.sort_values(by="combined_score", ascending=False)
|
1664
|
+
cv_test_scores = cv_test_scores.loc[top_models.index]
|
1665
|
+
top_models.reset_index(drop=True, inplace=True)
|
1666
|
+
cv_test_scores.reset_index(drop=True, inplace=True)
|
1667
|
+
|
1668
|
+
if plot_:
|
1669
|
+
|
1670
|
+
def generate_bar_plot(ax, cv_test_scores):
|
1671
|
+
ax = plot.plotxy(
|
1672
|
+
y="Classifier", x="combined_score", data=cv_test_scores, kind="bar"
|
1673
|
+
)
|
1674
|
+
plt.title("Classifier Performance")
|
1675
|
+
plt.tight_layout()
|
1676
|
+
return plt
|
1677
|
+
|
1678
|
+
nexttile = plot.subplot(2, 2, figsize=[10, 7])
|
1679
|
+
generate_bar_plot(nexttile(), top_models.dropna())
|
1680
|
+
plot.radar(
|
1681
|
+
ax=nexttile(projection="polar"),
|
1682
|
+
data=cv_test_scores.set_index("Classifier"),
|
1683
|
+
ylim=[0.5, 1],
|
1684
|
+
color=plot.get_color(10),
|
1685
|
+
alpha=0.05,
|
1686
|
+
circular=1,
|
1687
|
+
)
|
1688
|
+
return cv_test_scores
|
1689
|
+
|
1690
|
+
|
1691
|
+
# # Example Usage:
|
1692
|
+
# metric_weights = {
|
1693
|
+
# "accuracy": 0.2,
|
1694
|
+
# "precision": 0.3,
|
1695
|
+
# "recall": 0.2,
|
1696
|
+
# "f1": 0.2,
|
1697
|
+
# "roc_auc": 0.1,
|
1698
|
+
# }
|
1699
|
+
# cv_test_scores = res["cv_test_scores"].copy()
|
1700
|
+
# best_model = rank_models(
|
1701
|
+
# cv_test_scores, metric_weights=metric_weights, normalize=True, plot_=True
|
1702
|
+
# )
|
1703
|
+
|
1704
|
+
# figsave("classifier_performance.pdf")
|
1705
|
+
|
1706
|
+
def predict(
|
1707
|
+
x_train: pd.DataFrame,
|
1708
|
+
y_train: pd.Series,
|
1709
|
+
x_true: pd.DataFrame=None,
|
1710
|
+
y_true: Optional[pd.Series] = None,
|
1711
|
+
common_features: set = None,
|
1712
|
+
purpose: str = "classification", # 'classification' or 'regression'
|
1713
|
+
cls: Optional[Dict[str, Any]] = None,
|
1714
|
+
metrics: Optional[List[str]] = None,
|
1715
|
+
random_state: int = 1,
|
1716
|
+
smote: bool = False,
|
1717
|
+
n_jobs:int = -1,
|
1718
|
+
plot_: bool = True,
|
1719
|
+
dir_save:str="./",
|
1720
|
+
test_size:float=0.2,# specific only when x_true is None
|
1721
|
+
cv_folds:int=5,# more cv_folds 得更加稳定,auc可能更低
|
1722
|
+
cv_level:str="l",#"s":'low',"m":'medium',"l":"high"
|
1723
|
+
class_weight: str = "balanced",
|
1724
|
+
verbose:bool=False,
|
1725
|
+
) -> pd.DataFrame:
|
1726
|
+
"""
|
1727
|
+
第一种情况是内部拆分,第二种是直接预测,第三种是外部验证。
|
1728
|
+
Usage:
|
1729
|
+
(1). predict(x_train, y_train,...) 对 x_train 进行拆分训练/测试集,并在测试集上进行验证.
|
1730
|
+
predict 函数会根据 test_size 参数,将 x_train 和 y_train 拆分出内部测试集。然后模型会在拆分出的训练集上进行训练,并在测试集上验证效果。
|
1731
|
+
(2). predict(x_train, y_train, x_true,...)使用 x_train 和 y_train 训练并对 x_true 进行预测
|
1732
|
+
由于传入了 x_true,函数会跳过 x_train 的拆分,直接使用全部的 x_train 和 y_train 进行训练。然后对 x_true 进行预测,但由于没有提供 y_true,
|
1733
|
+
因此无法与真实值进行对比。
|
1734
|
+
(3). predict(x_train, y_train, x_true, y_true,...)使用 x_train 和 y_train 训练,并验证 x_true 与真实标签 y_true.
|
1735
|
+
predict 函数会在 x_train 和 y_train 上进行训练,并将 x_true 作为测试集。由于提供了 y_true,函数可以将预测结果与 y_true 进行对比,从而
|
1736
|
+
计算验证指标,完成对 x_true 的真正验证。
|
1737
|
+
trains and validates a variety of machine learning models for both classification and regression tasks.
|
1738
|
+
It supports hyperparameter tuning with grid search and includes additional features like cross-validation,
|
1739
|
+
feature scaling, and handling of class imbalance through SMOTE.
|
1740
|
+
|
1741
|
+
Parameters:
|
1742
|
+
- x_train (pd.DataFrame):Training feature data, structured with each row as an observation and each column as a feature.
|
1743
|
+
- y_train (pd.Series):Target variable for the training dataset.
|
1744
|
+
- x_true (pd.DataFrame, optional):Test feature data. If not provided, the function splits x_train based on test_size.
|
1745
|
+
- y_true (pd.Series, optional):Test target values. If not provided, y_train is split into training and testing sets.
|
1746
|
+
- common_features (set, optional):Specifies a subset of features common across training and test data.
|
1747
|
+
- purpose (str, default = "classification"):Defines whether the task is "classification" or "regression". Determines which
|
1748
|
+
metrics and models are applied.
|
1749
|
+
- cls (dict, optional):Dictionary to specify custom classifiers/regressors. Defaults to a set of common models if not provided.
|
1750
|
+
- metrics (list, optional):List of evaluation metrics (like accuracy, F1 score) used for model evaluation.
|
1751
|
+
- random_state (int, default = 1):Random seed to ensure reproducibility.
|
1752
|
+
- smote (bool, default = False):Applies Synthetic Minority Oversampling Technique (SMOTE) to address class imbalance if enabled.
|
1753
|
+
- n_jobs (int, default = -1):Number of parallel jobs for computation. Set to -1 to use all available cores.
|
1754
|
+
- plot_ (bool, default = True):If True, generates plots of the model evaluation metrics.
|
1755
|
+
- test_size (float, default = 0.2):Test data proportion if x_true is not provided.
|
1756
|
+
- cv_folds (int, default = 5):Number of cross-validation folds.
|
1757
|
+
- cv_level (str, default = "l"):Sets the detail level of cross-validation. "s" for low, "m" for medium, and "l" for high.
|
1758
|
+
- class_weight (str, default = "balanced"):Balances class weights in classification tasks.
|
1759
|
+
- verbose (bool, default = False):If True, prints detailed output during model training.
|
1760
|
+
- dir_save (str, default = "./"):Directory path to save plot outputs and results.
|
1761
|
+
|
1762
|
+
Key Steps in the Function:
|
1763
|
+
Model Initialization: Depending on purpose, initializes either classification or regression models.
|
1764
|
+
Feature Selection: Ensures training and test sets have matching feature columns.
|
1765
|
+
SMOTE Application: Balances classes if smote is enabled and the task is classification.
|
1766
|
+
Cross-Validation and Hyperparameter Tuning: Utilizes GridSearchCV for model tuning based on cv_level.
|
1767
|
+
Evaluation and Plotting: Outputs evaluation metrics like AUC, confusion matrices, and optional plotting of performance metrics.
|
1768
|
+
"""
|
1769
|
+
from tqdm import tqdm
|
1770
|
+
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, ExtraTreesClassifier, ExtraTreesRegressor, BaggingClassifier, BaggingRegressor, AdaBoostClassifier, AdaBoostRegressor
|
1771
|
+
from sklearn.svm import SVC, SVR
|
1772
|
+
from sklearn.tree import DecisionTreeRegressor
|
1773
|
+
from sklearn.linear_model import LogisticRegression, ElasticNet, ElasticNetCV, LinearRegression, Lasso,RidgeClassifierCV, Perceptron, SGDClassifier
|
1774
|
+
from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor
|
1775
|
+
from sklearn.naive_bayes import GaussianNB,BernoulliNB
|
1776
|
+
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
|
1777
|
+
import xgboost as xgb
|
1778
|
+
import lightgbm as lgb
|
1779
|
+
import catboost as cb
|
1780
|
+
from sklearn.neural_network import MLPClassifier, MLPRegressor
|
1781
|
+
from sklearn.model_selection import GridSearchCV, StratifiedKFold, KFold
|
1782
|
+
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis,QuadraticDiscriminantAnalysis
|
1783
|
+
from sklearn.preprocessing import PolynomialFeatures
|
1784
|
+
|
1785
|
+
|
1786
|
+
# 拼写检查
|
1787
|
+
purpose=ips.strcmp(purpose,['classification','regression'])[0]
|
1788
|
+
print(f"{purpose} processing...")
|
1789
|
+
# Default models or regressors if not provided
|
1790
|
+
if purpose == "classification":
|
1791
|
+
model_ = {
|
1792
|
+
"Random Forest": RandomForestClassifier(random_state=random_state, class_weight=class_weight),
|
1793
|
+
|
1794
|
+
# SVC (Support Vector Classification)
|
1795
|
+
"SVM": SVC(kernel="rbf",probability=True,class_weight=class_weight,random_state=random_state),
|
1796
|
+
|
1797
|
+
# fit the best model without enforcing sparsity, which means it does not directly perform feature selection.
|
1798
|
+
"Logistic Regression": LogisticRegression(class_weight=class_weight, random_state=random_state),
|
1799
|
+
|
1800
|
+
# Logistic Regression with L1 Regularization (Lasso)
|
1801
|
+
"Lasso Logistic Regression": LogisticRegression(penalty="l1", solver="saga", random_state=random_state),
|
1802
|
+
"Gradient Boosting": GradientBoostingClassifier(random_state=random_state),
|
1803
|
+
"XGBoost": xgb.XGBClassifier(eval_metric="logloss",random_state=random_state,),
|
1804
|
+
"KNN": KNeighborsClassifier(n_neighbors=5),
|
1805
|
+
"Naive Bayes": GaussianNB(),
|
1806
|
+
"Linear Discriminant Analysis": LinearDiscriminantAnalysis(),
|
1807
|
+
"AdaBoost": AdaBoostClassifier(algorithm='SAMME', random_state=random_state),
|
1808
|
+
# "LightGBM": lgb.LGBMClassifier(random_state=random_state, class_weight=class_weight),
|
1809
|
+
"CatBoost": cb.CatBoostClassifier(verbose=0, random_state=random_state),
|
1810
|
+
"Extra Trees": ExtraTreesClassifier(random_state=random_state, class_weight=class_weight),
|
1811
|
+
"Bagging": BaggingClassifier(random_state=random_state),
|
1812
|
+
"Neural Network": MLPClassifier(max_iter=500, random_state=random_state),
|
1813
|
+
"DecisionTree": DecisionTreeClassifier(),
|
1814
|
+
"Quadratic Discriminant Analysis": QuadraticDiscriminantAnalysis(),
|
1815
|
+
"Ridge": RidgeClassifierCV(class_weight=class_weight, store_cv_results=True),
|
1816
|
+
"Perceptron": Perceptron(random_state=random_state),
|
1817
|
+
"Bernoulli Naive Bayes": BernoulliNB(),
|
1818
|
+
"SGDClassifier": SGDClassifier(random_state=random_state),
|
1819
|
+
}
|
1820
|
+
elif purpose == "regression":
|
1821
|
+
model_ = {
|
1822
|
+
"Random Forest": RandomForestRegressor(random_state=random_state),
|
1823
|
+
"SVM": SVR(),# SVR (Support Vector Regression)
|
1824
|
+
# "Lasso": Lasso(random_state=random_state), # 它和LassoCV相同(必须要提供alpha参数),
|
1825
|
+
"LassoCV": LassoCV(cv=cv_folds, random_state=random_state),#LassoCV自动找出最适alpha,优于Lasso
|
1826
|
+
"Gradient Boosting": GradientBoostingRegressor(random_state=random_state),
|
1827
|
+
"XGBoost": xgb.XGBRegressor(eval_metric="rmse",random_state=random_state),
|
1828
|
+
"Linear Regression": LinearRegression(),
|
1829
|
+
"Lasso": Lasso(random_state=random_state),
|
1830
|
+
"AdaBoost": AdaBoostRegressor(random_state=random_state),
|
1831
|
+
# "LightGBM": lgb.LGBMRegressor(random_state=random_state),
|
1832
|
+
"CatBoost": cb.CatBoostRegressor(verbose=0, random_state=random_state),
|
1833
|
+
"Extra Trees": ExtraTreesRegressor(random_state=random_state),
|
1834
|
+
"Bagging": BaggingRegressor(random_state=random_state),
|
1835
|
+
"Neural Network": MLPRegressor(max_iter=500, random_state=random_state),
|
1836
|
+
"ElasticNet": ElasticNet(random_state=random_state),
|
1837
|
+
"Ridge": Ridge(),
|
1838
|
+
"KNN":KNeighborsRegressor()
|
1839
|
+
}
|
1840
|
+
# indicate cls:
|
1841
|
+
if ips.run_once_within(30):# 10 min
|
1842
|
+
print(f"supported models: {list(model_.keys())}")
|
1843
|
+
if cls is None:
|
1844
|
+
models=model_
|
1845
|
+
else:
|
1846
|
+
if not isinstance(cls, list):
|
1847
|
+
cls=[cls]
|
1848
|
+
models={}
|
1849
|
+
for cls_ in cls:
|
1850
|
+
cls_ = ips.strcmp(cls_, list(model_.keys()))[0]
|
1851
|
+
models[cls_] = model_[cls_]
|
1852
|
+
if 'LightGBM' in models:
|
1853
|
+
x_train=ips.df_special_characters_cleaner(x_train)
|
1854
|
+
x_true=ips.df_special_characters_cleaner(x_true) if x_true is not None else None
|
1855
|
+
|
1856
|
+
if isinstance(y_train, str) and y_train in x_train.columns:
|
1857
|
+
y_train_col_name=y_train
|
1858
|
+
y_train=x_train[y_train]
|
1859
|
+
y_train=ips.df_encoder(pd.DataFrame(y_train),method='dummy')
|
1860
|
+
x_train = x_train.drop(y_train_col_name,axis=1)
|
1861
|
+
else:
|
1862
|
+
y_train=ips.df_encoder(pd.DataFrame(y_train),method='dummy').values.ravel()
|
1863
|
+
|
1864
|
+
if x_true is None:
|
1865
|
+
x_train, x_true, y_train, y_true = train_test_split(
|
1866
|
+
x_train,
|
1867
|
+
y_train,
|
1868
|
+
test_size=test_size,
|
1869
|
+
random_state=random_state,
|
1870
|
+
stratify=y_train if purpose == "classification" else None
|
1871
|
+
)
|
1872
|
+
if isinstance(y_train, str) and y_train in x_train.columns:
|
1873
|
+
y_train_col_name=y_train
|
1874
|
+
y_train=x_train[y_train]
|
1875
|
+
y_train=ips.df_encoder(pd.DataFrame(y_train),method='dummy')
|
1876
|
+
x_train = x_train.drop(y_train_col_name,axis=1)
|
1877
|
+
else:
|
1878
|
+
y_train=ips.df_encoder(pd.DataFrame(y_train),method='dummy').values.ravel()
|
1879
|
+
if y_true is not None:
|
1880
|
+
if isinstance(y_true, str) and y_true in x_true.columns:
|
1881
|
+
y_true_col_name=y_true
|
1882
|
+
y_true=x_true[y_true]
|
1883
|
+
y_true=ips.df_encoder(pd.DataFrame(y_true),method='dummy')
|
1884
|
+
x_true = x_true.drop(y_true_col_name,axis=1)
|
1885
|
+
else:
|
1886
|
+
y_true=ips.df_encoder(pd.DataFrame(y_true),method='dummy').values.ravel()
|
1887
|
+
|
1888
|
+
# to convert the 2D to 1D: 2D column-vector format (like [[1], [0], [1], ...]) instead of a 1D array ([1, 0, 1, ...]
|
1889
|
+
|
1890
|
+
# y_train=y_train.values.ravel() if y_train is not None else None
|
1891
|
+
# y_true=y_true.values.ravel() if y_true is not None else None
|
1892
|
+
y_train = y_train.ravel() if isinstance(y_train, np.ndarray) else y_train.values.ravel()
|
1893
|
+
y_true = y_true.ravel() if isinstance(y_true, np.ndarray) else y_true.values.ravel()
|
1894
|
+
|
1895
|
+
|
1896
|
+
# Ensure common features are selected
|
1897
|
+
if common_features is not None:
|
1898
|
+
x_train, x_true = x_train[common_features], x_true[common_features]
|
1899
|
+
else:
|
1900
|
+
share_col_names = ips.shared(x_train.columns, x_true.columns,verbose=verbose)
|
1901
|
+
x_train, x_true =x_train[share_col_names], x_true[share_col_names]
|
1902
|
+
|
1903
|
+
x_train, x_true = ips.df_scaler(x_train), ips.df_scaler(x_true)
|
1904
|
+
x_train, x_true = ips.df_encoder(x_train, method="dummy"), ips.df_encoder(
|
1905
|
+
x_true, method="dummy"
|
1906
|
+
)
|
1907
|
+
|
1908
|
+
# Handle class imbalance using SMOTE (only for classification)
|
1909
|
+
if (
|
1910
|
+
smote
|
1911
|
+
and purpose == "classification"
|
1912
|
+
and y_train.value_counts(normalize=True).max() < 0.8
|
1913
|
+
):
|
1914
|
+
from imblearn.over_sampling import SMOTE
|
1915
|
+
|
1916
|
+
smote_sampler = SMOTE(random_state=random_state)
|
1917
|
+
x_train, y_train = smote_sampler.fit_resample(x_train, y_train)
|
1918
|
+
|
1919
|
+
# Hyperparameter grids for tuning
|
1920
|
+
if cv_level in ["low",'simple','s','l']:
|
1921
|
+
param_grids = {
|
1922
|
+
"Random Forest": {
|
1923
|
+
"n_estimators": [100], # One basic option
|
1924
|
+
"max_depth": [None, 10],
|
1925
|
+
"min_samples_split": [2],
|
1926
|
+
"min_samples_leaf": [1],
|
1927
|
+
"class_weight": [None],
|
1928
|
+
} if purpose == "classification" else {
|
1929
|
+
"n_estimators": [100], # One basic option
|
1930
|
+
"max_depth": [None, 10],
|
1931
|
+
"min_samples_split": [2],
|
1932
|
+
"min_samples_leaf": [1],
|
1933
|
+
"max_features": [None],
|
1934
|
+
"bootstrap": [True], # Only one option for simplicity
|
1935
|
+
},
|
1936
|
+
"SVM": {
|
1937
|
+
"C": [1],
|
1938
|
+
"gamma": ['scale'],
|
1939
|
+
"kernel": ['rbf'],
|
1940
|
+
},
|
1941
|
+
"Lasso": {
|
1942
|
+
"alpha": [0.1],
|
1943
|
+
},
|
1944
|
+
"LassoCV": {
|
1945
|
+
"alphas": [[0.1]],
|
1946
|
+
},
|
1947
|
+
"Logistic Regression": {
|
1948
|
+
"C": [1],
|
1949
|
+
"solver": ['lbfgs'],
|
1950
|
+
"penalty": ['l2'],
|
1951
|
+
"max_iter": [500],
|
1952
|
+
},
|
1953
|
+
"Gradient Boosting": {
|
1954
|
+
"n_estimators": [100],
|
1955
|
+
"learning_rate": [0.1],
|
1956
|
+
"max_depth": [3],
|
1957
|
+
"min_samples_split": [2],
|
1958
|
+
"subsample": [0.8],
|
1959
|
+
},
|
1960
|
+
"XGBoost": {
|
1961
|
+
"n_estimators": [100],
|
1962
|
+
"max_depth": [3],
|
1963
|
+
"learning_rate": [0.1],
|
1964
|
+
"subsample": [0.8],
|
1965
|
+
"colsample_bytree": [0.8],
|
1966
|
+
},
|
1967
|
+
"KNN": {
|
1968
|
+
"n_neighbors": [3],
|
1969
|
+
"weights": ['uniform'],
|
1970
|
+
"algorithm": ['auto'],
|
1971
|
+
"p": [2],
|
1972
|
+
} if purpose == 'classification' else {
|
1973
|
+
'n_neighbors': [3],
|
1974
|
+
'weights': ['uniform'],
|
1975
|
+
'metric': ['euclidean'],
|
1976
|
+
'leaf_size': [30],
|
1977
|
+
'p': [2],
|
1978
|
+
},
|
1979
|
+
"Naive Bayes": {
|
1980
|
+
"var_smoothing": [1e-9],
|
1981
|
+
},
|
1982
|
+
"SVR": {
|
1983
|
+
"C": [1],
|
1984
|
+
"gamma": ['scale'],
|
1985
|
+
"kernel": ['rbf'],
|
1986
|
+
},
|
1987
|
+
"Linear Regression": {
|
1988
|
+
"fit_intercept": [True],
|
1989
|
+
},
|
1990
|
+
"Extra Trees": {
|
1991
|
+
"n_estimators": [100],
|
1992
|
+
"max_depth": [None, 10],
|
1993
|
+
"min_samples_split": [2],
|
1994
|
+
"min_samples_leaf": [1],
|
1995
|
+
},
|
1996
|
+
"CatBoost": {
|
1997
|
+
"iterations": [100],
|
1998
|
+
"learning_rate": [0.1],
|
1999
|
+
"depth": [3],
|
2000
|
+
"l2_leaf_reg": [1],
|
2001
|
+
},
|
2002
|
+
"LightGBM": {
|
2003
|
+
"n_estimators": [100],
|
2004
|
+
"num_leaves": [31],
|
2005
|
+
"max_depth": [10],
|
2006
|
+
'min_data_in_leaf': [20],
|
2007
|
+
'min_gain_to_split': [0.01],
|
2008
|
+
'scale_pos_weight': [10],
|
2009
|
+
},
|
2010
|
+
"Bagging": {
|
2011
|
+
"n_estimators": [50],
|
2012
|
+
"max_samples": [0.7],
|
2013
|
+
"max_features": [0.7],
|
2014
|
+
},
|
2015
|
+
"Neural Network": {
|
2016
|
+
"hidden_layer_sizes": [(50,)],
|
2017
|
+
"activation": ["relu"],
|
2018
|
+
"solver": ["adam"],
|
2019
|
+
"alpha": [0.0001],
|
2020
|
+
},
|
2021
|
+
"Decision Tree": {
|
2022
|
+
"max_depth": [None, 10],
|
2023
|
+
"min_samples_split": [2],
|
2024
|
+
"min_samples_leaf": [1],
|
2025
|
+
"criterion": ["gini"],
|
2026
|
+
},
|
2027
|
+
"AdaBoost": {
|
2028
|
+
"n_estimators": [50],
|
2029
|
+
"learning_rate": [0.5],
|
2030
|
+
},
|
2031
|
+
"Linear Discriminant Analysis": {
|
2032
|
+
"solver": ["svd"],
|
2033
|
+
"shrinkage": [None],
|
2034
|
+
},
|
2035
|
+
"Quadratic Discriminant Analysis": {
|
2036
|
+
'reg_param': [0.0],
|
2037
|
+
'priors': [None],
|
2038
|
+
'tol': [1e-4],
|
2039
|
+
},
|
2040
|
+
"Ridge": {'class_weight': [None, 'balanced']} if purpose == "classification" else {
|
2041
|
+
'alpha': [0.1, 1, 10],
|
2042
|
+
},
|
2043
|
+
"Perceptron": {
|
2044
|
+
'alpha': [1e-3],
|
2045
|
+
'penalty': ['l2'],
|
2046
|
+
'max_iter': [1000],
|
2047
|
+
'eta0': [1.0],
|
2048
|
+
},
|
2049
|
+
"Bernoulli Naive Bayes": {
|
2050
|
+
'alpha': [0.1, 1, 10],
|
2051
|
+
'binarize': [0.0],
|
2052
|
+
'fit_prior': [True],
|
2053
|
+
},
|
2054
|
+
"SGDClassifier": {
|
2055
|
+
'eta0': [0.01],
|
2056
|
+
'loss': ['hinge'],
|
2057
|
+
'penalty': ['l2'],
|
2058
|
+
'alpha': [1e-3],
|
2059
|
+
'max_iter': [1000],
|
2060
|
+
'tol': [1e-3],
|
2061
|
+
'random_state': [random_state],
|
2062
|
+
'learning_rate': ['constant'],
|
2063
|
+
},
|
2064
|
+
}
|
2065
|
+
elif cv_level in ['high','advanced','h']:
|
2066
|
+
param_grids = {
|
2067
|
+
"Random Forest": {
|
2068
|
+
"n_estimators": [100, 200, 500, 700, 1000],
|
2069
|
+
"max_depth": [None, 3, 5, 10, 15, 20, 30],
|
2070
|
+
"min_samples_split": [2, 5, 10, 20],
|
2071
|
+
"min_samples_leaf": [1, 2, 4],
|
2072
|
+
"class_weight": [None, "balanced"] if purpose == "classification" else {},
|
2073
|
+
} if purpose == "classification" else {
|
2074
|
+
"n_estimators": [100, 200, 500, 700, 1000],
|
2075
|
+
"max_depth": [None, 3, 5, 10, 15, 20, 30],
|
2076
|
+
"min_samples_split": [2, 5, 10, 20],
|
2077
|
+
"min_samples_leaf": [1, 2, 4],
|
2078
|
+
"max_features": ['auto', 'sqrt', 'log2'], # Number of features to consider when looking for the best split
|
2079
|
+
"bootstrap": [True, False], # Whether bootstrap samples are used when building trees
|
2080
|
+
},
|
2081
|
+
"SVM": {
|
2082
|
+
"C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
|
2083
|
+
"gamma": ["scale", "auto", 0.001, 0.01, 0.1],
|
2084
|
+
"kernel": ["linear", "rbf", "poly"],
|
2085
|
+
},
|
2086
|
+
"Logistic Regression": {
|
2087
|
+
"C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
|
2088
|
+
"solver": ["liblinear", "saga", "newton-cg", "lbfgs"],
|
2089
|
+
"penalty": ["l1", "l2", "elasticnet"],
|
2090
|
+
"max_iter": [100, 200, 300, 500],
|
2091
|
+
},
|
2092
|
+
"Lasso":{
|
2093
|
+
"alpha": [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
|
2094
|
+
"max_iter": [500, 1000, 2000, 5000],
|
2095
|
+
"tol": [1e-4, 1e-5, 1e-6],
|
2096
|
+
"selection": ["cyclic", "random"]
|
2097
|
+
},
|
2098
|
+
"LassoCV":{
|
2099
|
+
"alphas": [[0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0]],
|
2100
|
+
"max_iter": [500, 1000, 2000, 5000],
|
2101
|
+
"cv": [3, 5, 10],
|
2102
|
+
"tol": [1e-4, 1e-5, 1e-6]
|
2103
|
+
},
|
2104
|
+
"Gradient Boosting": {
|
2105
|
+
"n_estimators": [100, 200, 300, 400, 500, 700, 1000],
|
2106
|
+
"learning_rate": [0.001, 0.01, 0.1, 0.2, 0.3, 0.5],
|
2107
|
+
"max_depth": [3, 5, 7, 9, 15],
|
2108
|
+
"min_samples_split": [2, 5, 10, 20],
|
2109
|
+
"subsample": [0.8, 1.0],
|
2110
|
+
},
|
2111
|
+
"XGBoost": {
|
2112
|
+
"n_estimators": [100, 200, 500, 700],
|
2113
|
+
"max_depth": [3, 5, 7, 10],
|
2114
|
+
"learning_rate": [0.01, 0.1, 0.2, 0.3],
|
2115
|
+
"subsample": [0.8, 1.0],
|
2116
|
+
"colsample_bytree": [0.8, 0.9, 1.0],
|
2117
|
+
},
|
2118
|
+
"KNN": {
|
2119
|
+
"n_neighbors": [1, 3, 5, 10, 15, 20],
|
2120
|
+
"weights": ["uniform", "distance"],
|
2121
|
+
"algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
|
2122
|
+
"p": [1, 2], # 1 for Manhattan, 2 for Euclidean distance
|
2123
|
+
} if purpose=='classification' else {
|
2124
|
+
'n_neighbors': [3, 5, 7, 9, 11], # Number of neighbors
|
2125
|
+
'weights': ['uniform', 'distance'], # Weight function used in prediction
|
2126
|
+
'metric': ['euclidean', 'manhattan', 'minkowski'], # Distance metric
|
2127
|
+
'leaf_size': [20, 30, 40, 50], # Leaf size for KDTree or BallTree algorithms
|
2128
|
+
'p': [1, 2] # Power parameter for the Minkowski metric (1 = Manhattan, 2 = Euclidean)
|
2129
|
+
},
|
2130
|
+
"Naive Bayes": {
|
2131
|
+
"var_smoothing": [1e-10, 1e-9, 1e-8, 1e-7],
|
2132
|
+
},
|
2133
|
+
"AdaBoost": {
|
2134
|
+
"n_estimators": [50, 100, 200, 300, 500],
|
2135
|
+
"learning_rate": [0.001, 0.01, 0.1, 0.5, 1.0],
|
2136
|
+
},
|
2137
|
+
"SVR": {
|
2138
|
+
"C": [0.01, 0.1, 1, 10, 100, 1000],
|
2139
|
+
"gamma": [0.001, 0.01, 0.1, "scale", "auto"],
|
2140
|
+
"kernel": ["linear", "rbf", "poly"],
|
2141
|
+
},
|
2142
|
+
"Linear Regression": {
|
2143
|
+
"fit_intercept": [True, False],
|
2144
|
+
},
|
2145
|
+
"Lasso":{
|
2146
|
+
"alpha": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
|
2147
|
+
"max_iter": [1000, 2000] # Higher iteration limit for fine-tuning
|
2148
|
+
},
|
2149
|
+
"Extra Trees": {
|
2150
|
+
"n_estimators": [100, 200, 500, 700, 1000],
|
2151
|
+
"max_depth": [None, 5, 10, 15, 20, 30],
|
2152
|
+
"min_samples_split": [2, 5, 10, 20],
|
2153
|
+
"min_samples_leaf": [1, 2, 4]
|
2154
|
+
},
|
2155
|
+
"CatBoost": {
|
2156
|
+
"iterations": [100, 200, 500],
|
2157
|
+
"learning_rate": [0.001, 0.01, 0.1, 0.2],
|
2158
|
+
"depth": [3, 5, 7, 10],
|
2159
|
+
"l2_leaf_reg": [1, 3, 5, 7, 10],
|
2160
|
+
"border_count": [32, 64, 128],
|
2161
|
+
},
|
2162
|
+
"LightGBM": {
|
2163
|
+
"n_estimators": [100, 200, 500, 700, 1000],
|
2164
|
+
"learning_rate": [0.001, 0.01, 0.1, 0.2],
|
2165
|
+
"num_leaves": [31, 50, 100, 200],
|
2166
|
+
"max_depth": [-1, 5, 10, 20, 30],
|
2167
|
+
"min_child_samples": [5, 10, 20],
|
2168
|
+
"subsample": [0.8, 1.0],
|
2169
|
+
"colsample_bytree": [0.8, 0.9, 1.0],
|
2170
|
+
},
|
2171
|
+
"Neural Network": {
|
2172
|
+
"hidden_layer_sizes": [(50,), (100,), (100, 50), (200, 100)],
|
2173
|
+
"activation": ["relu", "tanh", "logistic"],
|
2174
|
+
"solver": ["adam", "sgd", "lbfgs"],
|
2175
|
+
"alpha": [0.0001, 0.001, 0.01],
|
2176
|
+
"learning_rate": ["constant", "adaptive"],
|
2177
|
+
},
|
2178
|
+
"Decision Tree": {
|
2179
|
+
"max_depth": [None, 5, 10, 20, 30],
|
2180
|
+
"min_samples_split": [2, 5, 10, 20],
|
2181
|
+
"min_samples_leaf": [1, 2, 5, 10],
|
2182
|
+
"criterion": ["gini", "entropy"],
|
2183
|
+
"splitter": ["best", "random"],
|
2184
|
+
},
|
2185
|
+
"Linear Discriminant Analysis": {
|
2186
|
+
"solver": ["svd", "lsqr", "eigen"],
|
2187
|
+
"shrinkage": [None, "auto", 0.1, 0.5, 1.0], # shrinkage levels for 'lsqr' and 'eigen'
|
2188
|
+
},
|
2189
|
+
'Ridge': {'class_weight': [None, 'balanced']} if purpose == "classification" else {
|
2190
|
+
'alpha': [0.1, 1, 10, 100, 1000],
|
2191
|
+
'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'lbfgs'],
|
2192
|
+
'fit_intercept': [True, False], # Whether to calculate the intercept
|
2193
|
+
'normalize': [True, False] # If True, the regressors X will be normalized
|
2194
|
+
}
|
2195
|
+
}
|
2196
|
+
else: # median level
|
2197
|
+
param_grids = {
|
2198
|
+
"Random Forest": {
|
2199
|
+
"n_estimators": [100, 200, 500],
|
2200
|
+
"max_depth": [None, 10, 20, 30],
|
2201
|
+
"min_samples_split": [2, 5, 10],
|
2202
|
+
"min_samples_leaf": [1, 2, 4],
|
2203
|
+
"class_weight": [None, "balanced"]
|
2204
|
+
} if purpose == "classification" else {
|
2205
|
+
"n_estimators": [100, 200, 500],
|
2206
|
+
"max_depth": [None, 10, 20, 30],
|
2207
|
+
"min_samples_split": [2, 5, 10],
|
2208
|
+
"min_samples_leaf": [1, 2, 4],
|
2209
|
+
"max_features": ['auto', 'sqrt', 'log2'], # Number of features to consider when looking for the best split
|
2210
|
+
"bootstrap": [True, False], # Whether bootstrap samples are used when building trees
|
2211
|
+
},
|
2212
|
+
"SVM": {
|
2213
|
+
"C": [0.1, 1, 10, 100], # Regularization strength
|
2214
|
+
"gamma": ['scale', 'auto'], # Common gamma values
|
2215
|
+
"kernel": ['rbf', 'linear', 'poly'],
|
2216
|
+
},
|
2217
|
+
"Logistic Regression": {
|
2218
|
+
"C": [0.1, 1, 10, 100], # Regularization strength
|
2219
|
+
"solver": ['lbfgs', 'liblinear', 'saga'], # Common solvers
|
2220
|
+
"penalty": ['l2'], # L2 penalty is most common
|
2221
|
+
"max_iter": [500, 1000, 2000], # Increased max_iter for better convergence
|
2222
|
+
},
|
2223
|
+
"Lasso":{
|
2224
|
+
"alpha": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
|
2225
|
+
"max_iter": [500, 1000, 2000]
|
2226
|
+
},
|
2227
|
+
"LassoCV":{
|
2228
|
+
"alphas": [[0.001, 0.01, 0.1, 1.0, 10.0, 100.0]],
|
2229
|
+
"max_iter": [500, 1000, 2000]
|
2230
|
+
},
|
2231
|
+
"Gradient Boosting": {
|
2232
|
+
"n_estimators": [100, 200, 500],
|
2233
|
+
"learning_rate": [0.01, 0.1, 0.2],
|
2234
|
+
"max_depth": [3, 5, 7],
|
2235
|
+
"min_samples_split": [2, 5, 10],
|
2236
|
+
"subsample": [0.8, 1.0],
|
2237
|
+
},
|
2238
|
+
"XGBoost": {
|
2239
|
+
"n_estimators": [100, 200, 500],
|
2240
|
+
"max_depth": [3, 5, 7],
|
2241
|
+
"learning_rate": [0.01, 0.1, 0.2],
|
2242
|
+
"subsample": [0.8, 1.0],
|
2243
|
+
"colsample_bytree": [0.8, 1.0],
|
2244
|
+
},
|
2245
|
+
"KNN": {
|
2246
|
+
"n_neighbors": [3, 5, 7, 10],
|
2247
|
+
"weights": ['uniform', 'distance'],
|
2248
|
+
"algorithm": ['auto', 'ball_tree', 'kd_tree', 'brute'],
|
2249
|
+
"p": [1, 2],
|
2250
|
+
} if purpose=='classification' else {
|
2251
|
+
'n_neighbors': [3, 5, 7, 9, 11], # Number of neighbors
|
2252
|
+
'weights': ['uniform', 'distance'], # Weight function used in prediction
|
2253
|
+
'metric': ['euclidean', 'manhattan', 'minkowski'], # Distance metric
|
2254
|
+
'leaf_size': [20, 30, 40, 50], # Leaf size for KDTree or BallTree algorithms
|
2255
|
+
'p': [1, 2] # Power parameter for the Minkowski metric (1 = Manhattan, 2 = Euclidean)
|
2256
|
+
},
|
2257
|
+
"Naive Bayes": {
|
2258
|
+
"var_smoothing": [1e-9, 1e-8, 1e-7],
|
2259
|
+
},
|
2260
|
+
"SVR": {
|
2261
|
+
"C": [0.1, 1, 10, 100],
|
2262
|
+
"gamma": ['scale', 'auto'],
|
2263
|
+
"kernel": ['rbf', 'linear'],
|
2264
|
+
},
|
2265
|
+
"Linear Regression": {
|
2266
|
+
"fit_intercept": [True, False],
|
2267
|
+
},
|
2268
|
+
"Lasso": {
|
2269
|
+
"alpha": [0.1, 1.0, 10.0],
|
2270
|
+
"max_iter": [1000, 2000], # Sufficient iterations for convergence
|
2271
|
+
},
|
2272
|
+
"Extra Trees": {
|
2273
|
+
"n_estimators": [100, 200, 500],
|
2274
|
+
"max_depth": [None, 10, 20, 30],
|
2275
|
+
"min_samples_split": [2, 5, 10],
|
2276
|
+
"min_samples_leaf": [1, 2, 4],
|
2277
|
+
},
|
2278
|
+
"CatBoost": {
|
2279
|
+
"iterations": [100, 200],
|
2280
|
+
"learning_rate": [0.01, 0.1],
|
2281
|
+
"depth": [3, 6, 10],
|
2282
|
+
"l2_leaf_reg": [1, 3, 5, 7],
|
2283
|
+
},
|
2284
|
+
"LightGBM": {
|
2285
|
+
"n_estimators": [100, 200, 500],
|
2286
|
+
"learning_rate": [0.01, 0.1],
|
2287
|
+
"num_leaves": [31, 50, 100],
|
2288
|
+
"max_depth": [-1, 10, 20],
|
2289
|
+
'min_data_in_leaf': [20], # Minimum samples in each leaf
|
2290
|
+
'min_gain_to_split': [0.01], # Minimum gain to allow a split
|
2291
|
+
'scale_pos_weight': [10], # Address class imbalance
|
2292
|
+
},
|
2293
|
+
"Bagging": {
|
2294
|
+
"n_estimators": [10, 50, 100],
|
2295
|
+
"max_samples": [0.5, 0.7, 1.0],
|
2296
|
+
"max_features": [0.5, 0.7, 1.0],
|
2297
|
+
},
|
2298
|
+
"Neural Network": {
|
2299
|
+
"hidden_layer_sizes": [(50,), (100,), (100, 50)],
|
2300
|
+
"activation": ["relu", "tanh"],
|
2301
|
+
"solver": ["adam", "sgd"],
|
2302
|
+
"alpha": [0.0001, 0.001],
|
2303
|
+
},
|
2304
|
+
"Decision Tree": {
|
2305
|
+
"max_depth": [None, 10, 20],
|
2306
|
+
"min_samples_split": [2, 10],
|
2307
|
+
"min_samples_leaf": [1, 4],
|
2308
|
+
"criterion": ["gini", "entropy"],
|
2309
|
+
},
|
2310
|
+
"AdaBoost": {
|
2311
|
+
"n_estimators": [50, 100],
|
2312
|
+
"learning_rate": [0.5, 1.0],
|
2313
|
+
},
|
2314
|
+
"Linear Discriminant Analysis": {
|
2315
|
+
"solver": ["svd", "lsqr", "eigen"],
|
2316
|
+
"shrinkage": [None, "auto"],
|
2317
|
+
}, "Quadratic Discriminant Analysis":{
|
2318
|
+
'reg_param': [0.0, 0.1, 0.5, 1.0], # Regularization parameter
|
2319
|
+
'priors': [None, [0.5, 0.5], [0.3, 0.7]], # Class priors
|
2320
|
+
'tol': [1e-4, 1e-3, 1e-2] # Tolerance value for the convergence of the algorithm
|
2321
|
+
},
|
2322
|
+
"Perceptron":{
|
2323
|
+
'alpha': [1e-4, 1e-3, 1e-2], # Regularization parameter
|
2324
|
+
'penalty': ['l2', 'l1', 'elasticnet'], # Regularization penalty
|
2325
|
+
'max_iter': [1000, 2000], # Maximum number of iterations
|
2326
|
+
'eta0': [1.0, 0.1], # Learning rate for gradient descent
|
2327
|
+
'tol': [1e-3, 1e-4, 1e-5], # Tolerance for stopping criteria
|
2328
|
+
'random_state': [random_state] # Random state for reproducibility
|
2329
|
+
},
|
2330
|
+
"Bernoulli Naive Bayes":{
|
2331
|
+
'alpha': [0.1, 1.0, 10.0], # Additive (Laplace) smoothing parameter
|
2332
|
+
'binarize': [0.0, 0.5, 1.0], # Threshold for binarizing the input features
|
2333
|
+
'fit_prior': [True, False] # Whether to learn class prior probabilities
|
2334
|
+
},
|
2335
|
+
"SGDClassifier":{
|
2336
|
+
'eta0': [0.01, 0.1, 1.0],
|
2337
|
+
'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'], # Loss function
|
2338
|
+
'penalty': ['l2', 'l1', 'elasticnet'], # Regularization penalty
|
2339
|
+
'alpha': [1e-4, 1e-3, 1e-2], # Regularization strength
|
2340
|
+
'l1_ratio': [0.15, 0.5, 0.85], # L1 ratio for elasticnet penalty
|
2341
|
+
'max_iter': [1000, 2000], # Maximum number of iterations
|
2342
|
+
'tol': [1e-3, 1e-4], # Tolerance for stopping criteria
|
2343
|
+
'random_state': [random_state], # Random state for reproducibility
|
2344
|
+
'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'], # Learning rate schedule
|
2345
|
+
},
|
2346
|
+
'Ridge': {'class_weight': [None, 'balanced']} if purpose == "classification" else {
|
2347
|
+
'alpha': [0.1, 1, 10, 100],
|
2348
|
+
'solver': ['auto', 'svd', 'cholesky', 'lsqr'] # Solver for optimization
|
2349
|
+
}
|
2350
|
+
}
|
2351
|
+
|
2352
|
+
results = {}
|
2353
|
+
# Use StratifiedKFold for classification and KFold for regression
|
2354
|
+
cv = (
|
2355
|
+
StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_state)
|
2356
|
+
if purpose == "classification"
|
2357
|
+
else KFold(n_splits=cv_folds, shuffle=True, random_state=random_state)
|
2358
|
+
)
|
2359
|
+
|
2360
|
+
# Train and validate each model
|
2361
|
+
for name, clf in tqdm(
|
2362
|
+
models.items(),
|
2363
|
+
desc="models",
|
2364
|
+
colour="green",
|
2365
|
+
bar_format="{l_bar}{bar} {n_fmt}/{total_fmt}",
|
2366
|
+
):
|
2367
|
+
if verbose:
|
2368
|
+
print(f"\nTraining and validating {name}:")
|
2369
|
+
|
2370
|
+
# Grid search with KFold or StratifiedKFold
|
2371
|
+
gs = GridSearchCV(
|
2372
|
+
clf,
|
2373
|
+
param_grid=param_grids.get(name, {}),
|
2374
|
+
scoring=(
|
2375
|
+
"roc_auc" if purpose == "classification" else "neg_mean_squared_error"
|
2376
|
+
),
|
2377
|
+
cv=cv,
|
2378
|
+
n_jobs=n_jobs,
|
2379
|
+
verbose=verbose,
|
2380
|
+
)
|
2381
|
+
gs.fit(x_train, y_train)
|
2382
|
+
best_clf = gs.best_estimator_
|
2383
|
+
# make sure x_train and x_test has the same name
|
2384
|
+
x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
|
2385
|
+
y_pred = best_clf.predict(x_true)
|
2386
|
+
|
2387
|
+
# y_pred_proba
|
2388
|
+
if hasattr(best_clf, "predict_proba"):
|
2389
|
+
y_pred_proba = best_clf.predict_proba(x_true)[:, 1]
|
2390
|
+
elif hasattr(best_clf, "decision_function"):
|
2391
|
+
# If predict_proba is not available, use decision_function (e.g., for SVM)
|
2392
|
+
y_pred_proba = best_clf.decision_function(x_true)
|
2393
|
+
# Ensure y_pred_proba is within 0 and 1 bounds
|
2394
|
+
y_pred_proba = (y_pred_proba - y_pred_proba.min()) / (
|
2395
|
+
y_pred_proba.max() - y_pred_proba.min()
|
2396
|
+
)
|
2397
|
+
else:
|
2398
|
+
y_pred_proba = None # No probability output for certain models
|
2399
|
+
|
2400
|
+
|
2401
|
+
validation_scores = {}
|
2402
|
+
if y_true is not None:
|
2403
|
+
validation_scores = cal_metrics(y_true, y_pred, y_pred_proba=y_pred_proba, purpose=purpose, average="weighted")
|
2404
|
+
|
2405
|
+
# Calculate ROC curve
|
2406
|
+
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
|
2407
|
+
if y_pred_proba is not None:
|
2408
|
+
# fpr, tpr, roc_auc = dict(), dict(), dict()
|
2409
|
+
fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
|
2410
|
+
lower_ci, upper_ci = cal_auc_ci(y_true, y_pred_proba,verbose=False)
|
2411
|
+
roc_auc = auc(fpr, tpr)
|
2412
|
+
roc_info = {
|
2413
|
+
"fpr": fpr.tolist(),
|
2414
|
+
"tpr": tpr.tolist(),
|
2415
|
+
"auc": roc_auc,
|
2416
|
+
"ci95": (lower_ci, upper_ci),
|
2417
|
+
}
|
2418
|
+
# precision-recall curve
|
2419
|
+
precision_, recall_, _ = precision_recall_curve(y_true, y_pred_proba)
|
2420
|
+
avg_precision_ = average_precision_score(y_true, y_pred_proba)
|
2421
|
+
pr_info = {
|
2422
|
+
"precision": precision_,
|
2423
|
+
"recall": recall_,
|
2424
|
+
"avg_precision": avg_precision_,
|
2425
|
+
}
|
2426
|
+
else:
|
2427
|
+
roc_info, pr_info = None, None
|
2428
|
+
+            if purpose == "classification":
+                results[name] = {
+                    "best_clf": gs.best_estimator_,
+                    "best_params": gs.best_params_,
+                    "auc_indiv": [
+                        gs.cv_results_[f"split{i}_test_score"][gs.best_index_]
+                        for i in range(cv_folds)
+                    ],
+                    "scores": validation_scores,
+                    "roc_curve": roc_info,
+                    "pr_curve": pr_info,
+                    "confusion_matrix": confusion_matrix(y_true, y_pred),
+                    "predictions": y_pred.tolist(),
+                    "predictions_proba": (
+                        y_pred_proba.tolist() if y_pred_proba is not None else None
+                    ),
+                }
+            else:  # "regression"
+                results[name] = {
+                    "best_clf": gs.best_estimator_,
+                    "best_params": gs.best_params_,
+                    "scores": validation_scores,  # e.g., neg_MSE, R², etc.
+                    "predictions": y_pred.tolist(),
+                    "predictions_proba": (
+                        y_pred_proba.tolist() if y_pred_proba is not None else None
+                    ),
+                }
+
+        else:  # y_true is None: no ground truth, so report predictions only
+            results[name] = {
+                "best_clf": gs.best_estimator_,
+                "best_params": gs.best_params_,
+                "scores": validation_scores,
+                "predictions": y_pred.tolist(),
+                "predictions_proba": (
+                    y_pred_proba.tolist() if y_pred_proba is not None else None
+                ),
+            }
+
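# Illustrative sketch (not from ml2ls.py): "auc_indiv" above pulls the per-fold test scores
# of the winning parameter setting out of GridSearchCV.cv_results_; "split{i}_test_score"
# holds fold i's score for every candidate, and best_index_ points at the best candidate.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

X_demo, y_demo = make_classification(n_samples=120, random_state=0)
gs_demo = GridSearchCV(
    LogisticRegression(max_iter=1000), {"C": [0.1, 1.0]}, scoring="roc_auc", cv=3
).fit(X_demo, y_demo)
fold_aucs = [gs_demo.cv_results_[f"split{i}_test_score"][gs_demo.best_index_] for i in range(3)]
print(fold_aucs)  # three ROC-AUC values, one per fold, for the best C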
+    # Convert results to DataFrame
+    df_results = pd.DataFrame.from_dict(results, orient="index")
+
+    # sort models by validation ROC-AUC
+    if y_true is not None and purpose == "classification":
+        df_scores = pd.DataFrame(
+            df_results["scores"].tolist(), index=df_results["scores"].index
+        ).sort_values(by="roc_auc", ascending=False)
+        df_results = df_results.loc[df_scores.index]
+
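# Illustrative sketch (not from ml2ls.py): orient="index" gives one row per model, and
# expanding the per-model "scores" dicts into their own DataFrame is what makes it possible
# to sort every model by a single metric such as roc_auc.
import pandas as pd

results_toy = {
    "LogisticRegression": {"scores": {"roc_auc": 0.91, "accuracy": 0.85}},
    "RandomForest": {"scores": {"roc_auc": 0.88, "accuracy": 0.86}},
}
df_toy = pd.DataFrame.from_dict(results_toy, orient="index")
scores_toy = pd.DataFrame(df_toy["scores"].tolist(), index=df_toy.index)
print(scores_toy.sort_values(by="roc_auc", ascending=False))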
+        if plot_:
+            from datetime import datetime
+
+            now_ = datetime.now().strftime("%y%m%d_%H%M%S")
+            nexttile = plot.subplot(figsize=[12, 10])
+            plot.heatmap(df_scores, kind="direct", ax=nexttile())
+            plot.figsets(xangle=30)
+            if dir_save:
+                ips.figsave(dir_save + f"scores_sorted_heatmap{now_}.pdf")
+            if df_scores.shape[0] > 1:  # draw cluster
+                plot.heatmap(df_scores, kind="direct", cluster=True)
+                plot.figsets(xangle=30)
+                if dir_save:
+                    ips.figsave(dir_save + f"scores_clus{now_}.pdf")
+    if all([plot_, y_true is not None, purpose == "classification"]):
+        try:
+            if len(models) > 3:
+                plot_validate_features(df_results)
+            else:
+                plot_validate_features_single(df_results, figsize=(12, 4 * len(models)))
+            if dir_save:
+                ips.figsave(dir_save + f"validate_features{now_}.pdf")
+        except Exception as e:
+            print(f"Error: plotting failed: {e}")
+    return df_results
+
+
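# Usage sketch (an assumption, not from ml2ls.py): given the df_results returned above,
# with one row per model and rows AUC-sorted when y_true is provided for classification,
# the best model and its validated ROC-AUC with 95% CI can be read back like this:
best_name = df_results.index[0]
best_model = df_results.loc[best_name, "best_clf"]
best_auc = df_results.loc[best_name, "roc_curve"]["auc"]
auc_ci95 = df_results.loc[best_name, "roc_curve"]["ci95"]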
+def cal_metrics(
+    y_true, y_pred, y_pred_proba=None, purpose="regression", average="weighted"
+):
+    """
+    Calculate regression or classification metrics based on the purpose.
+
+    Parameters:
+    - y_true: Array of true values.
+    - y_pred: Array of predicted labels for classification or predicted values for regression.
+    - y_pred_proba: Array of predicted probabilities for classification (optional).
+    - purpose: str, "regression" or "classification".
+    - average: str, averaging method for multi-class classification ("binary", "micro", "macro", "weighted", etc.).
+
+    Returns:
+    - validation_scores: dict of computed metrics.
+    """
+    from sklearn.metrics import (
+        mean_squared_error,
+        mean_absolute_error,
+        mean_absolute_percentage_error,
+        explained_variance_score,
+        r2_score,
+        mean_squared_log_error,
+        accuracy_score,
+        precision_score,
+        recall_score,
+        f1_score,
+        roc_auc_score,
+        matthews_corrcoef,
+        confusion_matrix,
+        balanced_accuracy_score,
+        average_precision_score,
+        precision_recall_curve,
+    )
+    validation_scores = {}
+
+    if purpose == "regression":
+        y_true = np.asarray(y_true)
+        y_true = y_true.ravel()
+        y_pred = np.asarray(y_pred)
+        y_pred = y_pred.ravel()
+        # Regression metrics
+        validation_scores = {
+            "mse": mean_squared_error(y_true, y_pred),
+            "rmse": np.sqrt(mean_squared_error(y_true, y_pred)),
+            "mae": mean_absolute_error(y_true, y_pred),
+            "r2": r2_score(y_true, y_pred),
+            "mape": mean_absolute_percentage_error(y_true, y_pred),
+            "explained_variance": explained_variance_score(y_true, y_pred),
+            "mbd": np.mean(y_pred - y_true),  # Mean Bias Deviation
+        }
+        # Check if MSLE can be calculated
+        if np.all(y_true >= 0) and np.all(y_pred >= 0):  # Ensure no negative values
+            validation_scores["msle"] = mean_squared_log_error(y_true, y_pred)
+        else:
+            validation_scores["msle"] = "Cannot be calculated due to negative values"
+
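# Illustrative sketch (not from ml2ls.py): the guard above exists because sklearn's
# mean_squared_log_error raises on negative inputs; the metric is simply MSE computed on
# log1p-transformed values.
import numpy as np

y_t = np.array([3.0, 5.0, 2.5])
y_p = np.array([2.5, 5.0, 3.0])
msle_manual = np.mean((np.log1p(y_t) - np.log1p(y_p)) ** 2)  # matches mean_squared_log_error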
+    elif purpose == "classification":
+        # Classification metrics
+        validation_scores = {
+            "accuracy": accuracy_score(y_true, y_pred),
+            "precision": precision_score(y_true, y_pred, average=average),
+            "recall": recall_score(y_true, y_pred, average=average),
+            "f1": f1_score(y_true, y_pred, average=average),
+            "mcc": matthews_corrcoef(y_true, y_pred),
+            "specificity": None,
+            "balanced_accuracy": balanced_accuracy_score(y_true, y_pred),
+        }
+
+        # Confusion matrix to calculate specificity (binary classification)
+        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
+        validation_scores["specificity"] = (
+            tn / (tn + fp) if (tn + fp) > 0 else 0
+        )  # true-negative rate
+
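# Illustrative sketch (not from ml2ls.py): unpacking confusion_matrix().ravel() into
# (tn, fp, fn, tp) only works for binary labels, where the matrix is 2x2; with k classes
# ravel() returns k*k values and the unpacking above would fail.
from sklearn.metrics import confusion_matrix

tn, fp, fn, tp = confusion_matrix([0, 1, 0, 1, 0], [0, 1, 1, 1, 0]).ravel()
specificity = tn / (tn + fp)  # 2 / (2 + 1)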
+        if y_pred_proba is not None:
+            # Calculate ROC-AUC
+            validation_scores["roc_auc"] = roc_auc_score(y_true, y_pred_proba)
+            # PR-AUC (Precision-Recall AUC) calculation
+            validation_scores["pr_auc"] = average_precision_score(y_true, y_pred_proba)
+    else:
+        raise ValueError(
+            "Invalid purpose specified. Choose 'regression' or 'classification'."
+        )
+
+    return validation_scores
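# Usage sketch for cal_metrics as defined above, on toy data for both purposes:
import numpy as np

y_true_cls = np.array([0, 1, 1, 0, 1])
y_pred_cls = np.array([0, 1, 0, 0, 1])
y_proba_cls = np.array([0.2, 0.9, 0.4, 0.1, 0.8])
cls_scores = cal_metrics(
    y_true_cls, y_pred_cls, y_pred_proba=y_proba_cls, purpose="classification", average="binary"
)
print(cls_scores["roc_auc"], cls_scores["specificity"])

reg_scores = cal_metrics(
    np.array([2.5, 3.0, 4.0]), np.array([2.7, 2.9, 4.2]), purpose="regression"
)
print(reg_scores["rmse"], reg_scores["r2"])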