py2ls 0.2.4.7__py3-none-any.whl → 0.2.4.9__py3-none-any.whl

This diff shows the contents of publicly released package versions as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions exactly as they appear in that registry.
py2ls/ml2ls.py CHANGED
@@ -1,33 +1,59 @@
1
- from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier,BaggingClassifier
2
- from sklearn.svm import SVC
1
+ from sklearn.ensemble import (
2
+ RandomForestClassifier,
3
+ GradientBoostingClassifier,
4
+ AdaBoostClassifier,
5
+ BaggingClassifier,
6
+ )
7
+ from sklearn.svm import SVC, SVR
3
8
  from sklearn.calibration import CalibratedClassifierCV
4
- from sklearn.model_selection import GridSearchCV,StratifiedKFold
5
- from sklearn.linear_model import LassoCV, LogisticRegression, Lasso, Ridge,RidgeClassifierCV, ElasticNet
9
+ from sklearn.model_selection import GridSearchCV, StratifiedKFold
10
+ from sklearn.linear_model import (
11
+ LassoCV,
12
+ LogisticRegression, LinearRegression,
13
+ Lasso,
14
+ Ridge,
15
+ RidgeClassifierCV,
16
+ ElasticNet,
17
+ )
6
18
  from sklearn.feature_selection import RFE
7
19
  from sklearn.naive_bayes import GaussianNB
8
20
  from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
9
21
  import xgboost as xgb # Make sure you have xgboost installed
10
22
 
11
23
  from sklearn.model_selection import train_test_split, cross_val_score
12
- from sklearn.metrics import (accuracy_score, precision_score, recall_score,
13
- f1_score, roc_auc_score, confusion_matrix,
14
- matthews_corrcoef,roc_curve,auc,
15
- balanced_accuracy_score,precision_recall_curve,average_precision_score)
24
+ from sklearn.metrics import (
25
+ accuracy_score,
26
+ precision_score,
27
+ recall_score,
28
+ f1_score,
29
+ roc_auc_score,
30
+ confusion_matrix,
31
+ matthews_corrcoef,
32
+ roc_curve,
33
+ auc,
34
+ balanced_accuracy_score,
35
+ precision_recall_curve,
36
+ average_precision_score,
37
+ )
16
38
  from imblearn.over_sampling import SMOTE
17
39
  from sklearn.pipeline import Pipeline
18
40
  from collections import defaultdict
19
- from sklearn.preprocessing import StandardScaler
20
- from typing import Dict, Any, Optional,List
41
+ from sklearn.preprocessing import StandardScaler, OneHotEncoder
42
+ from typing import Dict, Any, Optional, List, Union
21
43
  import numpy as np
22
44
  import pandas as pd
23
- from . import ips
45
+ from . import ips
24
46
  from . import plot
25
47
  import matplotlib.pyplot as plt
26
48
  import seaborn as sns
27
- plt.style.use("paper")
49
+
50
+ plt.style.use("paper")
28
51
  import logging
29
52
  import warnings
30
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
53
+
54
+ logging.basicConfig(
55
+ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
56
+ )
31
57
  logger = logging.getLogger()
32
58
 
33
59
  # Ignore specific warnings (UserWarning in this case)
@@ -35,7 +61,10 @@ warnings.filterwarnings("ignore", category=UserWarning)
35
61
  from sklearn.tree import DecisionTreeClassifier
36
62
  from sklearn.neighbors import KNeighborsClassifier
37
63
 
38
- def features_knn(X_train: pd.DataFrame, y_train: pd.Series, knn_params: dict) -> pd.DataFrame:
64
+
65
+ def features_knn(
66
+ x_train: pd.DataFrame, y_train: pd.Series, knn_params: dict
67
+ ) -> pd.DataFrame:
39
68
  """
40
69
  A distance-based classifier that assigns labels based on the majority label of nearest neighbors.
41
70
  when to use:
@@ -46,76 +75,99 @@ def features_knn(X_train: pd.DataFrame, y_train: pd.Series, knn_params: dict) ->
46
75
  Fits KNeighborsClassifier and approximates feature influence using permutation importance.
47
76
  """
48
77
  knn = KNeighborsClassifier(**knn_params)
49
- knn.fit(X_train, y_train)
50
- importances = permutation_importance(knn, X_train, y_train, n_repeats=30, random_state=1, scoring="accuracy")
51
- return pd.DataFrame({"feature": X_train.columns, "importance": importances.importances_mean}).sort_values(by="importance", ascending=False)
78
+ knn.fit(x_train, y_train)
79
+ importances = permutation_importance(
80
+ knn, x_train, y_train, n_repeats=30, random_state=1, scoring="accuracy"
81
+ )
82
+ return pd.DataFrame(
83
+ {"feature": x_train.columns, "importance": importances.importances_mean}
84
+ ).sort_values(by="importance", ascending=False)
85
+
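
For orientation (not part of the diff), a minimal sketch of the same idea with plain scikit-learn: rank the features of a fitted KNN model via permutation importance. Data and column names are synthetic.

    import numpy as np
    import pandas as pd
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.inspection import permutation_importance

    x_train = pd.DataFrame(np.random.rand(100, 4), columns=["g1", "g2", "g3", "g4"])
    y_train = pd.Series(np.random.randint(0, 2, 100))

    knn = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)
    res = permutation_importance(knn, x_train, y_train,
                                 n_repeats=30, random_state=1, scoring="accuracy")
    ranking = pd.DataFrame({"feature": x_train.columns,
                            "importance": res.importances_mean})
    print(ranking.sort_values("importance", ascending=False))
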
52
86
 
53
87
  #! 1. Linear and Regularized Regression Methods
54
88
  # 1.1 Lasso
55
- def features_lasso(X_train: pd.DataFrame, y_train: pd.Series, lasso_params: dict) -> np.ndarray:
89
+ def features_lasso(
90
+ x_train: pd.DataFrame, y_train: pd.Series, lasso_params: dict
91
+ ) -> np.ndarray:
56
92
  """
57
- Lasso (Least Absolute Shrinkage and Selection Operator):
58
- A regularized linear regression method that uses L1 penalty to shrink coefficients, effectively
93
+ Lasso (Least Absolute Shrinkage and Selection Operator):
94
+ A regularized linear regression method that uses L1 penalty to shrink coefficients, effectively
59
95
  performing feature selection by zeroing out less important ones.
60
96
  """
61
97
  lasso = LassoCV(**lasso_params)
62
- lasso.fit(X_train, y_train)
98
+ lasso.fit(x_train, y_train)
63
99
  # Get non-zero coefficients and their corresponding features
64
100
  coefficients = lasso.coef_
65
- importance_df = pd.DataFrame({
66
- "feature": X_train.columns,
67
- "importance": np.abs(coefficients)
68
- })
69
- return importance_df[importance_df["importance"] > 0].sort_values(by="importance", ascending=False)
101
+ importance_df = pd.DataFrame(
102
+ {"feature": x_train.columns, "importance": np.abs(coefficients)}
103
+ )
104
+ return importance_df[importance_df["importance"] > 0].sort_values(
105
+ by="importance", ascending=False
106
+ )
107
+
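
As a hedged usage sketch (synthetic data, made-up names), the Lasso step keeps whichever features survive the L1 penalty, i.e. the non-zero LassoCV coefficients:

    import numpy as np
    import pandas as pd
    from sklearn.linear_model import LassoCV

    rng = np.random.default_rng(1)
    X = pd.DataFrame(rng.normal(size=(200, 5)), columns=list("abcde"))
    y = 3 * X["a"] - 2 * X["c"] + rng.normal(scale=0.1, size=200)

    lasso = LassoCV(alphas=np.logspace(-4, 4, 100), cv=10).fit(X, y)
    importance = pd.DataFrame({"feature": X.columns, "importance": np.abs(lasso.coef_)})
    print(importance[importance["importance"] > 0].sort_values("importance", ascending=False))
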
70
108
 
71
109
  # 1.2 Ridge regression
72
- def features_ridge(X_train: pd.DataFrame, y_train: pd.Series, ridge_params: dict) -> np.ndarray:
110
+ def features_ridge(
111
+ x_train: pd.DataFrame, y_train: pd.Series, ridge_params: dict
112
+ ) -> np.ndarray:
73
113
  """
74
- Ridge Regression: A linear regression technique that applies L2 regularization, reducing coefficient
114
+ Ridge Regression: A linear regression technique that applies L2 regularization, reducing coefficient
75
115
  magnitudes to avoid overfitting, especially with multicollinearity among features.
76
116
  """
77
117
  from sklearn.linear_model import RidgeCV
118
+
78
119
  ridge = RidgeCV(**ridge_params)
79
- ridge.fit(X_train, y_train)
80
-
120
+ ridge.fit(x_train, y_train)
121
+
81
122
  # Get the coefficients
82
123
  coefficients = ridge.coef_
83
-
124
+
84
125
  # Create a DataFrame to hold feature importance
85
- importance_df = pd.DataFrame({
86
- "feature": X_train.columns,
87
- "importance": np.abs(coefficients)
88
- })
89
- return importance_df[importance_df["importance"] > 0].sort_values(by="importance", ascending=False)
126
+ importance_df = pd.DataFrame(
127
+ {"feature": x_train.columns, "importance": np.abs(coefficients)}
128
+ )
129
+ return importance_df[importance_df["importance"] > 0].sort_values(
130
+ by="importance", ascending=False
131
+ )
132
+
90
133
 
91
134
  # 1.3 Elastic Net(Enet)
92
- def features_enet(X_train: pd.DataFrame, y_train: pd.Series, enet_params: dict) -> np.ndarray:
135
+ def features_enet(
136
+ x_train: pd.DataFrame, y_train: pd.Series, enet_params: dict
137
+ ) -> np.ndarray:
93
138
  """
94
- Elastic Net (Enet): Combines L1 and L2 penalties (lasso and ridge) in a linear model, beneficial
139
+ Elastic Net (Enet): Combines L1 and L2 penalties (lasso and ridge) in a linear model, beneficial
95
140
  when features are highly correlated or for datasets with more features than samples.
96
141
  """
97
142
  from sklearn.linear_model import ElasticNetCV
143
+
98
144
  enet = ElasticNetCV(**enet_params)
99
- enet.fit(X_train, y_train)
145
+ enet.fit(x_train, y_train)
100
146
  # Get the coefficients
101
147
  coefficients = enet.coef_
102
148
  # Create a DataFrame to hold feature importance
103
- importance_df = pd.DataFrame({
104
- "feature": X_train.columns,
105
- "importance": np.abs(coefficients)
106
- })
107
- return importance_df[importance_df["importance"] > 0].sort_values(by="importance", ascending=False)
108
- # 1.4 Partial Least Squares Regression for Generalized Linear Models (plsRglm): Combines regression and
149
+ importance_df = pd.DataFrame(
150
+ {"feature": x_train.columns, "importance": np.abs(coefficients)}
151
+ )
152
+ return importance_df[importance_df["importance"] > 0].sort_values(
153
+ by="importance", ascending=False
154
+ )
155
+
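
A small illustrative sketch of the Elastic Net step (synthetic data): l1_ratio blends the two penalties, with 1.0 behaving like lasso and 0.0 like ridge.

    import numpy as np
    from sklearn.linear_model import ElasticNetCV

    rng = np.random.default_rng(0)
    X = rng.normal(size=(150, 10))
    y = X[:, 0] + 0.5 * X[:, 1] + rng.normal(scale=0.1, size=150)

    enet = ElasticNetCV(alphas=np.logspace(-4, 4, 100), cv=10, l1_ratio=0.5).fit(X, y)
    print(enet.alpha_, np.count_nonzero(enet.coef_))  # chosen alpha, surviving features
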
156
+
157
+ # 1.4 Partial Least Squares Regression for Generalized Linear Models (plsRglm): Combines regression and
109
158
  # feature reduction, useful for high-dimensional data with correlated features, such as genomics.
110
159
 
111
160
  #! 2.Generalized Linear Models and Extensions
112
- # 2.1
161
+ # 2.1
162
+
113
163
 
114
164
  #!3.Tree-Based and Ensemble Methods
115
165
  # 3.1 Random Forest(RF)
116
- def features_rf(X_train: pd.DataFrame, y_train: pd.Series, rf_params: dict) -> np.ndarray:
166
+ def features_rf(
167
+ x_train: pd.DataFrame, y_train: pd.Series, rf_params: dict
168
+ ) -> np.ndarray:
117
169
  """
118
- An ensemble of decision trees that combines predictions from multiple trees for classification or
170
+ An ensemble of decision trees that combines predictions from multiple trees for classification or
119
171
  regression, effective with high-dimensional, complex datasets.
120
172
  when to use:
121
173
  Handles high-dimensional data well.
@@ -125,36 +177,55 @@ def features_rf(X_train: pd.DataFrame, y_train: pd.Series, rf_params: dict) -> n
125
177
  Recommended Use: Great for classification problems, especially when you have many features (genes).
126
178
  """
127
179
  rf = RandomForestClassifier(**rf_params)
128
- rf.fit(X_train, y_train)
129
- return pd.DataFrame({"feature": X_train.columns, "importance": rf.feature_importances_}).sort_values(by="importance", ascending=False)
180
+ rf.fit(x_train, y_train)
181
+ return pd.DataFrame(
182
+ {"feature": x_train.columns, "importance": rf.feature_importances_}
183
+ ).sort_values(by="importance", ascending=False)
184
+
185
+
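
For reference (outside the diff), a minimal sketch of impurity-based ranking with a random forest; RandomForestClassifier exposes the scores through its feature_importances_ attribute. Data and names are synthetic.

    import pandas as pd
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier

    X, y = make_classification(n_samples=300, n_features=8, n_informative=3, random_state=1)
    X = pd.DataFrame(X, columns=[f"f{i}" for i in range(8)])

    rf = RandomForestClassifier(n_estimators=100, random_state=1).fit(X, y)
    print(pd.Series(rf.feature_importances_, index=X.columns)
          .sort_values(ascending=False).head())
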
130
186
  # 3.2 Gradient Boosting Trees
131
- def features_gradient_boosting(X_train: pd.DataFrame, y_train: pd.Series, gb_params: dict) -> pd.DataFrame:
187
+ def features_gradient_boosting(
188
+ x_train: pd.DataFrame, y_train: pd.Series, gb_params: dict
189
+ ) -> pd.DataFrame:
132
190
  """
133
- An ensemble of decision trees that combines predictions from multiple trees for classification or regression, effective with
191
+ An ensemble of decision trees that combines predictions from multiple trees for classification or regression, effective with
134
192
  high-dimensional, complex datasets.
135
193
  Gradient Boosting
136
194
  Strengths:
137
195
  High predictive accuracy and works well for both classification and regression.
138
196
  Can handle a mixture of numerical and categorical features.
139
- Recommended Use:
197
+ Recommended Use:
140
198
  Effective for complex relationships and when you need a powerful predictive model.
141
199
  Fit Gradient Boosting classifier and return sorted feature importances.
142
200
  Recommended Use: Effective for complex datasets with many features (genes).
143
201
  """
144
202
  gb = GradientBoostingClassifier(**gb_params)
145
- gb.fit(X_train, y_train)
146
- return pd.DataFrame({"feature": X_train.columns, "importance": gb.feature_importances_}).sort_values(by="importance", ascending=False)
203
+ gb.fit(x_train, y_train)
204
+ return pd.DataFrame(
205
+ {"feature": x_train.columns, "importance": gb.feature_importances_}
206
+ ).sort_values(by="importance", ascending=False)
207
+
208
+
147
209
  # 3.3 XGBoost
148
- def features_xgb(X_train: pd.DataFrame, y_train: pd.Series, xgb_params: dict) -> pd.DataFrame:
210
+ def features_xgb(
211
+ x_train: pd.DataFrame, y_train: pd.Series, xgb_params: dict
212
+ ) -> pd.DataFrame:
149
213
  """
150
214
  XGBoost: An advanced gradient boosting technique, faster and more efficient than GBM, with excellent predictive performance on structured data.
151
215
  """
152
216
  import xgboost as xgb
217
+
153
218
  xgb_model = xgb.XGBClassifier(**xgb_params)
154
- xgb_model.fit(X_train, y_train)
155
- return pd.DataFrame({"feature": X_train.columns, "importance": xgb_model.feature_importances_}).sort_values(by="importance", ascending=False)
219
+ xgb_model.fit(x_train, y_train)
220
+ return pd.DataFrame(
221
+ {"feature": x_train.columns, "importance": xgb_model.feature_importances_}
222
+ ).sort_values(by="importance", ascending=False)
223
+
224
+
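
A comparable sketch for the XGBoost step (illustrative only; synthetic data): XGBClassifier likewise exposes feature_importances_ after fitting.

    import numpy as np
    import pandas as pd
    import xgboost as xgb

    rng = np.random.default_rng(1)
    X = pd.DataFrame(rng.normal(size=(200, 6)), columns=[f"gene{i}" for i in range(6)])
    y = (X["gene0"] + rng.normal(scale=0.5, size=200) > 0).astype(int)

    model = xgb.XGBClassifier(n_estimators=100, eval_metric="logloss", random_state=1)
    model.fit(X, y)
    print(pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False))
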
156
225
  # 3.4.decision tree
157
- def features_decision_tree(X_train: pd.DataFrame, y_train: pd.Series, dt_params: dict) -> pd.DataFrame:
226
+ def features_decision_tree(
227
+ x_train: pd.DataFrame, y_train: pd.Series, dt_params: dict
228
+ ) -> pd.DataFrame:
158
229
  """
159
230
  A single decision tree classifier effective for identifying key decision boundaries in data.
160
231
  when to use:
@@ -162,58 +233,76 @@ def features_decision_tree(X_train: pd.DataFrame, y_train: pd.Series, dt_params:
162
233
  Provides feature importance scores for each feature, though it may overfit on small datasets.
163
234
  Efficient for low to medium-sized datasets, where interpretability of decisions is key.
164
235
  Recommended Use: Useful for interpretable feature importance analysis in smaller or balanced datasets.
165
-
236
+
166
237
  Fits DecisionTreeClassifier and returns sorted feature importances.
167
238
  """
168
239
  dt = DecisionTreeClassifier(**dt_params)
169
- dt.fit(X_train, y_train)
170
- return pd.DataFrame({"feature": X_train.columns, "importance": dt.feature_importances_}).sort_values(by="importance", ascending=False)
240
+ dt.fit(x_train, y_train)
241
+ return pd.DataFrame(
242
+ {"feature": x_train.columns, "importance": dt.feature_importances_}
243
+ ).sort_values(by="importance", ascending=False)
244
+
245
+
171
246
  # 3.5 bagging
172
- def features_bagging(X_train: pd.DataFrame, y_train: pd.Series, bagging_params: dict) -> pd.DataFrame:
247
+ def features_bagging(
248
+ x_train: pd.DataFrame, y_train: pd.Series, bagging_params: dict
249
+ ) -> pd.DataFrame:
173
250
  """
174
- A bagging ensemble of classifiers, often used with weak learners like decision trees, to reduce variance.
251
+ A bagging ensemble of models, often used with weak learners like decision trees, to reduce variance.
175
252
  when to use:
176
253
  Helps reduce overfitting, especially on high-variance models.
177
254
  Effective when the dataset has numerous features and may benefit from ensemble stability.
178
255
  Recommended Use: Beneficial for high-dimensional or noisy datasets needing ensemble stability.
179
-
256
+
180
257
  Fits BaggingClassifier and returns averaged feature importances from underlying estimators if available.
181
258
  """
182
259
  bagging = BaggingClassifier(**bagging_params)
183
- bagging.fit(X_train, y_train)
184
-
260
+ bagging.fit(x_train, y_train)
261
+
185
262
  # Calculate feature importance by averaging importances across estimators, if feature_importances_ is available.
186
263
  if hasattr(bagging.estimators_[0], "feature_importances_"):
187
- importances = np.mean([estimator.feature_importances_ for estimator in bagging.estimators_], axis=0)
188
- return pd.DataFrame({"feature": X_train.columns, "importance": importances}).sort_values(by="importance", ascending=False)
264
+ importances = np.mean(
265
+ [estimator.feature_importances_ for estimator in bagging.estimators_],
266
+ axis=0,
267
+ )
268
+ return pd.DataFrame(
269
+ {"feature": x_train.columns, "importance": importances}
270
+ ).sort_values(by="importance", ascending=False)
189
271
  else:
190
272
  # If the base estimator does not support feature importances, fallback to permutation importance.
191
- importances = permutation_importance(bagging, X_train, y_train, n_repeats=30, random_state=1, scoring="accuracy")
192
- return pd.DataFrame({"feature": X_train.columns, "importance": importances.importances_mean}).sort_values(by="importance", ascending=False)
273
+ importances = permutation_importance(
274
+ bagging, x_train, y_train, n_repeats=30, random_state=1, scoring="accuracy"
275
+ )
276
+ return pd.DataFrame(
277
+ {"feature": x_train.columns, "importance": importances.importances_mean}
278
+ ).sort_values(by="importance", ascending=False)
279
+
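
A rough sketch of the averaging idea used here (synthetic data): each fitted base estimator of a BaggingClassifier is reachable via .estimators_, and with the default decision-tree base learner their importances can simply be averaged.

    import numpy as np
    import pandas as pd
    from sklearn.datasets import make_classification
    from sklearn.ensemble import BaggingClassifier

    X, y = make_classification(n_samples=200, n_features=5, random_state=0)
    X = pd.DataFrame(X, columns=[f"f{i}" for i in range(5)])

    bagging = BaggingClassifier(n_estimators=50, random_state=0).fit(X, y)
    importances = np.mean([est.feature_importances_ for est in bagging.estimators_], axis=0)
    print(pd.Series(importances, index=X.columns).sort_values(ascending=False))
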
193
280
 
194
281
  #! 4.Support Vector Machines
195
- def features_svm(X_train: pd.DataFrame, y_train: pd.Series, rfe_params: dict) -> np.ndarray:
282
+ def features_svm(
283
+ x_train: pd.DataFrame, y_train: pd.Series, rfe_params: dict
284
+ ) -> np.ndarray:
196
285
  """
197
286
  Suitable for classification tasks where the number of features is much larger than the number of samples.
198
287
  1. Effective in high-dimensional spaces and with clear margin of separation.
199
288
  2. Works well for both linear and non-linear classification (using kernel functions).
200
- Select features using RFE with SVM.When combined with SVM, RFE selects features that are most critical for the decision boundary,
289
+ Select features using RFE with SVM.When combined with SVM, RFE selects features that are most critical for the decision boundary,
201
290
  helping reduce the dataset to a more manageable size without losing much predictive power.
202
- SVM (Support Vector Machines),supports various kernels (linear, rbf, poly, and sigmoid), is good at handling high-dimensional
291
+ SVM (Support Vector Machines),supports various kernels (linear, rbf, poly, and sigmoid), is good at handling high-dimensional
203
292
  data and finding an optimal decision boundary between classes, especially when using the right kernel.
204
293
  kernel: ["linear", "rbf", "poly", "sigmoid"]
205
- 'linear': simplest kernel that attempts to separate data by drawing a straight line (or hyperplane) between classes. It is effective
294
+ 'linear': simplest kernel that attempts to separate data by drawing a straight line (or hyperplane) between classes. It is effective
206
295
  when the data is linearly separable, meaning the classes can be well divided by a straight boundary.
207
296
  Advantages:
208
297
  - Computationally efficient for large datasets.
209
- - Works well when the number of features is high, which is common in genomic data where you may have thousands of genes
298
+ - Works well when the number of features is high, which is common in genomic data where you may have thousands of genes
210
299
  as features.
211
- 'rbf': a nonlinear kernel that maps the input data into a higher-dimensional space to find a decision boundary. It works well for
300
+ 'rbf': a nonlinear kernel that maps the input data into a higher-dimensional space to find a decision boundary. It works well for
212
301
  data that is not linearly separable in its original space.
213
- Advantages:
302
+ Advantages:
214
303
  - Handles nonlinear relationships between features and classes
215
304
  - Often better than a linear kernel when there is no clear linear decision boundary in the data.
216
- 'poly': Polynomial Kernel: computes similarity between data points based on polynomial functions of the input features. It can model
305
+ 'poly': Polynomial Kernel: computes similarity between data points based on polynomial functions of the input features. It can model
217
306
  interactions between features to a certain degree, depending on the polynomial degree chosen.
218
307
  Advantages:
219
308
  - Allows modeling of feature interactions.
@@ -221,58 +310,80 @@ def features_svm(X_train: pd.DataFrame, y_train: pd.Series, rfe_params: dict) ->
221
310
  'sigmoid': similar to the activation function in neural networks, and it works well when the data follows an S-shaped decision boundary.
222
311
  Advantages:
223
312
  - Can approximate the behavior of neural networks.
224
- - Use case: It’s not as widely used as the RBF or linear kernel but can be explored when there is some evidence of non-linear
313
+ - Use case: It’s not as widely used as the RBF or linear kernel but can be explored when there is some evidence of non-linear
225
314
  S-shaped relationships.
226
315
  """
227
316
  # SVM (Support Vector Machines)
228
- svc = SVC(kernel=rfe_params["kernel"]) # ["linear", "rbf", "poly", "sigmoid"]
317
+ svc = SVC(kernel=rfe_params["kernel"]) # ["linear", "rbf", "poly", "sigmoid"]
229
318
  # RFE(Recursive Feature Elimination)
230
319
  selector = RFE(svc, n_features_to_select=rfe_params["n_features_to_select"])
231
- selector.fit(X_train, y_train)
232
- return X_train.columns[selector.support_]
320
+ selector.fit(x_train, y_train)
321
+ return x_train.columns[selector.support_]
322
+
323
+
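
A minimal sketch of the RFE + SVM combination (synthetic data, made-up names): the SVC is refit repeatedly while the weakest features are dropped until n_features_to_select remain. Note that RFE requires coef_ or feature_importances_ from its estimator, so in practice only the linear kernel fits this pattern.

    import pandas as pd
    from sklearn.datasets import make_classification
    from sklearn.feature_selection import RFE
    from sklearn.svm import SVC

    X, y = make_classification(n_samples=150, n_features=12, n_informative=4, random_state=1)
    X = pd.DataFrame(X, columns=[f"gene{i}" for i in range(12)])

    selector = RFE(SVC(kernel="linear"), n_features_to_select=4).fit(X, y)
    print(list(X.columns[selector.support_]))  # the four retained features
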
233
324
  #! 5.Bayesian and Probabilistic Methods
234
- def features_naive_bayes(X_train: pd.DataFrame, y_train: pd.Series) -> list:
325
+ def features_naive_bayes(x_train: pd.DataFrame, y_train: pd.Series) -> list:
235
326
  """
236
- Naive Bayes: A probabilistic classifier based on Bayes' theorem, assuming independence between features, simple and fast, especially
327
+ Naive Bayes: A probabilistic classifier based on Bayes' theorem, assuming independence between features, simple and fast, especially
237
328
  effective for text classification and other high-dimensional data.
238
329
  """
239
330
  from sklearn.naive_bayes import GaussianNB
331
+
240
332
  nb = GaussianNB()
241
- nb.fit(X_train, y_train)
242
- probabilities = nb.predict_proba(X_train)
243
- return X_train.columns[np.argsort(probabilities.max(axis=1))[:X_train.shape[1] // 2]]
333
+ nb.fit(x_train, y_train)
334
+ probabilities = nb.predict_proba(x_train)
335
+ # Limit the number of features safely, choosing the lesser of half the features or all columns
336
+ n_features = min(x_train.shape[1] // 2, len(x_train.columns))
337
+
338
+ # Sort probabilities, then map to valid column indices
339
+ sorted_indices = np.argsort(probabilities.max(axis=1))[:n_features]
340
+
341
+ # Ensure indices are within the column bounds of x_train
342
+ valid_indices = sorted_indices[sorted_indices < len(x_train.columns)]
343
+
344
+ return x_train.columns[valid_indices]
345
+
346
+
244
347
  #! 6.Linear Discriminant Analysis (LDA)
245
- def features_lda(X_train: pd.DataFrame, y_train: pd.Series) -> list:
348
+ def features_lda(x_train: pd.DataFrame, y_train: pd.Series) -> list:
246
349
  """
247
- Linear Discriminant Analysis (LDA): Projects data onto a lower-dimensional space to maximize class separability, often used as a dimensionality
350
+ Linear Discriminant Analysis (LDA): Projects data onto a lower-dimensional space to maximize class separability, often used as a dimensionality
248
351
  reduction technique before classification on high-dimensional data.
249
352
  """
250
353
  from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
354
+
251
355
  lda = LinearDiscriminantAnalysis()
252
- lda.fit(X_train, y_train)
356
+ lda.fit(x_train, y_train)
253
357
  coef = lda.coef_.flatten()
254
358
  # Create a DataFrame to hold feature importance
255
- importance_df = pd.DataFrame({
256
- "feature": X_train.columns,
257
- "importance": np.abs(coef)
258
- })
259
-
260
- return importance_df[importance_df["importance"] > 0].sort_values(by="importance", ascending=False)
359
+ importance_df = pd.DataFrame(
360
+ {"feature": x_train.columns, "importance": np.abs(coef)}
361
+ )
261
362
 
262
- def features_adaboost(X_train: pd.DataFrame, y_train: pd.Series, adaboost_params: dict) -> pd.DataFrame:
363
+ return importance_df[importance_df["importance"] > 0].sort_values(
364
+ by="importance", ascending=False
365
+ )
366
+
367
+
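
A brief illustrative sketch of the LDA ranking (synthetic data): for a binary problem lda.coef_ holds one weight per feature, and the absolute values serve as the importance proxy used above.

    import numpy as np
    import pandas as pd
    from sklearn.datasets import make_classification
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

    X, y = make_classification(n_samples=200, n_features=6, n_informative=3, random_state=2)
    X = pd.DataFrame(X, columns=[f"f{i}" for i in range(6)])

    lda = LinearDiscriminantAnalysis().fit(X, y)
    print(pd.Series(np.abs(lda.coef_.flatten()), index=X.columns).sort_values(ascending=False))
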
368
+ def features_adaboost(
369
+ x_train: pd.DataFrame, y_train: pd.Series, adaboost_params: dict
370
+ ) -> pd.DataFrame:
263
371
  """
264
372
  AdaBoost
265
373
  Strengths:
266
374
  Combines multiple weak learners to create a strong classifier.
267
375
  Focuses on examples that are hard to classify, improving overall performance.
268
- Recommended Use:
269
- Can be effective for boosting weak classifiers in a genomics context.
376
+ Recommended Use:
377
+ Can be effective for boosting weak models in a genomics context.
270
378
  Fit AdaBoost classifier and return sorted feature importances.
271
379
  Recommended Use: Great for classification problems with a large number of features (genes).
272
380
  """
273
381
  ada = AdaBoostClassifier(**adaboost_params)
274
- ada.fit(X_train, y_train)
275
- return pd.DataFrame({"feature": X_train.columns, "importance": ada.feature_importances_}).sort_values(by="importance", ascending=False)
382
+ ada.fit(x_train, y_train)
383
+ return pd.DataFrame(
384
+ {"feature": x_train.columns, "importance": ada.feature_importances_}
385
+ ).sort_values(by="importance", ascending=False)
386
+
276
387
 
277
388
  import torch
278
389
  import torch.nn as nn
@@ -280,32 +391,30 @@ import torch.optim as optim
280
391
  from torch.utils.data import DataLoader, TensorDataset
281
392
  from skorch import NeuralNetClassifier # sklearn compatible
282
393
 
394
+
283
395
  class DNNClassifier(nn.Module):
284
396
  def __init__(self, input_dim, hidden_dim=128, output_dim=2, dropout_rate=0.5):
285
397
  super(DNNClassifier, self).__init__()
286
-
398
+
287
399
  self.hidden_layer1 = nn.Sequential(
288
400
  nn.Linear(input_dim, hidden_dim),
289
401
  nn.ReLU(),
290
402
  nn.Dropout(dropout_rate),
291
403
  nn.Linear(hidden_dim, hidden_dim),
292
- nn.ReLU()
404
+ nn.ReLU(),
293
405
  )
294
-
406
+
295
407
  self.hidden_layer2 = nn.Sequential(
296
- nn.Linear(hidden_dim, hidden_dim),
297
- nn.ReLU(),
298
- nn.Dropout(dropout_rate)
408
+ nn.Linear(hidden_dim, hidden_dim), nn.ReLU(), nn.Dropout(dropout_rate)
299
409
  )
300
-
410
+
301
411
  # Adding a residual connection between hidden layers
302
412
  self.residual = nn.Linear(input_dim, hidden_dim)
303
-
413
+
304
414
  self.output_layer = nn.Sequential(
305
- nn.Linear(hidden_dim, output_dim),
306
- nn.Softmax(dim=1)
415
+ nn.Linear(hidden_dim, output_dim), nn.Softmax(dim=1)
307
416
  )
308
-
417
+
309
418
  def forward(self, x):
310
419
  residual = self.residual(x)
311
420
  x = self.hidden_layer1(x)
@@ -314,64 +423,77 @@ class DNNClassifier(nn.Module):
314
423
  x = self.output_layer(x)
315
424
  return x
316
425
 
317
- def validate_classifier(clf, X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.DataFrame, y_test: pd.Series, metrics: list=["accuracy", "precision", "recall", "f1", "roc_auc"] , cv_folds: int=5) -> dict:
426
+
427
+ def validate_classifier(
428
+ clf,
429
+ x_train: pd.DataFrame,
430
+ y_train: pd.Series,
431
+ x_test: pd.DataFrame,
432
+ y_test: pd.Series,
433
+ metrics: list = ["accuracy", "precision", "recall", "f1", "roc_auc"],
434
+ cv_folds: int = 5,
435
+ ) -> dict:
318
436
  """
319
437
  Perform cross-validation for a given classifier and return average scores for specified metrics on training data.
320
438
  Then fit the best model on the full training data and evaluate it on the test set.
321
-
439
+
322
440
  Parameters:
323
441
  - clf: The classifier to be validated.
324
- - X_train: Training features.
442
+ - x_train: Training features.
325
443
  - y_train: Training labels.
326
- - X_test: Test features.
444
+ - x_test: Test features.
327
445
  - y_test: Test labels.
328
446
  - metrics: List of metrics to evaluate (e.g., ['accuracy', 'roc_auc']).
329
447
  - cv_folds: Number of cross-validation folds.
330
-
448
+
331
449
  Returns:
332
450
  - results: Dictionary containing average cv_train_scores and cv_test_scores.
333
451
  """
334
452
  cv_train_scores = {metric: [] for metric in metrics}
335
453
  skf = StratifiedKFold(n_splits=cv_folds)
336
- # Perform cross-validation
454
+ # Perform cross-validation
337
455
  for metric in metrics:
338
456
  try:
339
457
  if metric == "roc_auc" and len(set(y_train)) == 2:
340
- scores = cross_val_score(clf, X_train, y_train, cv=skf, scoring="roc_auc")
341
- cv_train_scores[metric] = np.nanmean(scores) if not np.isnan(scores).all() else float('nan')
458
+ scores = cross_val_score(
459
+ clf, x_train, y_train, cv=skf, scoring="roc_auc"
460
+ )
461
+ cv_train_scores[metric] = (
462
+ np.nanmean(scores) if not np.isnan(scores).all() else float("nan")
463
+ )
342
464
  else:
343
- score = cross_val_score(clf, X_train, y_train, cv=skf, scoring=metric)
465
+ score = cross_val_score(clf, x_train, y_train, cv=skf, scoring=metric)
344
466
  cv_train_scores[metric] = score.mean()
345
467
  except Exception as e:
346
- cv_train_scores[metric] = float('nan')
347
- clf.fit(X_train, y_train)
348
-
468
+ cv_train_scores[metric] = float("nan")
469
+ clf.fit(x_train, y_train)
470
+
349
471
  # Evaluate on the test set
350
472
  cv_test_scores = {}
351
473
  for metric in metrics:
352
474
  if metric == "roc_auc" and len(set(y_test)) == 2:
353
475
  try:
354
- y_prob=clf.predict_proba(X_test)[:, 1]
355
- cv_test_scores[metric] = roc_auc_score(y_test,y_prob)
476
+ y_prob = clf.predict_proba(x_test)[:, 1]
477
+ cv_test_scores[metric] = roc_auc_score(y_test, y_prob)
356
478
  except AttributeError:
357
- cv_test_scores[metric]=float('nan')
479
+ cv_test_scores[metric] = float("nan")
358
480
  else:
359
- score_func = globals().get(f'{metric}_score') # Fetching the appropriate scoring function
481
+ score_func = globals().get(
482
+ f"{metric}_score"
483
+ ) # Fetching the appropriate scoring function
360
484
  if score_func:
361
485
  try:
362
- y_pred = clf.predict(X_test)
486
+ y_pred = clf.predict(x_test)
363
487
  cv_test_scores[metric] = score_func(y_test, y_pred)
364
488
  except Exception as e:
365
- cv_test_scores[metric] = float('nan')
489
+ cv_test_scores[metric] = float("nan")
366
490
 
367
491
  # Combine results
368
- results = {
369
- 'cv_train_scores': cv_train_scores,
370
- 'cv_test_scores': cv_test_scores
371
- }
492
+ results = {"cv_train_scores": cv_train_scores, "cv_test_scores": cv_test_scores}
372
493
  return results
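
A hedged usage sketch of validate_classifier (synthetic data; any estimator with fit/predict, plus predict_proba for roc_auc, should work):

    import pandas as pd
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression

    X, y = make_classification(n_samples=300, n_features=10, random_state=1)
    X, y = pd.DataFrame(X), pd.Series(y)
    x_tr, x_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=1)

    scores = validate_classifier(LogisticRegression(max_iter=500),
                                 x_tr, y_tr, x_te, y_te,
                                 metrics=["accuracy", "f1", "roc_auc"], cv_folds=5)
    print(scores["cv_train_scores"])
    print(scores["cv_test_scores"])
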
373
494
 
374
- def get_classifiers(
495
+
496
+ def get_models(
375
497
  random_state=1,
376
498
  cls=[
377
499
  "lasso",
@@ -383,25 +505,36 @@ def get_classifiers(
383
505
  "Support Vector Machine(svm)",
384
506
  "naive bayes",
385
507
  "Linear Discriminant Analysis (lda)",
386
- "adaboost","DecisionTree","KNeighbors","Bagging"
508
+ "adaboost",
509
+ "DecisionTree",
510
+ "KNeighbors",
511
+ "Bagging",
387
512
  ],
388
513
  ):
389
514
  from sklearn.ensemble import (
390
515
  RandomForestClassifier,
391
516
  GradientBoostingClassifier,
392
517
  AdaBoostClassifier,
393
- BaggingClassifier
518
+ BaggingClassifier,
394
519
  )
395
520
  from sklearn.svm import SVC
396
- from sklearn.linear_model import LogisticRegression, Lasso, RidgeClassifierCV, ElasticNet
521
+ from sklearn.linear_model import (
522
+ LogisticRegression,
523
+ Lasso,
524
+ RidgeClassifierCV,
525
+ ElasticNet,
526
+ )
397
527
  from sklearn.naive_bayes import GaussianNB
398
528
  from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
399
529
  import xgboost as xgb
400
530
  from sklearn.tree import DecisionTreeClassifier
401
531
  from sklearn.neighbors import KNeighborsClassifier
532
+
402
533
  res_cls = {}
403
- classifiers_all = {
404
- "Lasso": LogisticRegression(penalty='l1', solver='saga', random_state=random_state),
534
+ model_all = {
535
+ "Lasso": LogisticRegression(
536
+ penalty="l1", solver="saga", random_state=random_state
537
+ ),
405
538
  "Ridge": RidgeClassifierCV(),
406
539
  "Elastic Net (Enet)": ElasticNet(random_state=random_state),
407
540
  "Gradient Boosting": GradientBoostingClassifier(random_state=random_state),
@@ -411,23 +544,25 @@ def get_classifiers(
411
544
  "Naive Bayes": GaussianNB(),
412
545
  "Linear Discriminant Analysis (LDA)": LinearDiscriminantAnalysis(),
413
546
  "AdaBoost": AdaBoostClassifier(random_state=random_state, algorithm="SAMME"),
414
- "DecisionTree":DecisionTreeClassifier(),
547
+ "DecisionTree": DecisionTreeClassifier(),
415
548
  "KNeighbors": KNeighborsClassifier(n_neighbors=5),
416
549
  "Bagging": BaggingClassifier(),
417
550
  }
418
- print("Using default classifiers:")
551
+ print("Using default models:")
419
552
  for cls_name in cls:
420
- cls_name = ips.strcmp(cls_name, list(classifiers_all.keys()))[0]
421
- res_cls[cls_name] = classifiers_all[cls_name]
553
+ cls_name = ips.strcmp(cls_name, list(model_all.keys()))[0]
554
+ res_cls[cls_name] = model_all[cls_name]
422
555
  print(f"- {cls_name}")
423
556
  return res_cls
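
Usage sketch: the names passed in cls are resolved against the canonical keys via ips.strcmp, so the entries below should map onto Lasso, Random Forest and SVM (the exact matching behaviour depends on that helper).

    models = get_models(random_state=1,
                        cls=["lasso", "Random forest (rf)", "Support Vector Machine(svm)"])
    for name, model in models.items():
        print(name, type(model).__name__)
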
424
557
 
558
+
425
559
  def get_features(
426
- X: pd.DataFrame,
427
- y: pd.Series,
560
+ X: Union[pd.DataFrame, np.ndarray], # n_samples X n_features
561
+ y: Union[pd.Series, np.ndarray, list], # n_samples X n_features
428
562
  test_size: float = 0.2,
429
563
  random_state: int = 1,
430
564
  n_features: int = 10,
565
+ fill_missing=True,
431
566
  rf_params: Optional[Dict] = None,
432
567
  rfe_params: Optional[Dict] = None,
433
568
  lasso_params: Optional[Dict] = None,
@@ -439,169 +574,338 @@ def get_features(
439
574
  dt_params: Optional[Dict] = None,
440
575
  bagging_params: Optional[Dict] = None,
441
576
  knn_params: Optional[Dict] = None,
442
- cls: list=[
443
- "lasso",
444
- "ridge",
445
- "Elastic Net(Enet)",
446
- "gradient Boosting",
447
- "Random forest (rf)",
448
- "XGBoost (xgb)",
449
- "Support Vector Machine(svm)",
450
- "naive bayes",
451
- "Linear Discriminant Analysis (lda)",
452
- "adaboost","DecisionTree","KNeighbors","Bagging"
453
- ],
577
+ cls: list = [
578
+ "lasso","ridge","Elastic Net(Enet)","gradient Boosting","Random forest (rf)","XGBoost (xgb)","Support Vector Machine(svm)",
579
+ "naive bayes","Linear Discriminant Analysis (lda)","adaboost","DecisionTree","KNeighbors","Bagging"],
454
580
  metrics: Optional[List[str]] = None,
455
581
  cv_folds: int = 5,
456
- strict:bool=False,
457
- n_shared:int=2, # a feature counts as a common gene once at least two methods select it
582
+ strict: bool = False,
583
+ n_shared: int = 2, # a feature counts as a common gene once at least two methods select it
458
584
  use_selected_features: bool = True,
459
- ) -> dict:
585
+ plot_: bool = True,
586
+ dir_save:str="./") -> dict:
460
587
  """
461
- Master function to perform feature selection and validate classifiers.
588
+ Master function to perform feature selection and validate models.
462
589
  """
590
+ from sklearn.compose import ColumnTransformer
591
+ from sklearn.preprocessing import StandardScaler, OneHotEncoder
592
+
593
+ # Ensure X and y are DataFrames/Series for consistency
594
+ if isinstance(X, np.ndarray):
595
+ X = pd.DataFrame(X)
596
+ if isinstance(y, (np.ndarray, list)):
597
+ y = pd.Series(y)
598
+
599
+ # fill na
600
+ if fill_missing:
601
+ ips.df_fillna(data=X,method='knn',inplace=True,axis=0)
602
+ if isinstance(y, str) and y in X.columns:
603
+ y_col_name=y
604
+ y=X[y]
605
+ y=ips.df_encoder(pd.DataFrame(y),method='dummy')
606
+ X = X.drop(y_col_name,axis=1)
607
+ else:
608
+ y=ips.df_encoder(pd.DataFrame(y),method='dummy').values.ravel()
609
+ y = y.loc[X.index] # Align y with X after dropping rows with missing values in X
610
+ y = y.ravel() if isinstance(y, np.ndarray) else y.values.ravel()
611
+
612
+ if X.shape[0] != len(y):
613
+ raise ValueError("X and y must have the same number of samples (rows).")
614
+
615
+ # #! # Check for non-numeric columns in X and apply one-hot encoding if needed
616
+ # Check if any column in X is non-numeric
617
+ if any(not np.issubdtype(dtype, np.number) for dtype in X.dtypes):
618
+ X = pd.get_dummies(X, drop_first=True)
619
+ print(X.shape)
620
+
621
+ # #!alternative: # Identify categorical and numerical columns
622
+ # categorical_cols = X.select_dtypes(include=["object", "category"]).columns
623
+ # numerical_cols = X.select_dtypes(include=["number"]).columns
624
+
625
+ # # Define preprocessing pipeline
626
+ # preprocessor = ColumnTransformer(
627
+ # transformers=[
628
+ # ("num", StandardScaler(), numerical_cols),
629
+ # ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), categorical_cols),
630
+ # ]
631
+ # )
632
+ # # Preprocess the data
633
+ # X = preprocessor.fit_transform(X)
634
+
463
635
  # Split data into training and test sets
464
- X_train, X_test, y_train, y_test = train_test_split(
636
+ x_train, x_test, y_train, y_test = train_test_split(
465
637
  X, y, test_size=test_size, random_state=random_state
466
638
  )
467
639
  # Standardize features
468
640
  scaler = StandardScaler()
469
- X_train_scaled = scaler.fit_transform(X_train)
470
- X_test_scaled = scaler.transform(X_test)
471
-
641
+ x_train_scaled = scaler.fit_transform(x_train)
642
+ x_test_scaled = scaler.transform(x_test)
643
+
472
644
  # Convert back to DataFrame for consistency
473
- X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns)
474
- X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns)
645
+ x_train = pd.DataFrame(x_train_scaled, columns=x_train.columns)
646
+ x_test = pd.DataFrame(x_test_scaled, columns=x_test.columns)
475
647
 
476
648
  rf_defaults = {"n_estimators": 100, "random_state": random_state}
477
649
  rfe_defaults = {"kernel": "linear", "n_features_to_select": n_features}
478
650
  lasso_defaults = {"alphas": np.logspace(-4, 4, 100), "cv": 10}
479
651
  ridge_defaults = {"alphas": np.logspace(-4, 4, 100), "cv": 10}
480
652
  enet_defaults = {"alphas": np.logspace(-4, 4, 100), "cv": 10}
481
- xgb_defaults = {"n_estimators": 100, "use_label_encoder": False, "eval_metric": "logloss", "random_state": random_state}
653
+ xgb_defaults = {
654
+ "n_estimators": 100,
655
+ "use_label_encoder": False,
656
+ "eval_metric": "logloss",
657
+ "random_state": random_state,
658
+ }
482
659
  gb_defaults = {"n_estimators": 100, "random_state": random_state}
483
660
  adaboost_defaults = {"n_estimators": 50, "random_state": random_state}
484
661
  dt_defaults = {"max_depth": None, "random_state": random_state}
485
662
  bagging_defaults = {"n_estimators": 50, "random_state": random_state}
486
663
  knn_defaults = {"n_neighbors": 5}
487
664
  rf_params, rfe_params = rf_params or rf_defaults, rfe_params or rfe_defaults
488
- lasso_params, ridge_params = lasso_params or lasso_defaults, ridge_params or ridge_defaults
665
+ lasso_params, ridge_params = (
666
+ lasso_params or lasso_defaults,
667
+ ridge_params or ridge_defaults,
668
+ )
489
669
  enet_params, xgb_params = enet_params or enet_defaults, xgb_params or xgb_defaults
490
- gb_params, adaboost_params = gb_params or gb_defaults, adaboost_params or adaboost_defaults
670
+ gb_params, adaboost_params = (
671
+ gb_params or gb_defaults,
672
+ adaboost_params or adaboost_defaults,
673
+ )
491
674
  dt_params = dt_params or dt_defaults
492
675
  bagging_params = bagging_params or bagging_defaults
493
676
  knn_params = knn_params or knn_defaults
494
677
 
495
- cls_ = ["lasso",'ridge','Elastic Net(Enet)',"Gradient Boosting","Random Forest (rf)",
496
- 'XGBoost (xgb)','Support Vector Machine(svm)','Naive Bayes','Linear Discriminant Analysis (lda)','adaboost']
497
- cls=[ips.strcmp(i,cls_)[0] for i in cls]
678
+ cls_ = [
679
+ "lasso",
680
+ "ridge",
681
+ "Elastic Net(Enet)",
682
+ "Gradient Boosting",
683
+ "Random Forest (rf)",
684
+ "XGBoost (xgb)",
685
+ "Support Vector Machine(svm)",
686
+ "Naive Bayes",
687
+ "Linear Discriminant Analysis (lda)",
688
+ "adaboost",
689
+ ]
690
+ cls = [ips.strcmp(i, cls_)[0] for i in cls]
498
691
 
499
692
  # Lasso Feature Selection
500
- lasso_importances = features_lasso(X_train, y_train, lasso_params) if 'lasso'in cls else pd.DataFrame()
501
- lasso_selected_features= lasso_importances.head(n_features)["feature"].values if 'lasso'in cls else []
502
- # Ridge
503
- ridge_importances=features_ridge(X_train, y_train,ridge_params) if 'ridge'in cls else pd.DataFrame()
504
- selected_ridge_features= ridge_importances.head(n_features)["feature"].values if 'ridge'in cls else []
693
+ lasso_importances = (
694
+ features_lasso(x_train, y_train, lasso_params)
695
+ if "lasso" in cls
696
+ else pd.DataFrame()
697
+ )
698
+ lasso_selected_features = (
699
+ lasso_importances.head(n_features)["feature"].values if "lasso" in cls else []
700
+ )
701
+ # Ridge
702
+ ridge_importances = (
703
+ features_ridge(x_train, y_train, ridge_params)
704
+ if "ridge" in cls
705
+ else pd.DataFrame()
706
+ )
707
+ selected_ridge_features = (
708
+ ridge_importances.head(n_features)["feature"].values if "ridge" in cls else []
709
+ )
505
710
  # Elastic Net
506
- enet_importances=features_enet(X_train, y_train,enet_params) if 'Enet'in cls else pd.DataFrame()
507
- selected_enet_features= enet_importances.head(n_features)["feature"].values if 'Enet'in cls else []
508
- # Random Forest Feature Importance
509
- rf_importances = features_rf(X_train, y_train, rf_params) if 'Random Forest'in cls else pd.DataFrame()
510
- top_rf_features = rf_importances.head(n_features)["feature"].values if 'Random Forest'in cls else []
511
- # Gradient Boosting Feature Importance
512
- gb_importances = features_gradient_boosting(X_train, y_train, gb_params) if 'Gradient Boosting'in cls else pd.DataFrame()
513
- top_gb_features = gb_importances.head(n_features)["feature"].values if 'Gradient Boosting'in cls else []
711
+ enet_importances = (
712
+ features_enet(x_train, y_train, enet_params)
713
+ if "Enet" in cls
714
+ else pd.DataFrame()
715
+ )
716
+ selected_enet_features = (
717
+ enet_importances.head(n_features)["feature"].values if "Enet" in cls else []
718
+ )
719
+ # Random Forest Feature Importance
720
+ rf_importances = (
721
+ features_rf(x_train, y_train, rf_params)
722
+ if "Random Forest" in cls
723
+ else pd.DataFrame()
724
+ )
725
+ top_rf_features = (
726
+ rf_importances.head(n_features)["feature"].values
727
+ if "Random Forest" in cls
728
+ else []
729
+ )
730
+ # Gradient Boosting Feature Importance
731
+ gb_importances = (
732
+ features_gradient_boosting(x_train, y_train, gb_params)
733
+ if "Gradient Boosting" in cls
734
+ else pd.DataFrame()
735
+ )
736
+ top_gb_features = (
737
+ gb_importances.head(n_features)["feature"].values
738
+ if "Gradient Boosting" in cls
739
+ else []
740
+ )
514
741
  # xgb
515
- xgb_importances = features_xgb(X_train, y_train,xgb_params) if 'xgb'in cls else pd.DataFrame()
516
- top_xgb_features = xgb_importances.head(n_features)["feature"].values if 'xgb'in cls else []
517
-
518
- # SVM with RFE
519
- selected_svm_features = features_svm(X_train, y_train, rfe_params) if 'svm'in cls else []
742
+ xgb_importances = (
743
+ features_xgb(x_train, y_train, xgb_params) if "xgb" in cls else pd.DataFrame()
744
+ )
745
+ top_xgb_features = (
746
+ xgb_importances.head(n_features)["feature"].values if "xgb" in cls else []
747
+ )
748
+
749
+ # SVM with RFE
750
+ selected_svm_features = (
751
+ features_svm(x_train, y_train, rfe_params) if "svm" in cls else []
752
+ )
520
753
  # Naive Bayes
521
- selected_naive_bayes_features=features_naive_bayes(X_train, y_train) if 'Naive Bayes'in cls else []
754
+ selected_naive_bayes_features = (
755
+ features_naive_bayes(x_train, y_train) if "Naive Bayes" in cls else []
756
+ )
522
757
  # lda: linear discriminant analysis
523
- lda_importances=features_lda(X_train, y_train) if 'lda'in cls else pd.DataFrame()
524
- selected_lda_features= lda_importances.head(n_features)["feature"].values if 'lda'in cls else []
525
- # AdaBoost Feature Importance
526
- adaboost_importances = features_adaboost(X_train, y_train, adaboost_params) if 'AdaBoost'in cls else pd.DataFrame()
527
- top_adaboost_features = adaboost_importances.head(n_features)["feature"].values if 'AdaBoost'in cls else []
758
+ lda_importances = features_lda(x_train, y_train) if "lda" in cls else pd.DataFrame()
759
+ selected_lda_features = (
760
+ lda_importances.head(n_features)["feature"].values if "lda" in cls else []
761
+ )
762
+ # AdaBoost Feature Importance
763
+ adaboost_importances = (
764
+ features_adaboost(x_train, y_train, adaboost_params)
765
+ if "AdaBoost" in cls
766
+ else pd.DataFrame()
767
+ )
768
+ top_adaboost_features = (
769
+ adaboost_importances.head(n_features)["feature"].values
770
+ if "AdaBoost" in cls
771
+ else []
772
+ )
528
773
  # Decision Tree Feature Importance
529
- dt_importances = features_decision_tree(X_train, y_train, dt_params) if 'Decision Tree' in cls else pd.DataFrame()
530
- top_dt_features = dt_importances.head(n_features)["feature"].values if 'Decision Tree' in cls else []
774
+ dt_importances = (
775
+ features_decision_tree(x_train, y_train, dt_params)
776
+ if "Decision Tree" in cls
777
+ else pd.DataFrame()
778
+ )
779
+ top_dt_features = (
780
+ dt_importances.head(n_features)["feature"].values
781
+ if "Decision Tree" in cls
782
+ else []
783
+ )
531
784
  # Bagging Feature Importance
532
- bagging_importances = features_bagging(X_train, y_train, bagging_params) if 'Bagging' in cls else pd.DataFrame()
533
- top_bagging_features = bagging_importances.head(n_features)["feature"].values if 'Bagging' in cls else []
785
+ bagging_importances = (
786
+ features_bagging(x_train, y_train, bagging_params)
787
+ if "Bagging" in cls
788
+ else pd.DataFrame()
789
+ )
790
+ top_bagging_features = (
791
+ bagging_importances.head(n_features)["feature"].values
792
+ if "Bagging" in cls
793
+ else []
794
+ )
534
795
  # KNN Feature Importance via Permutation
535
- knn_importances = features_knn(X_train, y_train, knn_params) if 'KNN' in cls else pd.DataFrame()
536
- top_knn_features = knn_importances.head(n_features)["feature"].values if 'KNN' in cls else []
796
+ knn_importances = (
797
+ features_knn(x_train, y_train, knn_params) if "KNN" in cls else pd.DataFrame()
798
+ )
799
+ top_knn_features = (
800
+ knn_importances.head(n_features)["feature"].values if "KNN" in cls else []
801
+ )
537
802
 
538
803
  #! Find common features
539
- common_features = ips.shared(lasso_selected_features,selected_ridge_features, selected_enet_features,
540
- top_rf_features,top_gb_features,top_xgb_features,
541
- selected_svm_features, selected_naive_bayes_features,selected_lda_features,
542
- top_adaboost_features,top_dt_features, top_bagging_features, top_knn_features,
543
- strict=strict,
544
- n_shared=n_shared
545
- )
804
+ common_features = ips.shared(
805
+ lasso_selected_features,
806
+ selected_ridge_features,
807
+ selected_enet_features,
808
+ top_rf_features,
809
+ top_gb_features,
810
+ top_xgb_features,
811
+ selected_svm_features,
812
+ selected_naive_bayes_features,
813
+ selected_lda_features,
814
+ top_adaboost_features,
815
+ top_dt_features,
816
+ top_bagging_features,
817
+ top_knn_features,
818
+ strict=strict,
819
+ n_shared=n_shared,
820
+ verbose=False
821
+ )
546
822
 
547
823
  # Use selected features or all features for model validation
548
- X_train_selected = X_train[list(common_features)] if use_selected_features else X_train
549
- X_test_selected = X_test[list(common_features)] if use_selected_features else X_test
824
+ x_train_selected = (
825
+ x_train[list(common_features)] if use_selected_features else x_train
826
+ )
827
+ x_test_selected = x_test[list(common_features)] if use_selected_features else x_test
550
828
 
551
829
  if metrics is None:
552
- metrics = ["accuracy", "precision", "recall", "f1", "roc_auc"]
830
+ metrics = ["accuracy", "precision", "recall", "f1", "roc_auc"]
553
831
 
554
832
  # Prepare results DataFrame for selected features
555
- features_df = pd.DataFrame({
556
- 'type':
557
- ['Lasso'] * len(lasso_selected_features)+
558
- ['Ridge'] * len(selected_ridge_features)+
559
- ['Random Forest'] * len(top_rf_features) +
560
- ['Gradient Boosting'] * len(top_gb_features)+
561
- ["Enet"]*len(selected_enet_features)+
562
- ['xgb'] * len(top_xgb_features)+
563
- ['SVM'] * len(selected_svm_features) +
564
- ['Naive Bayes'] * len(selected_naive_bayes_features)+
565
- ['Linear Discriminant Analysis'] * len(selected_lda_features)+
566
- ['AdaBoost'] * len(top_adaboost_features)+
567
- ['Decision Tree'] * len(top_dt_features) +
568
- ['Bagging'] * len(top_bagging_features) +
569
- ['KNN'] * len(top_knn_features),
570
- 'feature': np.concatenate([lasso_selected_features,selected_ridge_features,
571
- top_rf_features,top_gb_features,selected_enet_features,top_xgb_features,
572
- selected_svm_features,selected_naive_bayes_features,
573
- selected_lda_features,top_adaboost_features,top_dt_features,
574
- top_bagging_features, top_knn_features
575
- ])
576
- })
833
+ features_df = pd.DataFrame(
834
+ {
835
+ "type": ["Lasso"] * len(lasso_selected_features)
836
+ + ["Ridge"] * len(selected_ridge_features)
837
+ + ["Random Forest"] * len(top_rf_features)
838
+ + ["Gradient Boosting"] * len(top_gb_features)
839
+ + ["Enet"] * len(selected_enet_features)
840
+ + ["xgb"] * len(top_xgb_features)
841
+ + ["SVM"] * len(selected_svm_features)
842
+ + ["Naive Bayes"] * len(selected_naive_bayes_features)
843
+ + ["Linear Discriminant Analysis"] * len(selected_lda_features)
844
+ + ["AdaBoost"] * len(top_adaboost_features)
845
+ + ["Decision Tree"] * len(top_dt_features)
846
+ + ["Bagging"] * len(top_bagging_features)
847
+ + ["KNN"] * len(top_knn_features),
848
+ "feature": np.concatenate(
849
+ [
850
+ lasso_selected_features,
851
+ selected_ridge_features,
852
+ top_rf_features,
853
+ top_gb_features,
854
+ selected_enet_features,
855
+ top_xgb_features,
856
+ selected_svm_features,
857
+ selected_naive_bayes_features,
858
+ selected_lda_features,
859
+ top_adaboost_features,
860
+ top_dt_features,
861
+ top_bagging_features,
862
+ top_knn_features,
863
+ ]
864
+ ),
865
+ }
866
+ )
577
867
 
578
868
  #! Validate trained each classifier
579
- classifiers=get_classifiers(random_state=random_state,cls=cls)
580
- cv_train_results,cv_test_results = [],[]
581
- for name, clf in classifiers.items():
582
- if not X_train_selected.empty:
583
- cv_scores=validate_classifier(clf,
584
- X_train_selected,
585
- y_train,
586
- X_test_selected,
587
- y_test,
588
- metrics=metrics,
589
- cv_folds=cv_folds)
869
+ models = get_models(random_state=random_state, cls=cls)
870
+ cv_train_results, cv_test_results = [], []
871
+ for name, clf in models.items():
872
+ if not x_train_selected.empty:
873
+ cv_scores = validate_classifier(
874
+ clf,
875
+ x_train_selected,
876
+ y_train,
877
+ x_test_selected,
878
+ y_test,
879
+ metrics=metrics,
880
+ cv_folds=cv_folds,
881
+ )
590
882
 
591
883
  cv_train_score_df = pd.DataFrame(cv_scores["cv_train_scores"], index=[name])
592
884
  cv_test_score_df = pd.DataFrame(cv_scores["cv_test_scores"], index=[name])
593
885
  cv_train_results.append(cv_train_score_df)
594
886
  cv_test_results.append(cv_test_score_df)
595
- if all([cv_train_results,cv_train_results]):
596
- cv_train_results_df = pd.concat(cv_train_results).reset_index().rename(columns={'index': 'Classifier'})
597
- cv_test_results_df = pd.concat(cv_test_results).reset_index().rename(columns={'index': 'Classifier'})
887
+ if all([cv_train_results, cv_test_results]):
888
+ cv_train_results_df = (
889
+ pd.concat(cv_train_results)
890
+ .reset_index()
891
+ .rename(columns={"index": "Classifier"})
892
+ )
893
+ cv_test_results_df = (
894
+ pd.concat(cv_test_results)
895
+ .reset_index()
896
+ .rename(columns={"index": "Classifier"})
897
+ )
598
898
  #! Store results in the main results dictionary
599
899
  results = {
600
900
  "selected_features": features_df,
601
901
  "cv_train_scores": cv_train_results_df,
602
- "cv_test_scores": cv_test_results_df,
902
+ "cv_test_scores": rank_models(cv_test_results_df,plot_=plot_),
603
903
  "common_features": list(common_features),
604
904
  }
905
+ if all([plot_,dir_save]):
906
+ from datetime import datetime
907
+ now_ = datetime.now().strftime("%y%m%d_%H%M%S")
908
+ ips.figsave(dir_save+f"features{now_}.pdf")
605
909
  else:
606
910
  results = {
607
911
  "selected_features": pd.DataFrame(),
@@ -611,71 +915,75 @@ def get_features(
611
915
  }
612
916
  print(f"Warning: 没有找到共同的genes, when n_shared={n_shared}")
613
917
  return results
918
+
919
+
614
920
  #! # usage:
615
921
  # # Get features and common features
616
922
  # results = get_features(X, y)
617
923
  # common_features = results["common_features"]
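
A slightly fuller, hedged sketch of a call (synthetic data and made-up gene names; it assumes the py2ls.ips helpers used internally, such as df_fillna, df_encoder and shared, are importable):

    import numpy as np
    import pandas as pd

    X = pd.DataFrame(np.random.rand(100, 30), columns=[f"gene{i}" for i in range(30)])
    y = pd.Series(np.random.randint(0, 2, 100))

    results = get_features(X, y, n_features=10, n_shared=2, plot_=False,
                           cls=["lasso", "ridge", "Random forest (rf)"])
    print(results["common_features"])   # features shared by at least n_shared methods
    print(results["cv_test_scores"])    # held-out performance of each fitted model
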
618
924
  def validate_features(
619
- X_train: pd.DataFrame,
925
+ x_train: pd.DataFrame,
620
926
  y_train: pd.Series,
621
- X_true: pd.DataFrame,
927
+ x_true: pd.DataFrame,
622
928
  y_true: pd.Series,
623
- common_features:set=None,
624
- classifiers: Optional[Dict[str, Any]] = None,
929
+ common_features: set = None,
930
+ models: Optional[Dict[str, Any]] = None,
625
931
  metrics: Optional[list] = None,
626
932
  random_state: int = 1,
627
933
  smote: bool = False,
934
+ n_jobs:int = -1,
628
935
  plot_: bool = True,
629
936
  class_weight: str = "balanced",
630
937
  ) -> dict:
631
938
  """
632
- Validate classifiers using selected features on the validation dataset.
939
+ Validate models using selected features on the validation dataset.
633
940
 
634
941
  Parameters:
635
- - X_train (pd.DataFrame): Training feature dataset.
942
+ - x_train (pd.DataFrame): Training feature dataset.
636
943
  - y_train (pd.Series): Training target variable.
637
- - X_true (pd.DataFrame): Validation feature dataset.
944
+ - x_true (pd.DataFrame): Validation feature dataset.
638
945
  - y_true (pd.Series): Validation target variable.
639
946
  - common_features (set): Set of common features to use for validation.
640
- - classifiers (dict, optional): Dictionary of classifiers to validate.
947
+ - models (dict, optional): Dictionary of models to validate.
641
948
  - metrics (list, optional): List of metrics to compute.
642
949
  - random_state (int): Random state for reproducibility.
643
950
  - plot_ (bool): Option to plot metrics (to be implemented if needed).
644
951
  - class_weight (str or dict): Class weights to handle imbalance.
645
952
 
646
953
  """
647
-
954
+ from tqdm import tqdm
648
955
  # Ensure common features are selected
649
- common_features = ips.shared(common_features,
650
- X_train.columns,
651
- X_true.columns,
652
- strict=True)
956
+ common_features = ips.shared(common_features, x_train.columns, x_true.columns, strict=True,verbose=False)
653
957
 
654
958
  # Filter the training and validation datasets for the common features
655
- X_train_selected = X_train[common_features]
656
- X_true_selected = X_true[common_features]
959
+ x_train_selected = x_train[common_features]
960
+ x_true_selected = x_true[common_features]
657
961
 
658
- if not X_true_selected.index.equals(y_true.index):
659
- raise ValueError("Index mismatch between validation features and target. Ensure data alignment.")
660
-
661
- y_true= y_true.loc[X_true_selected.index]
962
+ if not x_true_selected.index.equals(y_true.index):
963
+ raise ValueError(
964
+ "Index mismatch between validation features and target. Ensure data alignment."
965
+ )
966
+
967
+ y_true = y_true.loc[x_true_selected.index]
662
968
 
663
969
  # Handle class imbalance using SMOTE
664
970
  if smote:
665
- if y_train.value_counts(normalize=True).max() < 0.8: # Threshold to decide if data is imbalanced
971
+ if (
972
+ y_train.value_counts(normalize=True).max() < 0.8
973
+ ): # Threshold to decide if data is imbalanced
666
974
  smote = SMOTE(random_state=random_state)
667
- X_train_resampled, y_train_resampled = smote.fit_resample(
668
- X_train_selected, y_train
975
+ x_train_resampled, y_train_resampled = smote.fit_resample(
976
+ x_train_selected, y_train
669
977
  )
670
978
  else:
671
979
  # skip SMOTE
672
- X_train_resampled, y_train_resampled = X_train_selected, y_train
980
+ x_train_resampled, y_train_resampled = x_train_selected, y_train
673
981
  else:
674
- X_train_resampled, y_train_resampled = X_train_selected, y_train
982
+ x_train_resampled, y_train_resampled = x_train_selected, y_train
675
983
 
676
- # Default classifiers if not provided
677
- if classifiers is None:
678
- classifiers = {
984
+ # Default models if not provided
985
+ if models is None:
986
+ models = {
679
987
  "Random Forest": RandomForestClassifier(
680
988
  class_weight=class_weight, random_state=random_state
681
989
  ),
@@ -684,86 +992,107 @@ def validate_features(
684
992
  class_weight=class_weight, random_state=random_state
685
993
  ),
686
994
  "Gradient Boosting": GradientBoostingClassifier(random_state=random_state),
687
- "AdaBoost": AdaBoostClassifier(random_state=random_state, algorithm="SAMME"),
688
- "Lasso": LogisticRegression(penalty='l1', solver='saga', random_state=random_state),
689
- "Ridge": LogisticRegression(penalty='l2', solver='saga', random_state=random_state),
690
- "Elastic Net": LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5, random_state=random_state),
691
- "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
995
+ "AdaBoost": AdaBoostClassifier(
996
+ random_state=random_state, algorithm="SAMME"
997
+ ),
998
+ "Lasso": LogisticRegression(
999
+ penalty="l1", solver="saga", random_state=random_state
1000
+ ),
1001
+ "Ridge": LogisticRegression(
1002
+ penalty="l2", solver="saga", random_state=random_state
1003
+ ),
1004
+ "Elastic Net": LogisticRegression(
1005
+ penalty="elasticnet",
1006
+ solver="saga",
1007
+ l1_ratio=0.5,
1008
+ random_state=random_state,
1009
+ ),
1010
+ "XGBoost": xgb.XGBClassifier(eval_metric="logloss"
1011
+ ),
692
1012
  "Naive Bayes": GaussianNB(),
693
- "LDA": LinearDiscriminantAnalysis()
1013
+ "LDA": LinearDiscriminantAnalysis(),
694
1014
  }
695
1015
 
696
- # Hyperparameter grids for tuning
1016
+ # Hyperparameter grids for tuning
697
1017
  param_grids = {
698
1018
  "Random Forest": {
699
- 'n_estimators': [100, 200, 300, 400, 500],
700
- 'max_depth': [None, 3, 5, 10, 20],
701
- 'min_samples_split': [2, 5, 10],
702
- 'min_samples_leaf': [1, 2, 4],
703
- 'class_weight': [None, 'balanced']
1019
+ "n_estimators": [100, 200, 300, 400, 500],
1020
+ "max_depth": [None, 3, 5, 10, 20],
1021
+ "min_samples_split": [2, 5, 10],
1022
+ "min_samples_leaf": [1, 2, 4],
1023
+ "class_weight": [None, "balanced"],
704
1024
  },
705
1025
  "SVM": {
706
- 'C': [0.01, 0.1, 1, 10, 100, 1000],
707
- 'gamma': [0.001, 0.01, 0.1, 'scale', 'auto'],
708
- 'kernel': ['linear', 'rbf', 'poly']
1026
+ "C": [0.01, 0.1, 1, 10, 100, 1000],
1027
+ "gamma": [0.001, 0.01, 0.1, "scale", "auto"],
1028
+ "kernel": ["linear", "rbf", "poly"],
709
1029
  },
710
1030
  "Logistic Regression": {
711
- 'C': [0.01, 0.1, 1, 10, 100],
712
- 'solver': ['liblinear', 'saga', 'newton-cg', 'lbfgs'],
713
- 'penalty': ['l1', 'l2'],
714
- 'max_iter': [100, 200, 300]
1031
+ "C": [0.01, 0.1, 1, 10, 100],
1032
+ "solver": ["liblinear", "saga", "newton-cg", "lbfgs"],
1033
+ "penalty": ["l1", "l2"],
1034
+ "max_iter": [100, 200, 300],
715
1035
  },
716
1036
  "Gradient Boosting": {
717
- 'n_estimators': [100, 200, 300, 400, 500],
718
- 'learning_rate': np.logspace(-3, 0, 4),
719
- 'max_depth': [3, 5, 7, 9],
720
- 'min_samples_split': [2, 5, 10]
1037
+ "n_estimators": [100, 200, 300, 400, 500],
1038
+ "learning_rate": np.logspace(-3, 0, 4),
1039
+ "max_depth": [3, 5, 7, 9],
1040
+ "min_samples_split": [2, 5, 10],
721
1041
  },
722
1042
  "AdaBoost": {
723
- 'n_estimators': [50, 100, 200, 300, 500],
724
- 'learning_rate': np.logspace(-3, 0, 4)
725
- },
726
- "Lasso": {
727
- 'C': np.logspace(-3, 1, 10),
728
- 'max_iter': [100, 200, 300]
729
- },
730
- "Ridge": {
731
- 'C': np.logspace(-3, 1, 10),
732
- 'max_iter': [100, 200, 300]
1043
+ "n_estimators": [50, 100, 200, 300, 500],
1044
+ "learning_rate": np.logspace(-3, 0, 4),
733
1045
  },
1046
+ "Lasso": {"C": np.logspace(-3, 1, 10), "max_iter": [100, 200, 300]},
1047
+ "Ridge": {"C": np.logspace(-3, 1, 10), "max_iter": [100, 200, 300]},
734
1048
  "Elastic Net": {
735
- 'C': np.logspace(-3, 1, 10),
736
- 'l1_ratio': [0.1, 0.5, 0.9],
737
- 'max_iter': [100, 200, 300]
1049
+ "C": np.logspace(-3, 1, 10),
1050
+ "l1_ratio": [0.1, 0.5, 0.9],
1051
+ "max_iter": [100, 200, 300],
738
1052
  },
739
1053
  "XGBoost": {
740
- 'n_estimators': [100, 200],
741
- 'max_depth': [3, 5, 7],
742
- 'learning_rate': [0.01, 0.1, 0.2],
743
- 'subsample': [0.8, 1.0],
744
- 'colsample_bytree': [0.8, 1.0]
1054
+ "n_estimators": [100, 200],
1055
+ "max_depth": [3, 5, 7],
1056
+ "learning_rate": [0.01, 0.1, 0.2],
1057
+ "subsample": [0.8, 1.0],
1058
+ "colsample_bytree": [0.8, 1.0],
745
1059
  },
746
1060
  "Naive Bayes": {},
747
- "LDA": {
748
- 'solver': ['svd', 'lsqr', 'eigen']
749
- }
1061
+ "LDA": {"solver": ["svd", "lsqr", "eigen"]},
750
1062
  }
751
1063
  # Default metrics if not provided
752
1064
  if metrics is None:
753
- metrics = ["accuracy", "precision", "recall", "f1", "roc_auc", "mcc", "specificity", "balanced_accuracy", "pr_auc"]
1065
+ metrics = [
1066
+ "accuracy",
1067
+ "precision",
1068
+ "recall",
1069
+ "f1",
1070
+ "roc_auc",
1071
+ "mcc",
1072
+ "specificity",
1073
+ "balanced_accuracy",
1074
+ "pr_auc",
1075
+ ]
754
1076
 
755
1077
  results = {}
756
1078
 
757
1079
  # Validate each classifier with GridSearchCV
758
- for name, clf in classifiers.items():
1080
+ for name, clf in tqdm(
1081
+ models.items(),
1082
+ desc="for metric in metrics",
1083
+ colour="green",
1084
+ bar_format="{l_bar}{bar} {n_fmt}/{total_fmt}",
1085
+ ):
759
1086
  print(f"\nValidating {name} on the validation dataset:")
760
1087
 
761
1088
  # Check if `predict_proba` method exists; if not, use CalibratedClassifierCV
762
1089
  # Classifiers without predict_proba can still yield calibrated probability estimates via CalibratedClassifierCV. To keep the code flexible, when a classifier is created we
763
1090
  # check whether predict_proba exists; if it does not and roc_auc or pr_auc is requested, CalibratedClassifierCV is enabled.
764
1091
  if not hasattr(clf, "predict_proba"):
765
- print(f"Using CalibratedClassifierCV for {name} due to lack of probability estimates.")
766
- calibrated_clf = CalibratedClassifierCV(clf, method='sigmoid', cv='prefit')
1092
+ print(
1093
+ f"Using CalibratedClassifierCV for {name} due to lack of probability estimates."
1094
+ )
1095
+ calibrated_clf = CalibratedClassifierCV(clf, method="sigmoid", cv="prefit")
767
1096
  else:
768
1097
  calibrated_clf = clf
769
1098
  # Stratified K-Fold for cross-validation
@@ -771,28 +1100,30 @@ def validate_features(
771
1100
 
772
1101
  # Create GridSearchCV object
773
1102
  gs = GridSearchCV(
774
- estimator= calibrated_clf,
1103
+ estimator=calibrated_clf,
775
1104
  param_grid=param_grids[name],
776
1105
  scoring="roc_auc", # Optimize for ROC AUC
777
1106
  cv=skf, # Stratified K-Folds cross-validation
778
- n_jobs=-1,
1107
+ n_jobs=n_jobs,
779
1108
  verbose=1,
780
1109
  )
781
1110
 
782
1111
  # Fit the model using GridSearchCV
783
- gs.fit(X_train_resampled, y_train_resampled)
1112
+ gs.fit(x_train_resampled, y_train_resampled)
784
1113
  # Best estimator from grid search
785
1114
  best_clf = gs.best_estimator_
786
1115
  # Make predictions on the validation set
787
- y_pred = best_clf.predict(X_true_selected)
1116
+ y_pred = best_clf.predict(x_true_selected)
788
1117
  # Calculate probabilities for ROC AUC if possible
789
1118
  if hasattr(best_clf, "predict_proba"):
790
- y_pred_proba = best_clf.predict_proba(X_true_selected)[:, 1]
1119
+ y_pred_proba = best_clf.predict_proba(x_true_selected)[:, 1]
791
1120
  elif hasattr(best_clf, "decision_function"):
792
1121
  # If predict_proba is not available, use decision_function (e.g., for SVM)
793
- y_pred_proba = best_clf.decision_function(X_true_selected)
1122
+ y_pred_proba = best_clf.decision_function(x_true_selected)
794
1123
  # Ensure y_pred_proba is within 0 and 1 bounds
795
- y_pred_proba = (y_pred_proba - y_pred_proba.min()) / (y_pred_proba.max() - y_pred_proba.min())
1124
+ y_pred_proba = (y_pred_proba - y_pred_proba.min()) / (
1125
+ y_pred_proba.max() - y_pred_proba.min()
1126
+ )
796
1127
  else:
797
1128
  y_pred_proba = None # No probability output for certain models
798
1129
 
@@ -802,11 +1133,15 @@ def validate_features(
802
1133
  if metric == "accuracy":
803
1134
  validation_scores[metric] = accuracy_score(y_true, y_pred)
804
1135
  elif metric == "precision":
805
- validation_scores[metric] = precision_score(y_true, y_pred, average='weighted')
1136
+ validation_scores[metric] = precision_score(
1137
+ y_true, y_pred, average="weighted"
1138
+ )
806
1139
  elif metric == "recall":
807
- validation_scores[metric] = recall_score(y_true, y_pred, average='weighted')
1140
+ validation_scores[metric] = recall_score(
1141
+ y_true, y_pred, average="weighted"
1142
+ )
808
1143
  elif metric == "f1":
809
- validation_scores[metric] = f1_score(y_true, y_pred, average='weighted')
1144
+ validation_scores[metric] = f1_score(y_true, y_pred, average="weighted")
810
1145
  elif metric == "roc_auc" and y_pred_proba is not None:
811
1146
  validation_scores[metric] = roc_auc_score(y_true, y_pred_proba)
812
1147
  elif metric == "mcc":
@@ -816,32 +1151,35 @@ def validate_features(
816
1151
  validation_scores[metric] = tn / (tn + fp) # Specificity calculation
817
1152
  elif metric == "balanced_accuracy":
818
1153
  validation_scores[metric] = balanced_accuracy_score(y_true, y_pred)
819
- elif metric == "pr_auc" and y_pred_proba is not None:
1154
+ elif metric == "pr_auc" and y_pred_proba is not None:
820
1155
  precision, recall, _ = precision_recall_curve(y_true, y_pred_proba)
821
- validation_scores[metric] = average_precision_score(y_true, y_pred_proba)
822
-
1156
+ validation_scores[metric] = average_precision_score(
1157
+ y_true, y_pred_proba
1158
+ )
1159
+
823
1160
  # Calculate ROC curve
824
- #https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
1161
+ # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
825
1162
  if y_pred_proba is not None:
826
1163
  # fpr, tpr, roc_auc = dict(), dict(), dict()
827
1164
  fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
828
- lower_ci, upper_ci = cal_auc_ci(y_true, y_pred_proba)
829
- roc_auc=auc(fpr, tpr)
830
- roc_info={
1165
+ lower_ci, upper_ci = cal_auc_ci(y_true, y_pred_proba,verbose=False)
1166
+ roc_auc = auc(fpr, tpr)
1167
+ roc_info = {
831
1168
  "fpr": fpr.tolist(),
832
1169
  "tpr": tpr.tolist(),
833
- "auc":roc_auc,
834
- "ci95":(lower_ci, upper_ci)
1170
+ "auc": roc_auc,
1171
+ "ci95": (lower_ci, upper_ci),
835
1172
  }
836
1173
  # precision-recall curve
837
- precision_, recall_, _ = precision_recall_curve(y_true, y_pred_proba)
1174
+ precision_, recall_, _ = precision_recall_curve(y_true, y_pred_proba)
838
1175
  avg_precision_ = average_precision_score(y_true, y_pred_proba)
839
- pr_info = {"precision": precision_,
840
- "recall":recall_,
841
- "avg_precision":avg_precision_
842
- }
1176
+ pr_info = {
1177
+ "precision": precision_,
1178
+ "recall": recall_,
1179
+ "avg_precision": avg_precision_,
1180
+ }
843
1181
  else:
844
- roc_info,pr_info=None,None
1182
+ roc_info, pr_info = None, None
845
1183
  results[name] = {
846
1184
  "best_params": gs.best_params_,
847
1185
  "scores": validation_scores,
@@ -849,24 +1187,93 @@ def validate_features(
849
1187
  "pr_curve": pr_info,
850
1188
  "confusion_matrix": confusion_matrix(y_true, y_pred),
851
1189
  }
852
-
1190
+
853
1191
  df_results = pd.DataFrame.from_dict(results, orient="index")
854
1192
 
855
1193
  return df_results
856
1194
 
857
- #! usage validate_features()
858
- # Validate classifiers using the validation dataset (X_val, y_val)
1195
+
1196
+ #! usage validate_features()
1197
+ # Validate models using the validation dataset (X_val, y_val)
859
1198
  # validation_results = validate_features(X, y, X_val, y_val, common_features)
860
1199
 
861
1200
  # # If you want to access validation scores
862
1201
  # print(validation_results)
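A fuller sketch of the same call, following the positional signature shown in the usage note above; the file name, the "label" column and the variable names are placeholders:

# minimal usage sketch for validate_features(); assumes a labelled feature table
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("features.csv")                      # hypothetical input table with a "label" column
X, X_val, y, y_val = train_test_split(
    df.drop(columns="label"), df["label"],
    test_size=0.3, random_state=1, stratify=df["label"],
)
common_features = set(X.columns) & set(X_val.columns)
validation_results = validate_features(X, y, X_val, y_val, common_features)
print(validation_results["scores"])                   # one metrics dict per model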
1202
+ def plot_validate_features(res_val):
1203
+ """
1204
+ plot the results of 'validate_features()'
1205
+ """
1206
+ colors = plot.get_color(len(ips.flatten(res_val["pr_curve"].index)))
1207
+ if res_val.shape[0]>5:
1208
+ alpha=0
1209
+ figsize=[8,10]
1210
+ subplot_layout=[1,2]
1211
+ ncols=2
1212
+ bbox_to_anchor=[1.5,0.6]
1213
+ else:
1214
+ alpha=0.03
1215
+ figsize=[10,6]
1216
+ subplot_layout=[1,1]
1217
+ ncols=1
1218
+ bbox_to_anchor=[1,1]
1219
+ nexttile = plot.subplot(figsize=figsize)
1220
+ ax = nexttile(subplot_layout[0],subplot_layout[1])
1221
+ for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
1222
+ fpr = res_val["roc_curve"][model_name]["fpr"]
1223
+ tpr = res_val["roc_curve"][model_name]["tpr"]
1224
+ (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
1225
+ mean_auc = res_val["roc_curve"][model_name]["auc"]
1226
+ plot_roc_curve(
1227
+ fpr,tpr,mean_auc,lower_ci,upper_ci,model_name=model_name,
1228
+ lw=1.5,color=colors[i],alpha=alpha,ax=ax)
1229
+ plot.figsets(sp=2,legend=dict(loc="upper right", ncols=ncols, fontsize=8, bbox_to_anchor=[1.5,0.6],markerscale=0.8))
1230
+ # plot.split_legend(ax,n=2, loc=["upper left", "lower left"],bbox=[[1,0.5],[1,0.5]],ncols=2,labelcolor="k",fontsize=8)
1231
+
1232
+ ax = nexttile(subplot_layout[0],subplot_layout[1])
1233
+ for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
1234
+ plot_pr_curve(
1235
+ recall=res_val["pr_curve"][model_name]["recall"],
1236
+ precision=res_val["pr_curve"][model_name]["precision"],
1237
+ avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
1238
+ model_name=model_name,
1239
+ color=colors[i],lw=1.5,alpha=alpha,ax=ax)
1240
+ plot.figsets(sp=2,legend=dict(loc="upper right", ncols=1, fontsize=8, bbox_to_anchor=[1.5,0.5]))
1241
+ # plot.split_legend(ax,n=2, loc=["upper left", "lower left"],bbox=[[1,0.5],[1,0.5]],ncols=2,labelcolor="k",fontsize=8)
863
1242
 
864
-
865
- def cal_auc_ci(y_true, y_pred, n_bootstraps=1000, ci=0.95,random_state=1):
1243
+ def plot_validate_features_single(res_val,figsize=None):
1244
+ if figsize is None:
1245
+ nexttile = plot.subplot(len(ips.flatten(res_val["pr_curve"].index)), 3)
1246
+ else:
1247
+ nexttile = plot.subplot(len(ips.flatten(res_val["pr_curve"].index)), 3,figsize=figsize)
1248
+ for model_name in ips.flatten(res_val["pr_curve"].index):
1249
+ fpr = res_val["roc_curve"][model_name]["fpr"]
1250
+ tpr = res_val["roc_curve"][model_name]["tpr"]
1251
+ (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
1252
+ mean_auc = res_val["roc_curve"][model_name]["auc"]
1253
+
1254
+ # Plotting
1255
+ plot_roc_curve(fpr, tpr, mean_auc, lower_ci, upper_ci, ax=nexttile())
1256
+ plot.figsets(title=model_name, sp=2)
1257
+
1258
+ plot_pr_curve(
1259
+ recall=res_val["pr_curve"][model_name]["recall"],
1260
+ precision=res_val["pr_curve"][model_name]["precision"],
1261
+ avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
1262
+ model_name=model_name,
1263
+ ax=nexttile(),
1264
+ )
1265
+ plot.figsets(title=model_name, sp=2)
1266
+
1267
+ # plot cm
1268
+ plot_cm(res_val["confusion_matrix"][model_name], ax=nexttile(), normalize=False)
1269
+ plot.figsets(title=model_name, sp=2)
1270
+
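A short usage sketch for the two plotting helpers above; res_val is assumed to be the DataFrame returned by validate_features() (model names as index, with "roc_curve", "pr_curve" and "confusion_matrix" columns):

# assumes res_val = validate_features(X, y, X_val, y_val, common_features)
plot_validate_features(res_val)            # overlaid ROC and PR curves, one legend entry per model
plot_validate_features_single(res_val)     # per-model row: ROC curve, PR curve, confusion matrix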
1271
+ def cal_auc_ci(y_true, y_pred, n_bootstraps=1000, ci=0.95, random_state=1,verbose=True):
866
1272
  y_true = np.asarray(y_true)
867
1273
  y_pred = np.asarray(y_pred)
868
1274
  bootstrapped_scores = []
869
- print("auroc score:", roc_auc_score(y_true, y_pred))
1275
+ if verbose:
1276
+ print("auroc score:", roc_auc_score(y_true, y_pred))
870
1277
  rng = np.random.RandomState(random_state)
871
1278
  for i in range(n_bootstraps):
872
1279
  # bootstrap by sampling with replacement on the prediction indices
@@ -887,21 +1294,24 @@ def cal_auc_ci(y_true, y_pred, n_bootstraps=1000, ci=0.95,random_state=1):
887
1294
  # Computing the lower and upper bound of the 90% confidence interval
888
1295
  # You can change the bounds percentiles to 0.025 and 0.975 to get
889
1296
  # a 95% confidence interval instead.
890
- confidence_lower = sorted_scores[int((1-ci) * len(sorted_scores))]
1297
+ confidence_lower = sorted_scores[int((1 - ci) * len(sorted_scores))]
891
1298
  confidence_upper = sorted_scores[int(ci * len(sorted_scores))]
892
- print(
1299
+ if verbose:
1300
+ print(
893
1301
  "Confidence interval for the score: [{:0.3f} - {:0.3}]".format(
894
1302
  confidence_lower, confidence_upper
895
1303
  )
896
1304
  )
897
1305
  return confidence_lower, confidence_upper
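To make the bootstrap explicit, a self-contained sketch of the same percentile interval with synthetic scores (all numbers are illustrative only):

# standalone illustration of the percentile-bootstrap AUC interval computed by cal_auc_ci()
import numpy as np
from sklearn.metrics import roc_auc_score

rng = np.random.RandomState(1)
y_true = rng.binomial(1, 0.4, size=200)               # synthetic binary labels
y_score = 0.3 * y_true + 0.7 * rng.rand(200)          # noisy synthetic scores
boot = []
for _ in range(1000):
    idx = rng.randint(0, len(y_score), len(y_score))  # resample predictions with replacement
    if len(np.unique(y_true[idx])) < 2:
        continue                                      # AUC is undefined with a single class
    boot.append(roc_auc_score(y_true[idx], y_score[idx]))
boot = np.sort(boot)
lower, upper = boot[int(0.05 * len(boot))], boot[int(0.95 * len(boot))]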
898
1306
 
1307
+
899
1308
  def plot_roc_curve(
900
1309
  fpr=None,
901
1310
  tpr=None,
902
1311
  mean_auc=None,
903
1312
  lower_ci=None,
904
1313
  upper_ci=None,
1314
+ model_name=None,
905
1315
  color="#FF8F00",
906
1316
  lw=2,
907
1317
  alpha=0.1,
@@ -913,24 +1323,23 @@ def plot_roc_curve(
913
1323
  diagonal_color="0.5",
914
1324
  figsize=(5, 5),
915
1325
  ax=None,
916
- **kwargs
1326
+ **kwargs,
917
1327
  ):
918
1328
  if ax is None:
919
1329
  fig, ax = plt.subplots(figsize=figsize)
920
1330
  if mean_auc is not None:
1331
+ model_name = "ROC curve" if model_name is None else model_name
921
1332
  if ci_display:
922
- label = (
923
- f"ROC curve (AUC = {mean_auc:.3f})\n95% CI: {lower_ci:.3f} - {upper_ci:.3f}"
924
- )
1333
+ label = f"{model_name} (AUC = {mean_auc:.3f})\n95% CI: {lower_ci:.3f} - {upper_ci:.3f}"
925
1334
  else:
926
- label = f"ROC curve (AUC = {mean_auc:.3f})"
1335
+ label = f"{model_name} (AUC = {mean_auc:.3f})"
927
1336
  else:
928
1337
  label = None
929
1338
 
930
1339
  # Plot ROC curve and the diagonal reference line
931
1340
  ax.fill_between(fpr, tpr, alpha=alpha, color=color)
932
- ax.plot([0, 1], [0, 1], color=diagonal_color, linestyle="--")
933
- ax.plot(fpr, tpr, color=color, lw=lw, label=label,**kwargs)
1341
+ ax.plot([0, 1], [0, 1], color=diagonal_color, clip_on=False, linestyle="--")
1342
+ ax.plot(fpr, tpr, color=color, lw=lw, label=label,clip_on=False, **kwargs)
934
1343
  # Setting plot limits, labels, and title
935
1344
  ax.set_xlim([-0.01, 1.0])
936
1345
  ax.set_ylim([0.0, 1.0])
@@ -939,7 +1348,9 @@ def plot_roc_curve(
939
1348
  ax.set_title(title)
940
1349
  ax.legend(loc=legend_loc)
941
1350
  return ax
942
- #* usage: ml2ls.plot_roc_curve(fpr, tpr, mean_auc, lower_ci, upper_ci)
1351
+
1352
+
1353
+ # * usage: ml2ls.plot_roc_curve(fpr, tpr, mean_auc, lower_ci, upper_ci)
943
1354
  # for model_name in flatten(validation_results["roc_curve"].keys())[2:]:
944
1355
  # fpr = validation_results["roc_curve"][model_name]["fpr"]
945
1356
  # tpr = validation_results["roc_curve"][model_name]["tpr"]
@@ -950,6 +1361,7 @@ def plot_roc_curve(
950
1361
  # ml2ls.plot_roc_curve(fpr, tpr, mean_auc, lower_ci, upper_ci)
951
1362
  # figsets(title=model_name)
952
1363
 
1364
+
953
1365
  def plot_pr_curve(
954
1366
  recall=None,
955
1367
  precision=None,
@@ -961,21 +1373,24 @@ def plot_pr_curve(
961
1373
  xlabel="Recall",
962
1374
  ylabel="Precision",
963
1375
  alpha=0.1,
964
- color="#FF8F00",
1376
+ color="#FF8F00",
965
1377
  legend_loc="lower left",
966
1378
  ax=None,
967
- **kwargs
1379
+ **kwargs,
968
1380
  ):
969
1381
  if ax is None:
970
1382
  fig, ax = plt.subplots(figsize=figsize)
971
-
1383
+ model_name = "PR curve" if model_name is None else model_name
972
1384
  # Plot Precision-Recall curve
973
- ax.plot(recall,
974
- precision,
975
- lw=lw,
976
- color=color,
977
- label=( f"PR curve (AUC={avg_precision:.2f})"),
978
- **kwargs)
1385
+ ax.plot(
1386
+ recall,
1387
+ precision,
1388
+ lw=lw,
1389
+ color=color,
1390
+ label=(f"{model_name} (AUC={avg_precision:.2f})"),
1391
+ clip_on=False,
1392
+ **kwargs,
1393
+ )
979
1394
  # Fill area under the curve
980
1395
  ax.fill_between(recall, precision, alpha=alpha, color=color)
981
1396
 
@@ -985,10 +1400,12 @@ def plot_pr_curve(
985
1400
  ax.set_ylabel(ylabel)
986
1401
  ax.set_xlim([-0.01, 1.0])
987
1402
  ax.set_ylim([0.0, 1.0])
988
- ax.grid(False)
1403
+ ax.grid(False)
989
1404
  ax.legend(loc=legend_loc)
990
1405
  return ax
991
- #* usage: ml2ls.plot_pr_curve()
1406
+
1407
+
1408
+ # * usage: ml2ls.plot_pr_curve()
992
1409
  # for md_name in flatten(validation_results["pr_curve"].keys()):
993
1410
  # ml2ls.plot_pr_curve(
994
1411
  # recall=validation_results["pr_curve"][md_name]["recall"],
@@ -1000,6 +1417,7 @@ def plot_pr_curve(
1000
1417
  # color="r",
1001
1418
  # )
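The precision-recall analogue of the ROC sketch above, assuming the same y_true / y_pred_proba:

from sklearn.metrics import precision_recall_curve, average_precision_score

precision_, recall_, _ = precision_recall_curve(y_true, y_pred_proba)
plot_pr_curve(
    recall=recall_,
    precision=precision_,
    avg_precision=average_precision_score(y_true, y_pred_proba),
    model_name="Random Forest",
)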
1002
1419
 
1420
+
1003
1421
  def plot_cm(
1004
1422
  cm,
1005
1423
  labels_name=None,
@@ -1016,7 +1434,9 @@ def plot_cm(
1016
1434
  if ax is None:
1017
1435
  fig, ax = plt.subplots(figsize=figsize)
1018
1436
 
1019
- cm_normalized = np.round(cm.astype("float") / cm.sum(axis=1)[:, np.newaxis] * 100, 2)
1437
+ cm_normalized = np.round(
1438
+ cm.astype("float") / cm.sum(axis=1)[:, np.newaxis] * 100, 2
1439
+ )
1020
1440
  cm_value = cm_normalized if normalize else cm.astype("int")
1021
1441
  # Plot the heatmap
1022
1442
  cax = ax.imshow(cm_normalized, interpolation="nearest", cmap=cmap)
@@ -1026,14 +1446,13 @@ def plot_cm(
1026
1446
  # Define tick labels based on provided labels
1027
1447
  num_local = np.arange(len(labels_name)) if labels_name is not None else range(2)
1028
1448
  if axis_labels is None:
1029
- axis_labels = labels_name if labels_name is not None else ["No","Yes"]
1449
+ axis_labels = labels_name if labels_name is not None else ["No", "Yes"]
1030
1450
  ax.set_xticks(num_local)
1031
1451
  ax.set_xticklabels(axis_labels)
1032
1452
  ax.set_yticks(num_local)
1033
1453
  ax.set_yticklabels(axis_labels)
1034
1454
  ax.set_ylabel(ylabel)
1035
1455
  ax.set_xlabel(xlabel)
1036
- plot.figsets(ax=ax, xtickloc="tl", boxloc="none")
1037
1456
 
1038
1457
  # Add TN, FP, FN, TP annotations specifically for binary classification (2x2 matrix)
1039
1458
  if labels_name is None or len(labels_name) == 2:
@@ -1050,29 +1469,53 @@ def plot_cm(
1050
1469
  tp_label = "TP"
1051
1470
 
1052
1471
  # Adjust positions slightly for TN, FP, FN, TP labels
1053
- ax.text(0,0,
1054
- f"{tn_label}:{cm_normalized[0, 0]:.2f}%" if normalize else f"{tn_label}:{cm_value[0, 0]}",
1472
+ ax.text(
1473
+ 0,
1474
+ 0,
1475
+ (
1476
+ f"{tn_label}:{cm_normalized[0, 0]:.2f}%"
1477
+ if normalize
1478
+ else f"{tn_label}:{cm_value[0, 0]}"
1479
+ ),
1055
1480
  ha="center",
1056
1481
  va="center",
1057
1482
  color="white" if cm_normalized[0, 0] > thresh * 100 else "black",
1058
1483
  fontsize=fontsize,
1059
1484
  )
1060
- ax.text(1,0,
1061
- f"{fp_label}:{cm_normalized[0, 1]:.2f}%" if normalize else f"{tn_label}:{cm_value[0, 1]}",
1485
+ ax.text(
1486
+ 1,
1487
+ 0,
1488
+ (
1489
+ f"{fp_label}:{cm_normalized[0, 1]:.2f}%"
1490
+ if normalize
1491
+ else f"{fp_label}:{cm_value[0, 1]}"
1492
+ ),
1062
1493
  ha="center",
1063
1494
  va="center",
1064
1495
  color="white" if cm_normalized[0, 1] > thresh * 100 else "black",
1065
1496
  fontsize=fontsize,
1066
1497
  )
1067
- ax.text(0,1,
1068
- f"{fn_label}:{cm_normalized[1, 0]:.2f}%" if normalize else f"{tn_label}:{cm_value[1, 0]}",
1498
+ ax.text(
1499
+ 0,
1500
+ 1,
1501
+ (
1502
+ f"{fn_label}:{cm_normalized[1, 0]:.2f}%"
1503
+ if normalize
1504
+ else f"{fn_label}:{cm_value[1, 0]}"
1505
+ ),
1069
1506
  ha="center",
1070
1507
  va="center",
1071
1508
  color="white" if cm_normalized[1, 0] > thresh * 100 else "black",
1072
1509
  fontsize=fontsize,
1073
1510
  )
1074
- ax.text(1,1,
1075
- f"{tp_label}:{cm_normalized[1, 1]:.2f}%" if normalize else f"{tn_label}:{cm_value[1, 1]}",
1511
+ ax.text(
1512
+ 1,
1513
+ 1,
1514
+ (
1515
+ f"{tp_label}:{cm_normalized[1, 1]:.2f}%"
1516
+ if normalize
1517
+ else f"{tp_label}:{cm_value[1, 1]}"
1518
+ ),
1076
1519
  ha="center",
1077
1520
  va="center",
1078
1521
  color="white" if cm_normalized[1, 1] > thresh * 100 else "black",
@@ -1084,11 +1527,1054 @@ def plot_cm(
1084
1527
  for j in range(len(labels_name)):
1085
1528
  val = cm_normalized[i, j]
1086
1529
  color = "white" if val > thresh * 100 else "black"
1087
- ax.text(j,i,
1530
+ ax.text(
1531
+ j,
1532
+ i,
1088
1533
  f"{val:.2f}%",
1089
1534
  ha="center",
1090
1535
  va="center",
1091
1536
  color=color,
1092
1537
  fontsize=fontsize,
1093
1538
  )
1539
+
1540
+ plot.figsets(ax=ax,
1541
+ boxloc="none"
1542
+ )
1094
1543
  return ax
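plot_cm() expects a raw confusion matrix; a minimal sketch for the binary case, assuming y_true / y_pred arrays (the class labels are placeholders):

from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true, y_pred)
plot_cm(cm, labels_name=["control", "patient"], normalize=True)   # 2x2 case gets TN/FP/FN/TP annotations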
1544
+
1545
+ def rank_models(
1546
+ cv_test_scores,
1547
+ rm_outlier=False,
1548
+ metric_weights=None,
1549
+ plot_=True,
1550
+ ):
1551
+ """
1552
+ Selects the best model based on a multi-metric scoring approach, with outlier handling, optional visualization,
1553
+ and additional performance metrics.
1554
+
1555
+ Parameters:
1556
+ - cv_test_scores (pd.DataFrame): DataFrame with cross-validation results across multiple metrics.
1557
+ Assumes columns are 'Classifier', 'accuracy', 'precision', 'recall', 'f1', 'roc_auc'.
1558
+ - metric_weights (dict): Dictionary specifying weights for each metric (e.g., {'accuracy': 0.2, 'precision': 0.3, ...}).
1559
+ If None, default weights are applied equally across available metrics.
1560
+ a. equal_weights (standard approach): all metrics are treated as equally important
1561
+ e.g., {"accuracy": 0.2, "precision": 0.2, "recall": 0.2, "f1": 0.2, "roc_auc": 0.2}
1562
+ b. accuracy_focused: when overall classification correctness matters most (e.g., in balanced datasets), accuracy might be weighted more heavily.
1563
+ e.g., {"accuracy": 0.4, "precision": 0.2, "recall": 0.2, "f1": 0.1, "roc_auc": 0.1}
1564
+ c. Precision and Recall Emphasis: In cases where false positives and false negatives are particularly important (such as
1565
+ in medical applications or fraud detection), precision and recall may be weighted more heavily.
1566
+ e.g., {"accuracy": 0.2, "precision": 0.3, "recall": 0.3, "f1": 0.1, "roc_auc": 0.1}
1567
+ d. F1-Focused: When balance between precision and recall is crucial (e.g., in imbalanced datasets)
1568
+ e.g., {"accuracy": 0.2, "precision": 0.2, "recall": 0.2, "f1": 0.3, "roc_auc": 0.1}
1569
+ e. ROC-AUC Emphasis: In some cases, ROC AUC may be prioritized, particularly in classification tasks where class imbalance
1570
+ is present, as ROC AUC accounts for the model's performance across all classification thresholds.
1571
+ e.g., {"accuracy": 0.1, "precision": 0.2, "recall": 0.2, "f1": 0.3, "roc_auc": 0.3}
1572
+
1573
+ - rm_outlier (bool): If True, removes outlier scores (IQR method) before ranking.
1574
+ - plot_ (bool): If True, generates visualizations (bar plot and radar chart).
1575
+ Scores of each metric are min-max normalized to [0, 1] before being combined.
1576
+ The weighted sum of the normalized metrics gives each model's combined_score.
1577
+
1578
+ Returns:
1579
+ - best_model (str): Name of the best model based on the combined metric scores.
1580
+ - scored_df (pd.DataFrame): DataFrame with an added 'combined_score' column used for model selection.
1581
+ - visualizations (dict): A dictionary containing visualizations if `visualize=True`.
1582
+ """
1583
+ from sklearn.preprocessing import MinMaxScaler
1584
+ import seaborn as sns
1585
+ import matplotlib.pyplot as plt
1586
+ from py2ls import plot
1587
+
1588
+ # Check for missing metrics and set default weights if not provided
1589
+ available_metrics = cv_test_scores.columns[1:] # Exclude 'Classifier' column
1590
+ if metric_weights is None:
1591
+ metric_weights = {
1592
+ metric: 1 / len(available_metrics) for metric in available_metrics
1593
+ } # Equal weight if not specified
1594
+ elif metric_weights == "a":
1595
+ metric_weights = {
1596
+ "accuracy": 0.2,
1597
+ "precision": 0.2,
1598
+ "recall": 0.2,
1599
+ "f1": 0.2,
1600
+ "roc_auc": 0.2,
1601
+ }
1602
+ elif metric_weights == "b":
1603
+ metric_weights = {
1604
+ "accuracy": 0.4,
1605
+ "precision": 0.2,
1606
+ "recall": 0.2,
1607
+ "f1": 0.1,
1608
+ "roc_auc": 0.1,
1609
+ }
1610
+ elif metric_weights == "c":
1611
+ metric_weights = {
1612
+ "accuracy": 0.2,
1613
+ "precision": 0.3,
1614
+ "recall": 0.3,
1615
+ "f1": 0.1,
1616
+ "roc_auc": 0.1,
1617
+ }
1618
+ elif metric_weights == "d":
1619
+ metric_weights = {
1620
+ "accuracy": 0.2,
1621
+ "precision": 0.2,
1622
+ "recall": 0.2,
1623
+ "f1": 0.3,
1624
+ "roc_auc": 0.1,
1625
+ }
1626
+ elif metric_weights == "e":
1627
+ metric_weights = {
1628
+ "accuracy": 0.1,
1629
+ "precision": 0.2,
1630
+ "recall": 0.2,
1631
+ "f1": 0.3,
1632
+ "roc_auc": 0.3,
1633
+ }
1634
+ else:
1635
+ metric_weights = {
1636
+ metric: 1 / len(available_metrics) for metric in available_metrics
1637
+ }
1638
+
1639
+ # Normalize weights if they don’t sum to 1
1640
+ total_weight = sum(metric_weights.values())
1641
+ metric_weights = {
1642
+ metric: weight / total_weight for metric, weight in metric_weights.items()
1643
+ }
1644
+ if rm_outlier:
1645
+ cv_test_scores_ = ips.df_outlier(cv_test_scores)
1646
+ else:
1647
+ cv_test_scores_=cv_test_scores
1648
+
1649
+ # Normalize the scores of metrics if normalize is True
1650
+ scaler = MinMaxScaler()
1651
+ normalized_scores = pd.DataFrame(
1652
+ scaler.fit_transform(cv_test_scores_[available_metrics]),
1653
+ columns=available_metrics,
1654
+ )
1655
+ cv_test_scores_ = pd.concat(
1656
+ [cv_test_scores_[["Classifier"]], normalized_scores], axis=1
1657
+ )
1658
+
1659
+ # Calculate weighted scores for each model
1660
+ cv_test_scores_["combined_score"] = sum(
1661
+ cv_test_scores_[metric] * weight for metric, weight in metric_weights.items()
1662
+ )
1663
+ top_models = cv_test_scores_.sort_values(by="combined_score", ascending=False)
1664
+ cv_test_scores = cv_test_scores.loc[top_models.index]
1665
+ top_models.reset_index(drop=True, inplace=True)
1666
+ cv_test_scores.reset_index(drop=True, inplace=True)
1667
+
1668
+ if plot_:
1669
+
1670
+ def generate_bar_plot(ax, cv_test_scores):
1671
+ ax = plot.plotxy(
1672
+ y="Classifier", x="combined_score", data=cv_test_scores, kind="bar"
1673
+ )
1674
+ plt.title("Classifier Performance")
1675
+ plt.tight_layout()
1676
+ return plt
1677
+
1678
+ nexttile = plot.subplot(2, 2, figsize=[10, 7])
1679
+ generate_bar_plot(nexttile(), top_models.dropna())
1680
+ plot.radar(
1681
+ ax=nexttile(projection="polar"),
1682
+ data=cv_test_scores.set_index("Classifier"),
1683
+ ylim=[0.5, 1],
1684
+ color=plot.get_color(10),
1685
+ alpha=0.05,
1686
+ circular=1,
1687
+ )
1688
+ return cv_test_scores
1689
+
1690
+
1691
+ # # Example Usage:
1692
+ # metric_weights = {
1693
+ # "accuracy": 0.2,
1694
+ # "precision": 0.3,
1695
+ # "recall": 0.2,
1696
+ # "f1": 0.2,
1697
+ # "roc_auc": 0.1,
1698
+ # }
1699
+ # cv_test_scores = res["cv_test_scores"].copy()
1700
+ # best_model = rank_models(
1701
+ # cv_test_scores, metric_weights=metric_weights, normalize=True, plot_=True
1702
+ # )
1703
+
1704
+ # figsave("classifier_performance.pdf")
1705
+
1706
+ def predict(
1707
+ x_train: pd.DataFrame,
1708
+ y_train: pd.Series,
1709
+ x_true: pd.DataFrame=None,
1710
+ y_true: Optional[pd.Series] = None,
1711
+ common_features: set = None,
1712
+ purpose: str = "classification", # 'classification' or 'regression'
1713
+ cls: Optional[Dict[str, Any]] = None,
1714
+ metrics: Optional[List[str]] = None,
1715
+ random_state: int = 1,
1716
+ smote: bool = False,
1717
+ n_jobs:int = -1,
1718
+ plot_: bool = True,
1719
+ dir_save:str="./",
1720
+ test_size:float=0.2,# specific only when x_true is None
1721
+ cv_folds:int=5,# more cv_folds gives more stable estimates, but the AUC may come out lower
1722
+ cv_level:str="l",#"s":'low',"m":'medium',"l":"high"
1723
+ class_weight: str = "balanced",
1724
+ verbose:bool=False,
1725
+ ) -> pd.DataFrame:
1726
+ """
1727
+ Three usage patterns: an internal split, direct prediction, and external validation.
1728
+ Usage:
1729
+ (1). predict(x_train, y_train,...) splits x_train into training/test sets and validates on the internal test set.
1730
+ Based on test_size, predict() carves an internal test set out of x_train and y_train, trains the models on the remaining data, and validates them on that test set.
1731
+ (2). predict(x_train, y_train, x_true,...) trains on x_train and y_train and predicts on x_true.
1732
+ Because x_true is supplied, the internal split is skipped and all of x_train / y_train is used for training. Predictions are then made on x_true, but since y_true is not given
1733
+ they cannot be compared against ground truth.
1734
+ (3). predict(x_train, y_train, x_true, y_true,...) trains on x_train and y_train, and validates x_true against the true labels y_true.
1735
+ predict() trains on x_train and y_train and uses x_true as the test set. Because y_true is provided, the predictions can be compared with it,
1736
+ so validation metrics are computed and x_true is genuinely validated.
1737
+ trains and validates a variety of machine learning models for both classification and regression tasks.
1738
+ It supports hyperparameter tuning with grid search and includes additional features like cross-validation,
1739
+ feature scaling, and handling of class imbalance through SMOTE.
1740
+
1741
+ Parameters:
1742
+ - x_train (pd.DataFrame):Training feature data, structured with each row as an observation and each column as a feature.
1743
+ - y_train (pd.Series):Target variable for the training dataset.
1744
+ - x_true (pd.DataFrame, optional):Test feature data. If not provided, the function splits x_train based on test_size.
1745
+ - y_true (pd.Series, optional):Test target values. If not provided, y_train is split into training and testing sets.
1746
+ - common_features (set, optional):Specifies a subset of features common across training and test data.
1747
+ - purpose (str, default = "classification"):Defines whether the task is "classification" or "regression". Determines which
1748
+ metrics and models are applied.
1749
+ - cls (dict, optional):Dictionary to specify custom classifiers/regressors. Defaults to a set of common models if not provided.
1750
+ - metrics (list, optional):List of evaluation metrics (like accuracy, F1 score) used for model evaluation.
1751
+ - random_state (int, default = 1):Random seed to ensure reproducibility.
1752
+ - smote (bool, default = False):Applies Synthetic Minority Oversampling Technique (SMOTE) to address class imbalance if enabled.
1753
+ - n_jobs (int, default = -1):Number of parallel jobs for computation. Set to -1 to use all available cores.
1754
+ - plot_ (bool, default = True):If True, generates plots of the model evaluation metrics.
1755
+ - test_size (float, default = 0.2):Test data proportion if x_true is not provided.
1756
+ - cv_folds (int, default = 5):Number of cross-validation folds.
1757
+ - cv_level (str, default = "l"):Sets the detail level of cross-validation. "s" for low, "m" for medium, and "l" for high.
1758
+ - class_weight (str, default = "balanced"):Balances class weights in classification tasks.
1759
+ - verbose (bool, default = False):If True, prints detailed output during model training.
1760
+ - dir_save (str, default = "./"):Directory path to save plot outputs and results.
1761
+
1762
+ Key Steps in the Function:
1763
+ Model Initialization: Depending on purpose, initializes either classification or regression models.
1764
+ Feature Selection: Ensures training and test sets have matching feature columns.
1765
+ SMOTE Application: Balances classes if smote is enabled and the task is classification.
1766
+ Cross-Validation and Hyperparameter Tuning: Utilizes GridSearchCV for model tuning based on cv_level.
1767
+ Evaluation and Plotting: Outputs evaluation metrics like AUC, confusion matrices, and optional plotting of performance metrics.
1768
+ """
1769
+ from tqdm import tqdm
1770
+ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, ExtraTreesClassifier, ExtraTreesRegressor, BaggingClassifier, BaggingRegressor, AdaBoostClassifier, AdaBoostRegressor
1771
+ from sklearn.svm import SVC, SVR
1772
+ from sklearn.tree import DecisionTreeRegressor
1773
+ from sklearn.linear_model import LogisticRegression, ElasticNet, ElasticNetCV, LinearRegression, Lasso,RidgeClassifierCV, Perceptron, SGDClassifier
1774
+ from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor
1775
+ from sklearn.naive_bayes import GaussianNB,BernoulliNB
1776
+ from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
1777
+ import xgboost as xgb
1778
+ import lightgbm as lgb
1779
+ import catboost as cb
1780
+ from sklearn.neural_network import MLPClassifier, MLPRegressor
1781
+ from sklearn.model_selection import GridSearchCV, StratifiedKFold, KFold
1782
+ from sklearn.discriminant_analysis import LinearDiscriminantAnalysis,QuadraticDiscriminantAnalysis
1783
+ from sklearn.preprocessing import PolynomialFeatures
1784
+
1785
+
1786
+ # spelling check: fuzzy-match the purpose argument against the supported options
1787
+ purpose=ips.strcmp(purpose,['classification','regression'])[0]
1788
+ print(f"{purpose} processing...")
1789
+ # Default models or regressors if not provided
1790
+ if purpose == "classification":
1791
+ model_ = {
1792
+ "Random Forest": RandomForestClassifier(random_state=random_state, class_weight=class_weight),
1793
+
1794
+ # SVC (Support Vector Classification)
1795
+ "SVM": SVC(kernel="rbf",probability=True,class_weight=class_weight,random_state=random_state),
1796
+
1797
+ # fit the best model without enforcing sparsity, which means it does not directly perform feature selection.
1798
+ "Logistic Regression": LogisticRegression(class_weight=class_weight, random_state=random_state),
1799
+
1800
+ # Logistic Regression with L1 Regularization (Lasso)
1801
+ "Lasso Logistic Regression": LogisticRegression(penalty="l1", solver="saga", random_state=random_state),
1802
+ "Gradient Boosting": GradientBoostingClassifier(random_state=random_state),
1803
+ "XGBoost": xgb.XGBClassifier(eval_metric="logloss",random_state=random_state,),
1804
+ "KNN": KNeighborsClassifier(n_neighbors=5),
1805
+ "Naive Bayes": GaussianNB(),
1806
+ "Linear Discriminant Analysis": LinearDiscriminantAnalysis(),
1807
+ "AdaBoost": AdaBoostClassifier(algorithm='SAMME', random_state=random_state),
1808
+ # "LightGBM": lgb.LGBMClassifier(random_state=random_state, class_weight=class_weight),
1809
+ "CatBoost": cb.CatBoostClassifier(verbose=0, random_state=random_state),
1810
+ "Extra Trees": ExtraTreesClassifier(random_state=random_state, class_weight=class_weight),
1811
+ "Bagging": BaggingClassifier(random_state=random_state),
1812
+ "Neural Network": MLPClassifier(max_iter=500, random_state=random_state),
1813
+ "DecisionTree": DecisionTreeClassifier(),
1814
+ "Quadratic Discriminant Analysis": QuadraticDiscriminantAnalysis(),
1815
+ "Ridge": RidgeClassifierCV(class_weight=class_weight, store_cv_results=True),
1816
+ "Perceptron": Perceptron(random_state=random_state),
1817
+ "Bernoulli Naive Bayes": BernoulliNB(),
1818
+ "SGDClassifier": SGDClassifier(random_state=random_state),
1819
+ }
1820
+ elif purpose == "regression":
1821
+ model_ = {
1822
+ "Random Forest": RandomForestRegressor(random_state=random_state),
1823
+ "SVM": SVR(),# SVR (Support Vector Regression)
1824
+ # "Lasso": Lasso(random_state=random_state), # 它和LassoCV相同(必须要提供alpha参数),
1825
+ "LassoCV": LassoCV(cv=cv_folds, random_state=random_state),#LassoCV自动找出最适alpha,优于Lasso
1826
+ "Gradient Boosting": GradientBoostingRegressor(random_state=random_state),
1827
+ "XGBoost": xgb.XGBRegressor(eval_metric="rmse",random_state=random_state),
1828
+ "Linear Regression": LinearRegression(),
1829
+ "Lasso": Lasso(random_state=random_state),
1830
+ "AdaBoost": AdaBoostRegressor(random_state=random_state),
1831
+ # "LightGBM": lgb.LGBMRegressor(random_state=random_state),
1832
+ "CatBoost": cb.CatBoostRegressor(verbose=0, random_state=random_state),
1833
+ "Extra Trees": ExtraTreesRegressor(random_state=random_state),
1834
+ "Bagging": BaggingRegressor(random_state=random_state),
1835
+ "Neural Network": MLPRegressor(max_iter=500, random_state=random_state),
1836
+ "ElasticNet": ElasticNet(random_state=random_state),
1837
+ "Ridge": Ridge(),
1838
+ "KNN":KNeighborsRegressor()
1839
+ }
1840
+ # indicate cls:
1841
+ if ips.run_once_within(30):# 10 min
1842
+ print(f"supported models: {list(model_.keys())}")
1843
+ if cls is None:
1844
+ models=model_
1845
+ else:
1846
+ if not isinstance(cls, list):
1847
+ cls=[cls]
1848
+ models={}
1849
+ for cls_ in cls:
1850
+ cls_ = ips.strcmp(cls_, list(model_.keys()))[0]
1851
+ models[cls_] = model_[cls_]
1852
+ if 'LightGBM' in models:
1853
+ x_train=ips.df_special_characters_cleaner(x_train)
1854
+ x_true=ips.df_special_characters_cleaner(x_true) if x_true is not None else None
1855
+
1856
+ if isinstance(y_train, str) and y_train in x_train.columns:
1857
+ y_train_col_name=y_train
1858
+ y_train=x_train[y_train]
1859
+ y_train=ips.df_encoder(pd.DataFrame(y_train),method='dummy')
1860
+ x_train = x_train.drop(y_train_col_name,axis=1)
1861
+ else:
1862
+ y_train=ips.df_encoder(pd.DataFrame(y_train),method='dummy').values.ravel()
1863
+
1864
+ if x_true is None:
1865
+ x_train, x_true, y_train, y_true = train_test_split(
1866
+ x_train,
1867
+ y_train,
1868
+ test_size=test_size,
1869
+ random_state=random_state,
1870
+ stratify=y_train if purpose == "classification" else None
1871
+ )
1872
+ if isinstance(y_train, str) and y_train in x_train.columns:
1873
+ y_train_col_name=y_train
1874
+ y_train=x_train[y_train]
1875
+ y_train=ips.df_encoder(pd.DataFrame(y_train),method='dummy')
1876
+ x_train = x_train.drop(y_train_col_name,axis=1)
1877
+ else:
1878
+ y_train=ips.df_encoder(pd.DataFrame(y_train),method='dummy').values.ravel()
1879
+ if y_true is not None:
1880
+ if isinstance(y_true, str) and y_true in x_true.columns:
1881
+ y_true_col_name=y_true
1882
+ y_true=x_true[y_true]
1883
+ y_true=ips.df_encoder(pd.DataFrame(y_true),method='dummy')
1884
+ x_true = x_true.drop(y_true_col_name,axis=1)
1885
+ else:
1886
+ y_true=ips.df_encoder(pd.DataFrame(y_true),method='dummy').values.ravel()
1887
+
1888
+ # convert the 2D column-vector format (like [[1], [0], [1], ...]) into a 1D array ([1, 0, 1, ...])
1889
+
1890
+ # y_train=y_train.values.ravel() if y_train is not None else None
1891
+ # y_true=y_true.values.ravel() if y_true is not None else None
1892
+ y_train = y_train.ravel() if isinstance(y_train, np.ndarray) else y_train.values.ravel()
1893
+ y_true = (y_true.ravel() if isinstance(y_true, np.ndarray) else y_true.values.ravel()) if y_true is not None else None
1894
+
1895
+
1896
+ # Ensure common features are selected
1897
+ if common_features is not None:
1898
+ x_train, x_true = x_train[common_features], x_true[common_features]
1899
+ else:
1900
+ share_col_names = ips.shared(x_train.columns, x_true.columns,verbose=verbose)
1901
+ x_train, x_true =x_train[share_col_names], x_true[share_col_names]
1902
+
1903
+ x_train, x_true = ips.df_scaler(x_train), ips.df_scaler(x_true)
1904
+ x_train, x_true = ips.df_encoder(x_train, method="dummy"), ips.df_encoder(
1905
+ x_true, method="dummy"
1906
+ )
1907
+
1908
+ # Handle class imbalance using SMOTE (only for classification)
1909
+ if (
1910
+ smote
1911
+ and purpose == "classification"
1912
+ and y_train.value_counts(normalize=True).max() < 0.8
1913
+ ):
1914
+ from imblearn.over_sampling import SMOTE
1915
+
1916
+ smote_sampler = SMOTE(random_state=random_state)
1917
+ x_train, y_train = smote_sampler.fit_resample(x_train, y_train)
1918
+
1919
+ # Hyperparameter grids for tuning
1920
+ if cv_level in ["low",'simple','s','l']:
1921
+ param_grids = {
1922
+ "Random Forest": {
1923
+ "n_estimators": [100], # One basic option
1924
+ "max_depth": [None, 10],
1925
+ "min_samples_split": [2],
1926
+ "min_samples_leaf": [1],
1927
+ "class_weight": [None],
1928
+ } if purpose == "classification" else {
1929
+ "n_estimators": [100], # One basic option
1930
+ "max_depth": [None, 10],
1931
+ "min_samples_split": [2],
1932
+ "min_samples_leaf": [1],
1933
+ "max_features": [None],
1934
+ "bootstrap": [True], # Only one option for simplicity
1935
+ },
1936
+ "SVM": {
1937
+ "C": [1],
1938
+ "gamma": ['scale'],
1939
+ "kernel": ['rbf'],
1940
+ },
1941
+ "Lasso": {
1942
+ "alpha": [0.1],
1943
+ },
1944
+ "LassoCV": {
1945
+ "alphas": [[0.1]],
1946
+ },
1947
+ "Logistic Regression": {
1948
+ "C": [1],
1949
+ "solver": ['lbfgs'],
1950
+ "penalty": ['l2'],
1951
+ "max_iter": [500],
1952
+ },
1953
+ "Gradient Boosting": {
1954
+ "n_estimators": [100],
1955
+ "learning_rate": [0.1],
1956
+ "max_depth": [3],
1957
+ "min_samples_split": [2],
1958
+ "subsample": [0.8],
1959
+ },
1960
+ "XGBoost": {
1961
+ "n_estimators": [100],
1962
+ "max_depth": [3],
1963
+ "learning_rate": [0.1],
1964
+ "subsample": [0.8],
1965
+ "colsample_bytree": [0.8],
1966
+ },
1967
+ "KNN": {
1968
+ "n_neighbors": [3],
1969
+ "weights": ['uniform'],
1970
+ "algorithm": ['auto'],
1971
+ "p": [2],
1972
+ } if purpose == 'classification' else {
1973
+ 'n_neighbors': [3],
1974
+ 'weights': ['uniform'],
1975
+ 'metric': ['euclidean'],
1976
+ 'leaf_size': [30],
1977
+ 'p': [2],
1978
+ },
1979
+ "Naive Bayes": {
1980
+ "var_smoothing": [1e-9],
1981
+ },
1982
+ "SVR": {
1983
+ "C": [1],
1984
+ "gamma": ['scale'],
1985
+ "kernel": ['rbf'],
1986
+ },
1987
+ "Linear Regression": {
1988
+ "fit_intercept": [True],
1989
+ },
1990
+ "Extra Trees": {
1991
+ "n_estimators": [100],
1992
+ "max_depth": [None, 10],
1993
+ "min_samples_split": [2],
1994
+ "min_samples_leaf": [1],
1995
+ },
1996
+ "CatBoost": {
1997
+ "iterations": [100],
1998
+ "learning_rate": [0.1],
1999
+ "depth": [3],
2000
+ "l2_leaf_reg": [1],
2001
+ },
2002
+ "LightGBM": {
2003
+ "n_estimators": [100],
2004
+ "num_leaves": [31],
2005
+ "max_depth": [10],
2006
+ 'min_data_in_leaf': [20],
2007
+ 'min_gain_to_split': [0.01],
2008
+ 'scale_pos_weight': [10],
2009
+ },
2010
+ "Bagging": {
2011
+ "n_estimators": [50],
2012
+ "max_samples": [0.7],
2013
+ "max_features": [0.7],
2014
+ },
2015
+ "Neural Network": {
2016
+ "hidden_layer_sizes": [(50,)],
2017
+ "activation": ["relu"],
2018
+ "solver": ["adam"],
2019
+ "alpha": [0.0001],
2020
+ },
2021
+ "Decision Tree": {
2022
+ "max_depth": [None, 10],
2023
+ "min_samples_split": [2],
2024
+ "min_samples_leaf": [1],
2025
+ "criterion": ["gini"],
2026
+ },
2027
+ "AdaBoost": {
2028
+ "n_estimators": [50],
2029
+ "learning_rate": [0.5],
2030
+ },
2031
+ "Linear Discriminant Analysis": {
2032
+ "solver": ["svd"],
2033
+ "shrinkage": [None],
2034
+ },
2035
+ "Quadratic Discriminant Analysis": {
2036
+ 'reg_param': [0.0],
2037
+ 'priors': [None],
2038
+ 'tol': [1e-4],
2039
+ },
2040
+ "Ridge": {'class_weight': [None, 'balanced']} if purpose == "classification" else {
2041
+ 'alpha': [0.1, 1, 10],
2042
+ },
2043
+ "Perceptron": {
2044
+ 'alpha': [1e-3],
2045
+ 'penalty': ['l2'],
2046
+ 'max_iter': [1000],
2047
+ 'eta0': [1.0],
2048
+ },
2049
+ "Bernoulli Naive Bayes": {
2050
+ 'alpha': [0.1, 1, 10],
2051
+ 'binarize': [0.0],
2052
+ 'fit_prior': [True],
2053
+ },
2054
+ "SGDClassifier": {
2055
+ 'eta0': [0.01],
2056
+ 'loss': ['hinge'],
2057
+ 'penalty': ['l2'],
2058
+ 'alpha': [1e-3],
2059
+ 'max_iter': [1000],
2060
+ 'tol': [1e-3],
2061
+ 'random_state': [random_state],
2062
+ 'learning_rate': ['constant'],
2063
+ },
2064
+ }
2065
+ elif cv_level in ['high','advanced','h']:
2066
+ param_grids = {
2067
+ "Random Forest": {
2068
+ "n_estimators": [100, 200, 500, 700, 1000],
2069
+ "max_depth": [None, 3, 5, 10, 15, 20, 30],
2070
+ "min_samples_split": [2, 5, 10, 20],
2071
+ "min_samples_leaf": [1, 2, 4],
2072
+ "class_weight": [None, "balanced"] if purpose == "classification" else {},
2073
+ } if purpose == "classification" else {
2074
+ "n_estimators": [100, 200, 500, 700, 1000],
2075
+ "max_depth": [None, 3, 5, 10, 15, 20, 30],
2076
+ "min_samples_split": [2, 5, 10, 20],
2077
+ "min_samples_leaf": [1, 2, 4],
2078
+ "max_features": ['auto', 'sqrt', 'log2'], # Number of features to consider when looking for the best split
2079
+ "bootstrap": [True, False], # Whether bootstrap samples are used when building trees
2080
+ },
2081
+ "SVM": {
2082
+ "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
2083
+ "gamma": ["scale", "auto", 0.001, 0.01, 0.1],
2084
+ "kernel": ["linear", "rbf", "poly"],
2085
+ },
2086
+ "Logistic Regression": {
2087
+ "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
2088
+ "solver": ["liblinear", "saga", "newton-cg", "lbfgs"],
2089
+ "penalty": ["l1", "l2", "elasticnet"],
2090
+ "max_iter": [100, 200, 300, 500],
2091
+ },
2092
+ "Lasso":{
2093
+ "alpha": [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
2094
+ "max_iter": [500, 1000, 2000, 5000],
2095
+ "tol": [1e-4, 1e-5, 1e-6],
2096
+ "selection": ["cyclic", "random"]
2097
+ },
2098
+ "LassoCV":{
2099
+ "alphas": [[0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0]],
2100
+ "max_iter": [500, 1000, 2000, 5000],
2101
+ "cv": [3, 5, 10],
2102
+ "tol": [1e-4, 1e-5, 1e-6]
2103
+ },
2104
+ "Gradient Boosting": {
2105
+ "n_estimators": [100, 200, 300, 400, 500, 700, 1000],
2106
+ "learning_rate": [0.001, 0.01, 0.1, 0.2, 0.3, 0.5],
2107
+ "max_depth": [3, 5, 7, 9, 15],
2108
+ "min_samples_split": [2, 5, 10, 20],
2109
+ "subsample": [0.8, 1.0],
2110
+ },
2111
+ "XGBoost": {
2112
+ "n_estimators": [100, 200, 500, 700],
2113
+ "max_depth": [3, 5, 7, 10],
2114
+ "learning_rate": [0.01, 0.1, 0.2, 0.3],
2115
+ "subsample": [0.8, 1.0],
2116
+ "colsample_bytree": [0.8, 0.9, 1.0],
2117
+ },
2118
+ "KNN": {
2119
+ "n_neighbors": [1, 3, 5, 10, 15, 20],
2120
+ "weights": ["uniform", "distance"],
2121
+ "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
2122
+ "p": [1, 2], # 1 for Manhattan, 2 for Euclidean distance
2123
+ } if purpose=='classification' else {
2124
+ 'n_neighbors': [3, 5, 7, 9, 11], # Number of neighbors
2125
+ 'weights': ['uniform', 'distance'], # Weight function used in prediction
2126
+ 'metric': ['euclidean', 'manhattan', 'minkowski'], # Distance metric
2127
+ 'leaf_size': [20, 30, 40, 50], # Leaf size for KDTree or BallTree algorithms
2128
+ 'p': [1, 2] # Power parameter for the Minkowski metric (1 = Manhattan, 2 = Euclidean)
2129
+ },
2130
+ "Naive Bayes": {
2131
+ "var_smoothing": [1e-10, 1e-9, 1e-8, 1e-7],
2132
+ },
2133
+ "AdaBoost": {
2134
+ "n_estimators": [50, 100, 200, 300, 500],
2135
+ "learning_rate": [0.001, 0.01, 0.1, 0.5, 1.0],
2136
+ },
2137
+ "SVR": {
2138
+ "C": [0.01, 0.1, 1, 10, 100, 1000],
2139
+ "gamma": [0.001, 0.01, 0.1, "scale", "auto"],
2140
+ "kernel": ["linear", "rbf", "poly"],
2141
+ },
2142
+ "Linear Regression": {
2143
+ "fit_intercept": [True, False],
2144
+ },
2145
+ "Lasso":{
2146
+ "alpha": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
2147
+ "max_iter": [1000, 2000] # Higher iteration limit for fine-tuning
2148
+ },
2149
+ "Extra Trees": {
2150
+ "n_estimators": [100, 200, 500, 700, 1000],
2151
+ "max_depth": [None, 5, 10, 15, 20, 30],
2152
+ "min_samples_split": [2, 5, 10, 20],
2153
+ "min_samples_leaf": [1, 2, 4]
2154
+ },
2155
+ "CatBoost": {
2156
+ "iterations": [100, 200, 500],
2157
+ "learning_rate": [0.001, 0.01, 0.1, 0.2],
2158
+ "depth": [3, 5, 7, 10],
2159
+ "l2_leaf_reg": [1, 3, 5, 7, 10],
2160
+ "border_count": [32, 64, 128],
2161
+ },
2162
+ "LightGBM": {
2163
+ "n_estimators": [100, 200, 500, 700, 1000],
2164
+ "learning_rate": [0.001, 0.01, 0.1, 0.2],
2165
+ "num_leaves": [31, 50, 100, 200],
2166
+ "max_depth": [-1, 5, 10, 20, 30],
2167
+ "min_child_samples": [5, 10, 20],
2168
+ "subsample": [0.8, 1.0],
2169
+ "colsample_bytree": [0.8, 0.9, 1.0],
2170
+ },
2171
+ "Neural Network": {
2172
+ "hidden_layer_sizes": [(50,), (100,), (100, 50), (200, 100)],
2173
+ "activation": ["relu", "tanh", "logistic"],
2174
+ "solver": ["adam", "sgd", "lbfgs"],
2175
+ "alpha": [0.0001, 0.001, 0.01],
2176
+ "learning_rate": ["constant", "adaptive"],
2177
+ },
2178
+ "Decision Tree": {
2179
+ "max_depth": [None, 5, 10, 20, 30],
2180
+ "min_samples_split": [2, 5, 10, 20],
2181
+ "min_samples_leaf": [1, 2, 5, 10],
2182
+ "criterion": ["gini", "entropy"],
2183
+ "splitter": ["best", "random"],
2184
+ },
2185
+ "Linear Discriminant Analysis": {
2186
+ "solver": ["svd", "lsqr", "eigen"],
2187
+ "shrinkage": [None, "auto", 0.1, 0.5, 1.0], # shrinkage levels for 'lsqr' and 'eigen'
2188
+ },
2189
+ 'Ridge': {'class_weight': [None, 'balanced']} if purpose == "classification" else {
2190
+ 'alpha': [0.1, 1, 10, 100, 1000],
2191
+ 'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'lbfgs'],
2192
+ 'fit_intercept': [True, False], # Whether to calculate the intercept
2193
+ 'normalize': [True, False] # If True, the regressors X will be normalized
2194
+ }
2195
+ }
2196
+ else: # median level
2197
+ param_grids = {
2198
+ "Random Forest": {
2199
+ "n_estimators": [100, 200, 500],
2200
+ "max_depth": [None, 10, 20, 30],
2201
+ "min_samples_split": [2, 5, 10],
2202
+ "min_samples_leaf": [1, 2, 4],
2203
+ "class_weight": [None, "balanced"]
2204
+ } if purpose == "classification" else {
2205
+ "n_estimators": [100, 200, 500],
2206
+ "max_depth": [None, 10, 20, 30],
2207
+ "min_samples_split": [2, 5, 10],
2208
+ "min_samples_leaf": [1, 2, 4],
2209
+ "max_features": ['auto', 'sqrt', 'log2'], # Number of features to consider when looking for the best split
2210
+ "bootstrap": [True, False], # Whether bootstrap samples are used when building trees
2211
+ },
2212
+ "SVM": {
2213
+ "C": [0.1, 1, 10, 100], # Regularization strength
2214
+ "gamma": ['scale', 'auto'], # Common gamma values
2215
+ "kernel": ['rbf', 'linear', 'poly'],
2216
+ },
2217
+ "Logistic Regression": {
2218
+ "C": [0.1, 1, 10, 100], # Regularization strength
2219
+ "solver": ['lbfgs', 'liblinear', 'saga'], # Common solvers
2220
+ "penalty": ['l2'], # L2 penalty is most common
2221
+ "max_iter": [500, 1000, 2000], # Increased max_iter for better convergence
2222
+ },
2223
+ "Lasso":{
2224
+ "alpha": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
2225
+ "max_iter": [500, 1000, 2000]
2226
+ },
2227
+ "LassoCV":{
2228
+ "alphas": [[0.001, 0.01, 0.1, 1.0, 10.0, 100.0]],
2229
+ "max_iter": [500, 1000, 2000]
2230
+ },
2231
+ "Gradient Boosting": {
2232
+ "n_estimators": [100, 200, 500],
2233
+ "learning_rate": [0.01, 0.1, 0.2],
2234
+ "max_depth": [3, 5, 7],
2235
+ "min_samples_split": [2, 5, 10],
2236
+ "subsample": [0.8, 1.0],
2237
+ },
2238
+ "XGBoost": {
2239
+ "n_estimators": [100, 200, 500],
2240
+ "max_depth": [3, 5, 7],
2241
+ "learning_rate": [0.01, 0.1, 0.2],
2242
+ "subsample": [0.8, 1.0],
2243
+ "colsample_bytree": [0.8, 1.0],
2244
+ },
2245
+ "KNN": {
2246
+ "n_neighbors": [3, 5, 7, 10],
2247
+ "weights": ['uniform', 'distance'],
2248
+ "algorithm": ['auto', 'ball_tree', 'kd_tree', 'brute'],
2249
+ "p": [1, 2],
2250
+ } if purpose=='classification' else {
2251
+ 'n_neighbors': [3, 5, 7, 9, 11], # Number of neighbors
2252
+ 'weights': ['uniform', 'distance'], # Weight function used in prediction
2253
+ 'metric': ['euclidean', 'manhattan', 'minkowski'], # Distance metric
2254
+ 'leaf_size': [20, 30, 40, 50], # Leaf size for KDTree or BallTree algorithms
2255
+ 'p': [1, 2] # Power parameter for the Minkowski metric (1 = Manhattan, 2 = Euclidean)
2256
+ },
2257
+ "Naive Bayes": {
2258
+ "var_smoothing": [1e-9, 1e-8, 1e-7],
2259
+ },
2260
+ "SVR": {
2261
+ "C": [0.1, 1, 10, 100],
2262
+ "gamma": ['scale', 'auto'],
2263
+ "kernel": ['rbf', 'linear'],
2264
+ },
2265
+ "Linear Regression": {
2266
+ "fit_intercept": [True, False],
2267
+ },
2268
+ "Lasso": {
2269
+ "alpha": [0.1, 1.0, 10.0],
2270
+ "max_iter": [1000, 2000], # Sufficient iterations for convergence
2271
+ },
2272
+ "Extra Trees": {
2273
+ "n_estimators": [100, 200, 500],
2274
+ "max_depth": [None, 10, 20, 30],
2275
+ "min_samples_split": [2, 5, 10],
2276
+ "min_samples_leaf": [1, 2, 4],
2277
+ },
2278
+ "CatBoost": {
2279
+ "iterations": [100, 200],
2280
+ "learning_rate": [0.01, 0.1],
2281
+ "depth": [3, 6, 10],
2282
+ "l2_leaf_reg": [1, 3, 5, 7],
2283
+ },
2284
+ "LightGBM": {
2285
+ "n_estimators": [100, 200, 500],
2286
+ "learning_rate": [0.01, 0.1],
2287
+ "num_leaves": [31, 50, 100],
2288
+ "max_depth": [-1, 10, 20],
2289
+ 'min_data_in_leaf': [20], # Minimum samples in each leaf
2290
+ 'min_gain_to_split': [0.01], # Minimum gain to allow a split
2291
+ 'scale_pos_weight': [10], # Address class imbalance
2292
+ },
2293
+ "Bagging": {
2294
+ "n_estimators": [10, 50, 100],
2295
+ "max_samples": [0.5, 0.7, 1.0],
2296
+ "max_features": [0.5, 0.7, 1.0],
2297
+ },
2298
+ "Neural Network": {
2299
+ "hidden_layer_sizes": [(50,), (100,), (100, 50)],
2300
+ "activation": ["relu", "tanh"],
2301
+ "solver": ["adam", "sgd"],
2302
+ "alpha": [0.0001, 0.001],
2303
+ },
2304
+ "Decision Tree": {
2305
+ "max_depth": [None, 10, 20],
2306
+ "min_samples_split": [2, 10],
2307
+ "min_samples_leaf": [1, 4],
2308
+ "criterion": ["gini", "entropy"],
2309
+ },
2310
+ "AdaBoost": {
2311
+ "n_estimators": [50, 100],
2312
+ "learning_rate": [0.5, 1.0],
2313
+ },
2314
+ "Linear Discriminant Analysis": {
2315
+ "solver": ["svd", "lsqr", "eigen"],
2316
+ "shrinkage": [None, "auto"],
2317
+ }, "Quadratic Discriminant Analysis":{
2318
+ 'reg_param': [0.0, 0.1, 0.5, 1.0], # Regularization parameter
2319
+ 'priors': [None, [0.5, 0.5], [0.3, 0.7]], # Class priors
2320
+ 'tol': [1e-4, 1e-3, 1e-2] # Tolerance value for the convergence of the algorithm
2321
+ },
2322
+ "Perceptron":{
2323
+ 'alpha': [1e-4, 1e-3, 1e-2], # Regularization parameter
2324
+ 'penalty': ['l2', 'l1', 'elasticnet'], # Regularization penalty
2325
+ 'max_iter': [1000, 2000], # Maximum number of iterations
2326
+ 'eta0': [1.0, 0.1], # Learning rate for gradient descent
2327
+ 'tol': [1e-3, 1e-4, 1e-5], # Tolerance for stopping criteria
2328
+ 'random_state': [random_state] # Random state for reproducibility
2329
+ },
2330
+ "Bernoulli Naive Bayes":{
2331
+ 'alpha': [0.1, 1.0, 10.0], # Additive (Laplace) smoothing parameter
2332
+ 'binarize': [0.0, 0.5, 1.0], # Threshold for binarizing the input features
2333
+ 'fit_prior': [True, False] # Whether to learn class prior probabilities
2334
+ },
2335
+ "SGDClassifier":{
2336
+ 'eta0': [0.01, 0.1, 1.0],
2337
+ 'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'], # Loss function
2338
+ 'penalty': ['l2', 'l1', 'elasticnet'], # Regularization penalty
2339
+ 'alpha': [1e-4, 1e-3, 1e-2], # Regularization strength
2340
+ 'l1_ratio': [0.15, 0.5, 0.85], # L1 ratio for elasticnet penalty
2341
+ 'max_iter': [1000, 2000], # Maximum number of iterations
2342
+ 'tol': [1e-3, 1e-4], # Tolerance for stopping criteria
2343
+ 'random_state': [random_state], # Random state for reproducibility
2344
+ 'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'], # Learning rate schedule
2345
+ },
2346
+ 'Ridge': {'class_weight': [None, 'balanced']} if purpose == "classification" else {
2347
+ 'alpha': [0.1, 1, 10, 100],
2348
+ 'solver': ['auto', 'svd', 'cholesky', 'lsqr'] # Solver for optimization
2349
+ }
2350
+ }
2351
+
2352
+ results = {}
2353
+ # Use StratifiedKFold for classification and KFold for regression
2354
+ cv = (
2355
+ StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_state)
2356
+ if purpose == "classification"
2357
+ else KFold(n_splits=cv_folds, shuffle=True, random_state=random_state)
2358
+ )
2359
+
2360
+ # Train and validate each model
2361
+ for name, clf in tqdm(
2362
+ models.items(),
2363
+ desc="models",
2364
+ colour="green",
2365
+ bar_format="{l_bar}{bar} {n_fmt}/{total_fmt}",
2366
+ ):
2367
+ if verbose:
2368
+ print(f"\nTraining and validating {name}:")
2369
+
2370
+ # Grid search with KFold or StratifiedKFold
2371
+ gs = GridSearchCV(
2372
+ clf,
2373
+ param_grid=param_grids.get(name, {}),
2374
+ scoring=(
2375
+ "roc_auc" if purpose == "classification" else "neg_mean_squared_error"
2376
+ ),
2377
+ cv=cv,
2378
+ n_jobs=n_jobs,
2379
+ verbose=verbose,
2380
+ )
2381
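+ # NOTE: scoring="roc_auc" requires predict_proba or decision_function on the estimator;
+ # classifiers providing neither would fail inside the grid search.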
+ gs.fit(x_train, y_train)
2382
+ best_clf = gs.best_estimator_
2383
+ # make sure x_true has the same columns (and order) as x_train; missing columns are filled with 0
2384
+ x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
2385
+ y_pred = best_clf.predict(x_true)
2386
+
2387
+ # y_pred_proba
2388
+ if hasattr(best_clf, "predict_proba"):
2389
+ y_pred_proba = best_clf.predict_proba(x_true)[:, 1]
2390
+ elif hasattr(best_clf, "decision_function"):
2391
+ # If predict_proba is not available, use decision_function (e.g., for SVM)
2392
+ y_pred_proba = best_clf.decision_function(x_true)
2393
+ # Ensure y_pred_proba is within 0 and 1 bounds
2394
+ y_pred_proba = (y_pred_proba - y_pred_proba.min()) / (
2395
+ y_pred_proba.max() - y_pred_proba.min()
2396
+ )
2397
+ else:
2398
+ y_pred_proba = None # No probability output for certain models
2399
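+ # NOTE: the min-max rescaling of decision_function scores above only forces them into
+ # [0, 1]; they are not calibrated probabilities (CalibratedClassifierCV would be needed for that).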
+
2400
+
2401
+ validation_scores = {}
2402
+ if y_true is not None:
2403
+ validation_scores = cal_metrics(y_true, y_pred, y_pred_proba=y_pred_proba, purpose=purpose, average="weighted")
2404
+
2405
+ # Calculate ROC curve
2406
+ # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
2407
+ if y_pred_proba is not None:
2408
+ # fpr, tpr, roc_auc = dict(), dict(), dict()
2409
+ fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
2410
+ lower_ci, upper_ci = cal_auc_ci(y_true, y_pred_proba, verbose=False)
2411
+ roc_auc = auc(fpr, tpr)
2412
+ roc_info = {
2413
+ "fpr": fpr.tolist(),
2414
+ "tpr": tpr.tolist(),
2415
+ "auc": roc_auc,
2416
+ "ci95": (lower_ci, upper_ci),
2417
+ }
2418
+ # precision-recall curve
2419
+ precision_, recall_, _ = precision_recall_curve(y_true, y_pred_proba)
2420
+ avg_precision_ = average_precision_score(y_true, y_pred_proba)
2421
+ pr_info = {
2422
+ "precision": precision_,
2423
+ "recall": recall_,
2424
+ "avg_precision": avg_precision_,
2425
+ }
2426
+ else:
2427
+ roc_info, pr_info = None, None
2428
+ if purpose=="classification":
2429
+ results[name] = {
2430
+ "best_clf": gs.best_estimator_,
2431
+ "best_params": gs.best_params_,
2432
+ "auc_indiv":[gs.cv_results_[f'split{i}_test_score'][gs.best_index_] for i in range(cv_folds)],
2433
+ "scores": validation_scores,
2434
+ "roc_curve": roc_info,
2435
+ "pr_curve": pr_info,
2436
+ "confusion_matrix": confusion_matrix(y_true, y_pred),
2437
+ "predictions": y_pred.tolist(),
2438
+ "predictions_proba": (
2439
+ y_pred_proba.tolist() if y_pred_proba is not None else None
2440
+ ),
2441
+ }
2442
+ else: # "regression"
2443
+ results[name] = {
2444
+ "best_clf": gs.best_estimator_,
2445
+ "best_params": gs.best_params_,
2446
+ "scores": validation_scores, # e.g., neg_MSE, R², etc.
2447
+ "predictions": y_pred.tolist(),
2448
+ "predictions_proba": (
2449
+ y_pred_proba.tolist() if y_pred_proba is not None else None
2450
+ ),
2451
+ }
2452
+
2453
+ else: # y_true is None: no ground truth available, keep predictions only
2454
+ results[name] = {
2455
+ "best_clf": gs.best_estimator_,
2456
+ "best_params": gs.best_params_,
2457
+ "scores": validation_scores,
2458
+ "predictions": y_pred.tolist(),
2459
+ "predictions_proba": (
2460
+ y_pred_proba.tolist() if y_pred_proba is not None else None
2461
+ ),
2462
+ }
2463
+
2464
+ # Convert results to DataFrame
2465
+ df_results = pd.DataFrame.from_dict(results, orient="index")
2466
+
2467
+ # sort models by ROC-AUC (only when y_true is known and the task is classification)
2468
+ if y_true is not None and purpose == "classification":
2469
+ df_scores = pd.DataFrame(
2470
+ df_results["scores"].tolist(), index=df_results["scores"].index
2471
+ ).sort_values(by="roc_auc", ascending=False)
2472
+ df_results = df_results.loc[df_scores.index]
2473
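+ # Models whose scores lack "roc_auc" (no probability-like output) end up as NaN in
+ # df_scores and are sorted to the bottom by sort_values(..., ascending=False).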
+
2474
+ if plot_:
2475
+ from datetime import datetime
2476
+ now_ = datetime.now().strftime("%y%m%d_%H%M%S")
2477
+ nexttile = plot.subplot(figsize=[12, 10])
2478
+ plot.heatmap(df_scores, kind="direct", ax=nexttile())
2479
+ plot.figsets(xangle=30)
2480
+ if dir_save:
2481
+ ips.figsave(dir_save + f"scores_sorted_heatmap{now_}.pdf")
2482
+ if df_scores.shape[0] > 1: # draw clustered heatmap when more than one model is scored
2483
+ plot.heatmap(df_scores, kind="direct", cluster=True)
2484
+ plot.figsets(xangle=30)
2485
+ if dir_save:
2486
+ ips.figsave(dir_save + f"scores_clus{now_}.pdf")
2487
+ if all([plot_, y_true is not None, purpose == "classification"]):
2488
+ try:
2489
+ if len(models) > 3:
2490
+ plot_validate_features(df_results)
2491
+ else:
2492
+ plot_validate_features_single(df_results, figsize=(12, 4 * len(models)))
2493
+ if dir_save:
2494
+ ips.figsave(dir_save + f"validate_features{now_}.pdf")
2495
+ except Exception as e:
2496
+ print(f"Error: a problem occurred while plotting the validation figures: {e}")
2497
+ return df_results
2498
+
2499
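+ # Usage sketch (assumes the validation routine above was called and returned `df_res`;
+ # the variable names below are illustrative only):
+ # best_name = df_res.index[0] # rows are sorted by ROC-AUC for classification
+ # best_model = df_res.loc[best_name, "best_clf"] # refitted best estimator from GridSearchCV
+ # best_params = df_res.loc[best_name, "best_params"]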
+
2500
+ def cal_metrics(y_true, y_pred, y_pred_proba=None, purpose="regression", average="weighted"):
2501
+ """
2502
+ Calculate regression or classification metrics based on the purpose.
2503
+
2504
+ Parameters:
2505
+ - y_true: Array of true values.
2506
+ - y_pred: Array of predicted labels for classification or predicted values for regression.
2507
+ - y_pred_proba: Array of predicted probabilities for classification (optional).
2508
+ - purpose: str, "regression" or "classification".
2509
+ - average: str, averaging method for multi-class classification ("binary", "micro", "macro", "weighted", etc.).
2510
+
2511
+ Returns:
2512
+ - validation_scores: dict of computed metrics.
2513
+ """
2514
+ from sklearn.metrics import (
2515
+ mean_squared_error,
2516
+ mean_absolute_error,
2517
+ mean_absolute_percentage_error,
2518
+ explained_variance_score,
2519
+ r2_score,
2520
+ mean_squared_log_error,
2521
+ accuracy_score,
2522
+ precision_score,
2523
+ recall_score,
2524
+ f1_score,
2525
+ roc_auc_score,
2526
+ matthews_corrcoef,
2527
+ confusion_matrix,
2528
+ balanced_accuracy_score,
2529
+ average_precision_score,
2530
+ precision_recall_curve
2531
+ )
2532
+ validation_scores = {}
2533
+
2534
+ if purpose == "regression":
2535
+ y_true = np.asarray(y_true)
2536
+ y_true = y_true.ravel()
2537
+ y_pred = np.asarray(y_pred)
2538
+ y_pred = y_pred.ravel()
2539
+ # Regression metrics
2540
+ validation_scores = {
2541
+ "mse": mean_squared_error(y_true, y_pred),
2542
+ "rmse": np.sqrt(mean_squared_error(y_true, y_pred)),
2543
+ "mae": mean_absolute_error(y_true, y_pred),
2544
+ "r2": r2_score(y_true, y_pred),
2545
+ "mape": mean_absolute_percentage_error(y_true, y_pred),
2546
+ "explained_variance": explained_variance_score(y_true, y_pred),
2547
+ "mbd": np.mean(y_pred - y_true) # Mean Bias Deviation
2548
+ }
2549
+ # Check if MSLE can be calculated
2550
+ if np.all(y_true >= 0) and np.all(y_pred >= 0): # Ensure no negative values
2551
+ validation_scores["msle"] = mean_squared_log_error(y_true, y_pred)
2552
+ else:
2553
+ validation_scores["msle"] = "Cannot be calculated due to negative values"
2554
+
2555
+ elif purpose == "classification":
2556
+ # Classification metrics
2557
+ validation_scores = {
2558
+ "accuracy": accuracy_score(y_true, y_pred),
2559
+ "precision": precision_score(y_true, y_pred, average=average),
2560
+ "recall": recall_score(y_true, y_pred, average=average),
2561
+ "f1": f1_score(y_true, y_pred, average=average),
2562
+ "mcc": matthews_corrcoef(y_true, y_pred),
2563
+ "specificity": None,
2564
+ "balanced_accuracy": balanced_accuracy_score(y_true, y_pred)
2565
+ }
2566
+
2567
+ # Confusion matrix to calculate specificity
2568
+ tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
2569
+ validation_scores["specificity"] = tn / (tn + fp) if (tn + fp) > 0 else 0 # Specificity calculation
2570
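+ # NOTE: unpacking confusion_matrix(...).ravel() into (tn, fp, fn, tp) only works for binary
+ # targets; multi-class y_true would yield more than four cells and raise a ValueError here.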
+
2571
+ if y_pred_proba is not None:
2572
+ # Calculate ROC-AUC
2573
+ validation_scores["roc_auc"] = roc_auc_score(y_true, y_pred_proba)
2574
+ # PR-AUC (Precision-Recall AUC) calculation
2575
+ validation_scores["pr_auc"] = average_precision_score(y_true, y_pred_proba)
2576
+ else:
2577
+ raise ValueError("Invalid purpose specified. Choose 'regression' or 'classification'.")
2578
+
2579
+ return validation_scores
2580
+
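+ # Usage sketch with made-up binary labels (illustration only, not part of the module API):
+ # scores = cal_metrics(
+ # y_true=[0, 1, 1, 0], y_pred=[0, 1, 0, 0], y_pred_proba=[0.2, 0.9, 0.4, 0.1],
+ # purpose="classification", average="binary",
+ # )
+ # scores["roc_auc"], scores["specificity"] # -> 1.0, 1.0 for this toy example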