py2ls 0.2.4.24__py3-none-any.whl → 0.2.4.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/ml2ls copy.py DELETED
@@ -1,2906 +0,0 @@
1
- from sklearn.ensemble import (
2
- RandomForestClassifier,
3
- GradientBoostingClassifier,
4
- AdaBoostClassifier,
5
- BaggingClassifier,
6
- )
7
- from sklearn.svm import SVC, SVR
8
- from sklearn.calibration import CalibratedClassifierCV
9
- from sklearn.model_selection import GridSearchCV, StratifiedKFold
10
- from sklearn.linear_model import (
11
- LassoCV,
12
- LogisticRegression,
13
- LinearRegression,
14
- Lasso,
15
- Ridge,
16
- RidgeClassifierCV,
17
- ElasticNet,
18
- )
19
- from sklearn.feature_selection import RFE
20
- from sklearn.naive_bayes import GaussianNB
21
- from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
22
- import xgboost as xgb # Make sure you have xgboost installed
23
-
24
- from sklearn.model_selection import train_test_split, cross_val_score
25
- from sklearn.metrics import (
26
- accuracy_score,
27
- precision_score,
28
- recall_score,
29
- f1_score,
30
- roc_auc_score,
31
- confusion_matrix,
32
- matthews_corrcoef,
33
- roc_curve,
34
- auc,
35
- balanced_accuracy_score,
36
- precision_recall_curve,
37
- average_precision_score,
38
- )
39
- from imblearn.over_sampling import SMOTE
40
- from sklearn.pipeline import Pipeline
41
- from collections import defaultdict
42
- from sklearn.preprocessing import StandardScaler, OneHotEncoder
43
- from typing import Dict, Any, Optional, List, Union
44
- import numpy as np
45
- import pandas as pd
46
- from . import ips
47
- from . import plot
48
- import matplotlib.pyplot as plt
49
- import seaborn as sns
50
-
51
- plt.style.use(str(ips.get_cwd()) + "/data/styles/stylelib/paper.mplstyle")
52
- import logging
53
- import warnings
54
-
55
- logging.basicConfig(
56
- level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
57
- )
58
- logger = logging.getLogger()
59
-
60
- # Ignore specific warnings (UserWarning in this case)
61
- warnings.filterwarnings("ignore", category=UserWarning)
62
- from sklearn.tree import DecisionTreeClassifier
63
- from sklearn.neighbors import KNeighborsClassifier
64
-
65
-
66
- def features_knn(
67
- x_train: pd.DataFrame, y_train: pd.Series, knn_params: dict
68
- ) -> pd.DataFrame:
69
- """
70
- A distance-based classifier that assigns labels based on the majority label of nearest neighbors.
71
- when to use:
72
- Effective for small to medium datasets with a low number of features.
73
- It does not directly provide feature importances but can be assessed through feature permutation or similar methods.
74
- Recommended Use: Effective for datasets with low feature dimensionality and well-separated clusters.
75
-
76
- Fits KNeighborsClassifier and approximates feature influence using permutation importance.
77
- """
78
- knn = KNeighborsClassifier(**knn_params)
79
- knn.fit(x_train, y_train)
80
- importances = permutation_importance(
81
- knn, x_train, y_train, n_repeats=30, random_state=1, scoring="accuracy"
82
- )
83
- return pd.DataFrame(
84
- {"feature": x_train.columns, "importance": importances.importances_mean}
85
- ).sort_values(by="importance", ascending=False)
86
-
87
-
88
- #! 1. Linear and Regularized Regression Methods
89
- # 1.1 Lasso
90
- def features_lasso(
91
- x_train: pd.DataFrame, y_train: pd.Series, lasso_params: dict
92
- ) -> np.ndarray:
93
- """
94
- Lasso (Least Absolute Shrinkage and Selection Operator):
95
- A regularized linear regression method that uses L1 penalty to shrink coefficients, effectively
96
- performing feature selection by zeroing out less important ones.
97
- """
98
- lasso = LassoCV(**lasso_params)
99
- lasso.fit(x_train, y_train)
100
- # Get non-zero coefficients and their corresponding features
101
- coefficients = lasso.coef_
102
- importance_df = pd.DataFrame(
103
- {"feature": x_train.columns, "importance": np.abs(coefficients)}
104
- )
105
- return importance_df[importance_df["importance"] > 0].sort_values(
106
- by="importance", ascending=False
107
- )
108
-
109
-
110
- # 1.2 Ridge regression
111
- def features_ridge(
112
- x_train: pd.DataFrame, y_train: pd.Series, ridge_params: dict
113
- ) -> np.ndarray:
114
- """
115
- Ridge Regression: A linear regression technique that applies L2 regularization, reducing coefficient
116
- magnitudes to avoid overfitting, especially with multicollinearity among features.
117
- """
118
- from sklearn.linear_model import RidgeCV
119
-
120
- ridge = RidgeCV(**ridge_params)
121
- ridge.fit(x_train, y_train)
122
-
123
- # Get the coefficients
124
- coefficients = ridge.coef_
125
-
126
- # Create a DataFrame to hold feature importance
127
- importance_df = pd.DataFrame(
128
- {"feature": x_train.columns, "importance": np.abs(coefficients)}
129
- )
130
- return importance_df[importance_df["importance"] > 0].sort_values(
131
- by="importance", ascending=False
132
- )
133
-
134
-
135
- # 1.3 Elastic Net(Enet)
136
- def features_enet(
137
- x_train: pd.DataFrame, y_train: pd.Series, enet_params: dict
138
- ) -> np.ndarray:
139
- """
140
- Elastic Net (Enet): Combines L1 and L2 penalties (lasso and ridge) in a linear model, beneficial
141
- when features are highly correlated or for datasets with more features than samples.
142
- """
143
- from sklearn.linear_model import ElasticNetCV
144
-
145
- enet = ElasticNetCV(**enet_params)
146
- enet.fit(x_train, y_train)
147
- # Get the coefficients
148
- coefficients = enet.coef_
149
- # Create a DataFrame to hold feature importance
150
- importance_df = pd.DataFrame(
151
- {"feature": x_train.columns, "importance": np.abs(coefficients)}
152
- )
153
- return importance_df[importance_df["importance"] > 0].sort_values(
154
- by="importance", ascending=False
155
- )
156
-
157
-
158
- # 1.4 Partial Least Squares Regression for Generalized Linear Models (plsRglm): Combines regression and
159
- # feature reduction, useful for high-dimensional data with correlated features, such as genomics.
160
-
161
- #! 2.Generalized Linear Models and Extensions
162
- # 2.1
163
-
164
-
165
- #!3.Tree-Based and Ensemble Methods
166
- # 3.1 Random Forest(RF)
167
- def features_rf(
168
- x_train: pd.DataFrame, y_train: pd.Series, rf_params: dict
169
- ) -> np.ndarray:
170
- """
171
- An ensemble of decision trees that combines predictions from multiple trees for classification or
172
- regression, effective with high-dimensional, complex datasets.
173
- when to use:
174
- Handles high-dimensional data well.
175
- Robust to overfitting due to averaging of multiple trees.
176
- Provides feature importance, which can help in understanding the influence of different genes.
177
- Fit Random Forest and return sorted feature importances.
178
- Recommended Use: Great for classification problems, especially when you have many features (genes).
179
- """
180
- rf = RandomForestClassifier(**rf_params)
181
- rf.fit(x_train, y_train)
182
- return pd.DataFrame(
183
- {"feature": x_train.columns, "importance": rf.featuress_}
184
- ).sort_values(by="importance", ascending=False)
185
-
186
-
187
- # 3.2 Gradient Boosting Trees
188
- def features_gradient_boosting(
189
- x_train: pd.DataFrame, y_train: pd.Series, gb_params: dict
190
- ) -> pd.DataFrame:
191
- """
192
- An ensemble of decision trees that combines predictions from multiple trees for classification or regression, effective with
193
- high-dimensional, complex datasets.
194
- Gradient Boosting
195
- Strengths:
196
- High predictive accuracy and works well for both classification and regression.
197
- Can handle a mixture of numerical and categorical features.
198
- Recommended Use:
199
- Effective for complex relationships and when you need a powerful predictive model.
200
- Fit Gradient Boosting classifier and return sorted feature importances.
201
- Recommended Use: Effective for complex datasets with many features (genes).
202
- """
203
- gb = GradientBoostingClassifier(**gb_params)
204
- gb.fit(x_train, y_train)
205
- return pd.DataFrame(
206
- {"feature": x_train.columns, "importance": gb.feature_importances_}
207
- ).sort_values(by="importance", ascending=False)
208
-
209
-
210
- # 3.3 XGBoost
211
- def features_xgb(
212
- x_train: pd.DataFrame, y_train: pd.Series, xgb_params: dict
213
- ) -> pd.DataFrame:
214
- """
215
- XGBoost: An advanced gradient boosting technique, faster and more efficient than GBM, with excellent predictive performance on structured data.
216
- """
217
- import xgboost as xgb
218
-
219
- xgb_model = xgb.XGBClassifier(**xgb_params)
220
- xgb_model.fit(x_train, y_train)
221
- return pd.DataFrame(
222
- {"feature": x_train.columns, "importance": xgb_model.feature_importances_}
223
- ).sort_values(by="importance", ascending=False)
224
-
225
-
226
- # 3.4.decision tree
227
- def features_decision_tree(
228
- x_train: pd.DataFrame, y_train: pd.Series, dt_params: dict
229
- ) -> pd.DataFrame:
230
- """
231
- A single decision tree classifier effective for identifying key decision boundaries in data.
232
- when to use:
233
- Good for capturing non-linear patterns.
234
- Provides feature importance scores for each feature, though it may overfit on small datasets.
235
- Efficient for low to medium-sized datasets, where interpretability of decisions is key.
236
- Recommended Use: Useful for interpretable feature importance analysis in smaller or balanced datasets.
237
-
238
- Fits DecisionTreeClassifier and returns sorted feature importances.
239
- """
240
- dt = DecisionTreeClassifier(**dt_params)
241
- dt.fit(x_train, y_train)
242
- return pd.DataFrame(
243
- {"feature": x_train.columns, "importance": dt.feature_importances_}
244
- ).sort_values(by="importance", ascending=False)
245
-
246
-
247
- # 3.5 bagging
248
- def features_bagging(
249
- x_train: pd.DataFrame, y_train: pd.Series, bagging_params: dict
250
- ) -> pd.DataFrame:
251
- """
252
- A bagging ensemble of models, often used with weak learners like decision trees, to reduce variance.
253
- when to use:
254
- Helps reduce overfitting, especially on high-variance models.
255
- Effective when the dataset has numerous features and may benefit from ensemble stability.
256
- Recommended Use: Beneficial for high-dimensional or noisy datasets needing ensemble stability.
257
-
258
- Fits BaggingClassifier and returns averaged feature importances from underlying estimators if available.
259
- """
260
- bagging = BaggingClassifier(**bagging_params)
261
- bagging.fit(x_train, y_train)
262
-
263
- # Calculate feature importance by averaging importances across estimators, if feature_importances_ is available.
264
- if hasattr(bagging.estimators_[0], "feature_importances_"):
265
- importances = np.mean(
266
- [estimator.feature_importances_ for estimator in bagging.estimators_],
267
- axis=0,
268
- )
269
- return pd.DataFrame(
270
- {"feature": x_train.columns, "importance": importances}
271
- ).sort_values(by="importance", ascending=False)
272
- else:
273
- # If the base estimator does not support feature importances, fallback to permutation importance.
274
- importances = permutation_importance(
275
- bagging, x_train, y_train, n_repeats=30, random_state=1, scoring="accuracy"
276
- )
277
- return pd.DataFrame(
278
- {"feature": x_train.columns, "importance": importances.importances_mean}
279
- ).sort_values(by="importance", ascending=False)
280
-
281
-
282
- #! 4.Support Vector Machines
283
- def features_svm(
284
- x_train: pd.DataFrame, y_train: pd.Series, rfe_params: dict
285
- ) -> np.ndarray:
286
- """
287
- Suitable for classification tasks where the number of features is much larger than the number of samples.
288
- 1. Effective in high-dimensional spaces and with clear margin of separation.
289
- 2. Works well for both linear and non-linear classification (using kernel functions).
290
- Select features using RFE with SVM.When combined with SVM, RFE selects features that are most critical for the decision boundary,
291
- helping reduce the dataset to a more manageable size without losing much predictive power.
292
- SVM (Support Vector Machines),supports various kernels (linear, rbf, poly, and sigmoid), is good at handling high-dimensional
293
- data and finding an optimal decision boundary between classes, especially when using the right kernel.
294
- kernel: ["linear", "rbf", "poly", "sigmoid"]
295
- 'linear': simplest kernel that attempts to separate data by drawing a straight line (or hyperplane) between classes. It is effective
296
- when the data is linearly separable, meaning the classes can be well divided by a straight boundary.
297
- Advantages:
298
- - Computationally efficient for large datasets.
299
- - Works well when the number of features is high, which is common in genomic data where you may have thousands of genes
300
- as features.
301
- 'rbf': a nonlinear kernel that maps the input data into a higher-dimensional space to find a decision boundary. It works well for
302
- data that is not linearly separable in its original space.
303
- Advantages:
304
- - Handles nonlinear relationships between features and classes
305
- - Often better than a linear kernel when there is no clear linear decision boundary in the data.
306
- 'poly': Polynomial Kernel: computes similarity between data points based on polynomial functions of the input features. It can model
307
- interactions between features to a certain degree, depending on the polynomial degree chosen.
308
- Advantages:
309
- - Allows modeling of feature interactions.
310
- - Can fit more complex relationships compared to linear models.
311
- 'sigmoid': similar to the activation function in neural networks, and it works well when the data follows an S-shaped decision boundary.
312
- Advantages:
313
- - Can approximate the behavior of neural networks.
314
- - Use case: It’s not as widely used as the RBF or linear kernel but can be explored when there is some evidence of non-linear
315
- S-shaped relationships.
316
- """
317
- # SVM (Support Vector Machines)
318
- svc = SVC(kernel=rfe_params["kernel"]) # ["linear", "rbf", "poly", "sigmoid"]
319
- # RFE(Recursive Feature Elimination)
320
- selector = RFE(svc, n_features_to_select=rfe_params["n_features_to_select"])
321
- selector.fit(x_train, y_train)
322
- return x_train.columns[selector.support_]
323
-
324
-
325
- #! 5.Bayesian and Probabilistic Methods
326
- def features_naive_bayes(x_train: pd.DataFrame, y_train: pd.Series) -> list:
327
- """
328
- Naive Bayes: A probabilistic classifier based on Bayes' theorem, assuming independence between features, simple and fast, especially
329
- effective for text classification and other high-dimensional data.
330
- """
331
- from sklearn.naive_bayes import GaussianNB
332
-
333
- nb = GaussianNB()
334
- nb.fit(x_train, y_train)
335
- probabilities = nb.predict_proba(x_train)
336
- # Limit the number of features safely, choosing the lesser of half the features or all columns
337
- n_features = min(x_train.shape[1] // 2, len(x_train.columns))
338
-
339
- # Sort probabilities, then map to valid column indices
340
- sorted_indices = np.argsort(probabilities.max(axis=1))[:n_features]
341
-
342
- # Ensure indices are within the column bounds of x_train
343
- valid_indices = sorted_indices[sorted_indices < len(x_train.columns)]
344
-
345
- return x_train.columns[valid_indices]
346
-
347
-
348
- #! 6.Linear Discriminant Analysis (LDA)
349
- def features_lda(x_train: pd.DataFrame, y_train: pd.Series) -> list:
350
- """
351
- Linear Discriminant Analysis (LDA): Projects data onto a lower-dimensional space to maximize class separability, often used as a dimensionality
352
- reduction technique before classification on high-dimensional data.
353
- """
354
- from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
355
-
356
- lda = LinearDiscriminantAnalysis()
357
- lda.fit(x_train, y_train)
358
- coef = lda.coef_.flatten()
359
- # Create a DataFrame to hold feature importance
360
- importance_df = pd.DataFrame(
361
- {"feature": x_train.columns, "importance": np.abs(coef)}
362
- )
363
-
364
- return importance_df[importance_df["importance"] > 0].sort_values(
365
- by="importance", ascending=False
366
- )
367
-
368
-
369
- def features_adaboost(
370
- x_train: pd.DataFrame, y_train: pd.Series, adaboost_params: dict
371
- ) -> pd.DataFrame:
372
- """
373
- AdaBoost
374
- Strengths:
375
- Combines multiple weak learners to create a strong classifier.
376
- Focuses on examples that are hard to classify, improving overall performance.
377
- Recommended Use:
378
- Can be effective for boosting weak models in a genomics context.
379
- Fit AdaBoost classifier and return sorted feature importances.
380
- Recommended Use: Great for classification problems with a large number of features (genes).
381
- """
382
- ada = AdaBoostClassifier(**adaboost_params)
383
- ada.fit(x_train, y_train)
384
- return pd.DataFrame(
385
- {"feature": x_train.columns, "importance": ada.feature_importances_}
386
- ).sort_values(by="importance", ascending=False)
387
-
388
-
389
- import torch
390
- import torch.nn as nn
391
- import torch.optim as optim
392
- from torch.utils.data import DataLoader, TensorDataset
393
- from skorch import NeuralNetClassifier # sklearn compatible
394
-
395
-
396
- class DNNClassifier(nn.Module):
397
- def __init__(self, input_dim, hidden_dim=128, output_dim=2, dropout_rate=0.5):
398
- super(DNNClassifier, self).__init__()
399
-
400
- self.hidden_layer1 = nn.Sequential(
401
- nn.Linear(input_dim, hidden_dim),
402
- nn.ReLU(),
403
- nn.Dropout(dropout_rate),
404
- nn.Linear(hidden_dim, hidden_dim),
405
- nn.ReLU(),
406
- )
407
-
408
- self.hidden_layer2 = nn.Sequential(
409
- nn.Linear(hidden_dim, hidden_dim), nn.ReLU(), nn.Dropout(dropout_rate)
410
- )
411
-
412
- # Adding a residual connection between hidden layers
413
- self.residual = nn.Linear(input_dim, hidden_dim)
414
-
415
- self.output_layer = nn.Sequential(
416
- nn.Linear(hidden_dim, output_dim), nn.Softmax(dim=1)
417
- )
418
-
419
- def forward(self, x):
420
- residual = self.residual(x)
421
- x = self.hidden_layer1(x)
422
- x = x + residual # Residual connection
423
- x = self.hidden_layer2(x)
424
- x = self.output_layer(x)
425
- return x
426
-
427
-
428
- def validate_classifier(
429
- clf,
430
- x_train: pd.DataFrame,
431
- y_train: pd.Series,
432
- x_test: pd.DataFrame,
433
- y_test: pd.Series,
434
- metrics: list = ["accuracy", "precision", "recall", "f1", "roc_auc"],
435
- cv_folds: int = 5,
436
- ) -> dict:
437
- """
438
- Perform cross-validation for a given classifier and return average scores for specified metrics on training data.
439
- Then fit the best model on the full training data and evaluate it on the test set.
440
-
441
- Parameters:
442
- - clf: The classifier to be validated.
443
- - x_train: Training features.
444
- - y_train: Training labels.
445
- - x_test: Test features.
446
- - y_test: Test labels.
447
- - metrics: List of metrics to evaluate (e.g., ['accuracy', 'roc_auc']).
448
- - cv_folds: Number of cross-validation folds.
449
-
450
- Returns:
451
- - results: Dictionary containing average cv_train_scores and cv_test_scores.
452
- """
453
- cv_train_scores = {metric: [] for metric in metrics}
454
- skf = StratifiedKFold(n_splits=cv_folds)
455
- # Perform cross-validation
456
- for metric in metrics:
457
- try:
458
- if metric == "roc_auc" and len(set(y_train)) == 2:
459
- scores = cross_val_score(
460
- clf, x_train, y_train, cv=skf, scoring="roc_auc"
461
- )
462
- cv_train_scores[metric] = (
463
- np.nanmean(scores) if not np.isnan(scores).all() else float("nan")
464
- )
465
- else:
466
- score = cross_val_score(clf, x_train, y_train, cv=skf, scoring=metric)
467
- cv_train_scores[metric] = score.mean()
468
- except Exception as e:
469
- cv_train_scores[metric] = float("nan")
470
- clf.fit(x_train, y_train)
471
-
472
- # Evaluate on the test set
473
- cv_test_scores = {}
474
- for metric in metrics:
475
- if metric == "roc_auc" and len(set(y_test)) == 2:
476
- try:
477
- y_prob = clf.predict_proba(x_test)[:, 1]
478
- cv_test_scores[metric] = roc_auc_score(y_test, y_prob)
479
- except AttributeError:
480
- cv_test_scores[metric] = float("nan")
481
- else:
482
- score_func = globals().get(
483
- f"{metric}_score"
484
- ) # Fetching the appropriate scoring function
485
- if score_func:
486
- try:
487
- y_pred = clf.predict(x_test)
488
- cv_test_scores[metric] = score_func(y_test, y_pred)
489
- except Exception as e:
490
- cv_test_scores[metric] = float("nan")
491
-
492
- # Combine results
493
- results = {"cv_train_scores": cv_train_scores, "cv_test_scores": cv_test_scores}
494
- return results
495
-
496
-
497
- def get_models(
498
- random_state=1,
499
- cls=[
500
- "lasso",
501
- "ridge",
502
- "Elastic Net(Enet)",
503
- "gradient Boosting",
504
- "Random forest (rf)",
505
- "XGBoost (xgb)",
506
- "Support Vector Machine(svm)",
507
- "naive bayes",
508
- "Linear Discriminant Analysis (lda)",
509
- "adaboost",
510
- "DecisionTree",
511
- "KNeighbors",
512
- "Bagging",
513
- ],
514
- ):
515
- from sklearn.ensemble import (
516
- RandomForestClassifier,
517
- GradientBoostingClassifier,
518
- AdaBoostClassifier,
519
- BaggingClassifier,
520
- )
521
- from sklearn.svm import SVC
522
- from sklearn.linear_model import (
523
- LogisticRegression,
524
- Lasso,
525
- RidgeClassifierCV,
526
- ElasticNet,
527
- )
528
- from sklearn.naive_bayes import GaussianNB
529
- from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
530
- import xgboost as xgb
531
- from sklearn.tree import DecisionTreeClassifier
532
- from sklearn.neighbors import KNeighborsClassifier
533
-
534
- res_cls = {}
535
- model_all = {
536
- "Lasso": LogisticRegression(
537
- penalty="l1", solver="saga", random_state=random_state
538
- ),
539
- "Ridge": RidgeClassifierCV(),
540
- "Elastic Net (Enet)": ElasticNet(random_state=random_state),
541
- "Gradient Boosting": GradientBoostingClassifier(random_state=random_state),
542
- "Random Forest (RF)": RandomForestClassifier(random_state=random_state),
543
- "XGBoost (XGB)": xgb.XGBClassifier(random_state=random_state),
544
- "Support Vector Machine (SVM)": SVC(kernel="rbf", probability=True),
545
- "Naive Bayes": GaussianNB(),
546
- "Linear Discriminant Analysis (LDA)": LinearDiscriminantAnalysis(),
547
- "AdaBoost": AdaBoostClassifier(random_state=random_state, algorithm="SAMME"),
548
- "DecisionTree": DecisionTreeClassifier(),
549
- "KNeighbors": KNeighborsClassifier(n_neighbors=5),
550
- "Bagging": BaggingClassifier(),
551
- }
552
- print("Using default models:")
553
- for cls_name in cls:
554
- cls_name = ips.strcmp(cls_name, list(model_all.keys()))[0]
555
- res_cls[cls_name] = model_all[cls_name]
556
- print(f"- {cls_name}")
557
- return res_cls
558
-
559
-
560
- def get_features(
561
- X: Union[pd.DataFrame, np.ndarray], # n_samples X n_features
562
- y: Union[pd.Series, np.ndarray, list], # n_samples X n_features
563
- test_size: float = 0.2,
564
- random_state: int = 1,
565
- n_features: int = 10,
566
- fill_missing=True,
567
- rf_params: Optional[Dict] = None,
568
- rfe_params: Optional[Dict] = None,
569
- lasso_params: Optional[Dict] = None,
570
- ridge_params: Optional[Dict] = None,
571
- enet_params: Optional[Dict] = None,
572
- gb_params: Optional[Dict] = None,
573
- adaboost_params: Optional[Dict] = None,
574
- xgb_params: Optional[Dict] = None,
575
- dt_params: Optional[Dict] = None,
576
- bagging_params: Optional[Dict] = None,
577
- knn_params: Optional[Dict] = None,
578
- cls: list = [
579
- "lasso",
580
- "ridge",
581
- "Elastic Net(Enet)",
582
- "gradient Boosting",
583
- "Random forest (rf)",
584
- "XGBoost (xgb)",
585
- "Support Vector Machine(svm)",
586
- "naive bayes",
587
- "Linear Discriminant Analysis (lda)",
588
- "adaboost",
589
- "DecisionTree",
590
- "KNeighbors",
591
- "Bagging",
592
- ],
593
- metrics: Optional[List[str]] = None,
594
- cv_folds: int = 5,
595
- strict: bool = False,
596
- n_shared: int = 2, # 只要有两个方法有重合,就纳入common genes
597
- use_selected_features: bool = True,
598
- plot_: bool = True,
599
- dir_save: str = "./",
600
- ) -> dict:
601
- """
602
- Master function to perform feature selection and validate models.
603
- """
604
- from sklearn.compose import ColumnTransformer
605
- from sklearn.preprocessing import StandardScaler, OneHotEncoder
606
-
607
- # Ensure X and y are DataFrames/Series for consistency
608
- if isinstance(X, np.ndarray):
609
- X = pd.DataFrame(X)
610
- if isinstance(y, (np.ndarray, list)):
611
- y = pd.Series(y)
612
-
613
- # fill na
614
- if fill_missing:
615
- ips.df_fillna(data=X, method="knn", inplace=True, axis=0)
616
- if isinstance(y, str) and y in X.columns:
617
- y_col_name = y
618
- y = X[y]
619
- y = ips.df_encoder(pd.DataFrame(y), method="dummy")
620
- X = X.drop(y_col_name, axis=1)
621
- else:
622
- y = ips.df_encoder(pd.DataFrame(y), method="dummy").values.ravel()
623
- y = y.loc[X.index] # Align y with X after dropping rows with missing values in X
624
- y = y.ravel() if isinstance(y, np.ndarray) else y.values.ravel()
625
-
626
- if X.shape[0] != len(y):
627
- raise ValueError("X and y must have the same number of samples (rows).")
628
-
629
- # #! # Check for non-numeric columns in X and apply one-hot encoding if needed
630
- # Check if any column in X is non-numeric
631
- if any(not np.issubdtype(dtype, np.number) for dtype in X.dtypes):
632
- X = pd.get_dummies(X, drop_first=True)
633
- print(X.shape)
634
-
635
- # #!alternative: # Identify categorical and numerical columns
636
- # categorical_cols = X.select_dtypes(include=["object", "category"]).columns
637
- # numerical_cols = X.select_dtypes(include=["number"]).columns
638
-
639
- # # Define preprocessing pipeline
640
- # preprocessor = ColumnTransformer(
641
- # transformers=[
642
- # ("num", StandardScaler(), numerical_cols),
643
- # ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), categorical_cols),
644
- # ]
645
- # )
646
- # # Preprocess the data
647
- # X = preprocessor.fit_transform(X)
648
-
649
- # Split data into training and test sets
650
- x_train, x_test, y_train, y_test = train_test_split(
651
- X, y, test_size=test_size, random_state=random_state
652
- )
653
- # Standardize features
654
- scaler = StandardScaler()
655
- x_train_scaled = scaler.fit_transform(x_train)
656
- x_test_scaled = scaler.transform(x_test)
657
-
658
- # Convert back to DataFrame for consistency
659
- x_train = pd.DataFrame(x_train_scaled, columns=x_train.columns)
660
- x_test = pd.DataFrame(x_test_scaled, columns=x_test.columns)
661
-
662
- rf_defaults = {"n_estimators": 100, "random_state": random_state}
663
- rfe_defaults = {"kernel": "linear", "n_features_to_select": n_features}
664
- lasso_defaults = {"alphas": np.logspace(-4, 4, 100), "cv": 10}
665
- ridge_defaults = {"alphas": np.logspace(-4, 4, 100), "cv": 10}
666
- enet_defaults = {"alphas": np.logspace(-4, 4, 100), "cv": 10}
667
- xgb_defaults = {
668
- "n_estimators": 100,
669
- "use_label_encoder": False,
670
- "eval_metric": "logloss",
671
- "random_state": random_state,
672
- }
673
- gb_defaults = {"n_estimators": 100, "random_state": random_state}
674
- adaboost_defaults = {"n_estimators": 50, "random_state": random_state}
675
- dt_defaults = {"max_depth": None, "random_state": random_state}
676
- bagging_defaults = {"n_estimators": 50, "random_state": random_state}
677
- knn_defaults = {"n_neighbors": 5}
678
- rf_params, rfe_params = rf_params or rf_defaults, rfe_params or rfe_defaults
679
- lasso_params, ridge_params = (
680
- lasso_params or lasso_defaults,
681
- ridge_params or ridge_defaults,
682
- )
683
- enet_params, xgb_params = enet_params or enet_defaults, xgb_params or xgb_defaults
684
- gb_params, adaboost_params = (
685
- gb_params or gb_defaults,
686
- adaboost_params or adaboost_defaults,
687
- )
688
- dt_params = dt_params or dt_defaults
689
- bagging_params = bagging_params or bagging_defaults
690
- knn_params = knn_params or knn_defaults
691
-
692
- cls_ = [
693
- "lasso",
694
- "ridge",
695
- "Elastic Net(Enet)",
696
- "Gradient Boosting",
697
- "Random Forest (rf)",
698
- "XGBoost (xgb)",
699
- "Support Vector Machine(svm)",
700
- "Naive Bayes",
701
- "Linear Discriminant Analysis (lda)",
702
- "adaboost",
703
- ]
704
- cls = [ips.strcmp(i, cls_)[0] for i in cls]
705
-
706
- # Lasso Feature Selection
707
- lasso_importances = (
708
- features_lasso(x_train, y_train, lasso_params)
709
- if "lasso" in cls
710
- else pd.DataFrame()
711
- )
712
- lasso_selected_features = (
713
- lasso_importances.head(n_features)["feature"].values if "lasso" in cls else []
714
- )
715
- # Ridge
716
- ridge_importances = (
717
- features_ridge(x_train, y_train, ridge_params)
718
- if "ridge" in cls
719
- else pd.DataFrame()
720
- )
721
- selected_ridge_features = (
722
- ridge_importances.head(n_features)["feature"].values if "ridge" in cls else []
723
- )
724
- # Elastic Net
725
- enet_importances = (
726
- features_enet(x_train, y_train, enet_params)
727
- if "Enet" in cls
728
- else pd.DataFrame()
729
- )
730
- selected_enet_features = (
731
- enet_importances.head(n_features)["feature"].values if "Enet" in cls else []
732
- )
733
- # Random Forest Feature Importance
734
- rf_importances = (
735
- features_rf(x_train, y_train, rf_params)
736
- if "Random Forest" in cls
737
- else pd.DataFrame()
738
- )
739
- top_rf_features = (
740
- rf_importances.head(n_features)["feature"].values
741
- if "Random Forest" in cls
742
- else []
743
- )
744
- # Gradient Boosting Feature Importance
745
- gb_importances = (
746
- features_gradient_boosting(x_train, y_train, gb_params)
747
- if "Gradient Boosting" in cls
748
- else pd.DataFrame()
749
- )
750
- top_gb_features = (
751
- gb_importances.head(n_features)["feature"].values
752
- if "Gradient Boosting" in cls
753
- else []
754
- )
755
- # xgb
756
- xgb_importances = (
757
- features_xgb(x_train, y_train, xgb_params) if "xgb" in cls else pd.DataFrame()
758
- )
759
- top_xgb_features = (
760
- xgb_importances.head(n_features)["feature"].values if "xgb" in cls else []
761
- )
762
-
763
- # SVM with RFE
764
- selected_svm_features = (
765
- features_svm(x_train, y_train, rfe_params) if "svm" in cls else []
766
- )
767
- # Naive Bayes
768
- selected_naive_bayes_features = (
769
- features_naive_bayes(x_train, y_train) if "Naive Bayes" in cls else []
770
- )
771
- # lda: linear discriminant analysis
772
- lda_importances = features_lda(x_train, y_train) if "lda" in cls else pd.DataFrame()
773
- selected_lda_features = (
774
- lda_importances.head(n_features)["feature"].values if "lda" in cls else []
775
- )
776
- # AdaBoost Feature Importance
777
- adaboost_importances = (
778
- features_adaboost(x_train, y_train, adaboost_params)
779
- if "AdaBoost" in cls
780
- else pd.DataFrame()
781
- )
782
- top_adaboost_features = (
783
- adaboost_importances.head(n_features)["feature"].values
784
- if "AdaBoost" in cls
785
- else []
786
- )
787
- # Decision Tree Feature Importance
788
- dt_importances = (
789
- features_decision_tree(x_train, y_train, dt_params)
790
- if "Decision Tree" in cls
791
- else pd.DataFrame()
792
- )
793
- top_dt_features = (
794
- dt_importances.head(n_features)["feature"].values
795
- if "Decision Tree" in cls
796
- else []
797
- )
798
- # Bagging Feature Importance
799
- bagging_importances = (
800
- features_bagging(x_train, y_train, bagging_params)
801
- if "Bagging" in cls
802
- else pd.DataFrame()
803
- )
804
- top_bagging_features = (
805
- bagging_importances.head(n_features)["feature"].values
806
- if "Bagging" in cls
807
- else []
808
- )
809
- # KNN Feature Importance via Permutation
810
- knn_importances = (
811
- features_knn(x_train, y_train, knn_params) if "KNN" in cls else pd.DataFrame()
812
- )
813
- top_knn_features = (
814
- knn_importances.head(n_features)["feature"].values if "KNN" in cls else []
815
- )
816
-
817
- #! Find common features
818
- common_features = ips.shared(
819
- lasso_selected_features,
820
- selected_ridge_features,
821
- selected_enet_features,
822
- top_rf_features,
823
- top_gb_features,
824
- top_xgb_features,
825
- selected_svm_features,
826
- selected_naive_bayes_features,
827
- selected_lda_features,
828
- top_adaboost_features,
829
- top_dt_features,
830
- top_bagging_features,
831
- top_knn_features,
832
- strict=strict,
833
- n_shared=n_shared,
834
- verbose=False,
835
- )
836
-
837
- # Use selected features or all features for model validation
838
- x_train_selected = (
839
- x_train[list(common_features)] if use_selected_features else x_train
840
- )
841
- x_test_selected = x_test[list(common_features)] if use_selected_features else x_test
842
-
843
- if metrics is None:
844
- metrics = ["accuracy", "precision", "recall", "f1", "roc_auc"]
845
-
846
- # Prepare results DataFrame for selected features
847
- features_df = pd.DataFrame(
848
- {
849
- "type": ["Lasso"] * len(lasso_selected_features)
850
- + ["Ridge"] * len(selected_ridge_features)
851
- + ["Random Forest"] * len(top_rf_features)
852
- + ["Gradient Boosting"] * len(top_gb_features)
853
- + ["Enet"] * len(selected_enet_features)
854
- + ["xgb"] * len(top_xgb_features)
855
- + ["SVM"] * len(selected_svm_features)
856
- + ["Naive Bayes"] * len(selected_naive_bayes_features)
857
- + ["Linear Discriminant Analysis"] * len(selected_lda_features)
858
- + ["AdaBoost"] * len(top_adaboost_features)
859
- + ["Decision Tree"] * len(top_dt_features)
860
- + ["Bagging"] * len(top_bagging_features)
861
- + ["KNN"] * len(top_knn_features),
862
- "feature": np.concatenate(
863
- [
864
- lasso_selected_features,
865
- selected_ridge_features,
866
- top_rf_features,
867
- top_gb_features,
868
- selected_enet_features,
869
- top_xgb_features,
870
- selected_svm_features,
871
- selected_naive_bayes_features,
872
- selected_lda_features,
873
- top_adaboost_features,
874
- top_dt_features,
875
- top_bagging_features,
876
- top_knn_features,
877
- ]
878
- ),
879
- }
880
- )
881
-
882
- #! Validate trained each classifier
883
- models = get_models(random_state=random_state, cls=cls)
884
- cv_train_results, cv_test_results = [], []
885
- for name, clf in models.items():
886
- if not x_train_selected.empty:
887
- cv_scores = validate_classifier(
888
- clf,
889
- x_train_selected,
890
- y_train,
891
- x_test_selected,
892
- y_test,
893
- metrics=metrics,
894
- cv_folds=cv_folds,
895
- )
896
-
897
- cv_train_score_df = pd.DataFrame(cv_scores["cv_train_scores"], index=[name])
898
- cv_test_score_df = pd.DataFrame(cv_scores["cv_test_scores"], index=[name])
899
- cv_train_results.append(cv_train_score_df)
900
- cv_test_results.append(cv_test_score_df)
901
- if all([cv_train_results, cv_test_results]):
902
- cv_train_results_df = (
903
- pd.concat(cv_train_results)
904
- .reset_index()
905
- .rename(columns={"index": "Classifier"})
906
- )
907
- cv_test_results_df = (
908
- pd.concat(cv_test_results)
909
- .reset_index()
910
- .rename(columns={"index": "Classifier"})
911
- )
912
- #! Store results in the main results dictionary
913
- results = {
914
- "selected_features": features_df,
915
- "cv_train_scores": cv_train_results_df,
916
- "cv_test_scores": rank_models(cv_test_results_df, plot_=plot_),
917
- "common_features": list(common_features),
918
- }
919
- if all([plot_, dir_save]):
920
- from datetime import datetime
921
-
922
- now_ = datetime.now().strftime("%y%m%d_%H%M%S")
923
- ips.figsave(dir_save + f"features{now_}.pdf")
924
- else:
925
- results = {
926
- "selected_features": pd.DataFrame(),
927
- "cv_train_scores": pd.DataFrame(),
928
- "cv_test_scores": pd.DataFrame(),
929
- "common_features": [],
930
- }
931
- print(f"Warning: 没有找到共同的genes, when n_shared={n_shared}")
932
- return results
933
-
934
-
935
- #! # usage:
936
- # # Get features and common features
937
- # results = get_features(X, y)
938
- # common_features = results["common_features"]
939
- def validate_features(
940
- x_train: pd.DataFrame,
941
- y_train: pd.Series,
942
- x_true: pd.DataFrame,
943
- y_true: pd.Series,
944
- common_features: set = None,
945
- models: Optional[Dict[str, Any]] = None,
946
- metrics: Optional[list] = None,
947
- random_state: int = 1,
948
- smote: bool = False,
949
- n_jobs: int = -1,
950
- plot_: bool = True,
951
- class_weight: str = "balanced",
952
- ) -> dict:
953
- """
954
- Validate models using selected features on the validation dataset.
955
-
956
- Parameters:
957
- - x_train (pd.DataFrame): Training feature dataset.
958
- - y_train (pd.Series): Training target variable.
959
- - x_true (pd.DataFrame): Validation feature dataset.
960
- - y_true (pd.Series): Validation target variable.
961
- - common_features (set): Set of common features to use for validation.
962
- - models (dict, optional): Dictionary of models to validate.
963
- - metrics (list, optional): List of metrics to compute.
964
- - random_state (int): Random state for reproducibility.
965
- - plot_ (bool): Option to plot metrics (to be implemented if needed).
966
- - class_weight (str or dict): Class weights to handle imbalance.
967
-
968
- """
969
- from tqdm import tqdm
970
-
971
- # Ensure common features are selected
972
- common_features = ips.shared(
973
- common_features, x_train.columns, x_true.columns, strict=True, verbose=False
974
- )
975
-
976
- # Filter the training and validation datasets for the common features
977
- x_train_selected = x_train[common_features]
978
- x_true_selected = x_true[common_features]
979
-
980
- if not x_true_selected.index.equals(y_true.index):
981
- raise ValueError(
982
- "Index mismatch between validation features and target. Ensure data alignment."
983
- )
984
-
985
- y_true = y_true.loc[x_true_selected.index]
986
-
987
- # Handle class imbalance using SMOTE
988
- if smote:
989
- if (
990
- y_train.value_counts(normalize=True).max() < 0.8
991
- ): # Threshold to decide if data is imbalanced
992
- smote = SMOTE(random_state=random_state)
993
- x_train_resampled, y_train_resampled = smote.fit_resample(
994
- x_train_selected, y_train
995
- )
996
- else:
997
- # skip SMOTE
998
- x_train_resampled, y_train_resampled = x_train_selected, y_train
999
- else:
1000
- x_train_resampled, y_train_resampled = x_train_selected, y_train
1001
-
1002
- # Default models if not provided
1003
- if models is None:
1004
- models = {
1005
- "Random Forest": RandomForestClassifier(
1006
- class_weight=class_weight, random_state=random_state
1007
- ),
1008
- "SVM": SVC(probability=True, class_weight=class_weight),
1009
- "Logistic Regression": LogisticRegression(
1010
- class_weight=class_weight, random_state=random_state
1011
- ),
1012
- "Gradient Boosting": GradientBoostingClassifier(random_state=random_state),
1013
- "AdaBoost": AdaBoostClassifier(
1014
- random_state=random_state, algorithm="SAMME"
1015
- ),
1016
- "Lasso": LogisticRegression(
1017
- penalty="l1", solver="saga", random_state=random_state
1018
- ),
1019
- "Ridge": LogisticRegression(
1020
- penalty="l2", solver="saga", random_state=random_state
1021
- ),
1022
- "Elastic Net": LogisticRegression(
1023
- penalty="elasticnet",
1024
- solver="saga",
1025
- l1_ratio=0.5,
1026
- random_state=random_state,
1027
- ),
1028
- "XGBoost": xgb.XGBClassifier(eval_metric="logloss"),
1029
- "Naive Bayes": GaussianNB(),
1030
- "LDA": LinearDiscriminantAnalysis(),
1031
- }
1032
-
1033
- # Hyperparameter grids for tuning
1034
- param_grids = {
1035
- "Random Forest": {
1036
- "n_estimators": [100, 200, 300, 400, 500],
1037
- "max_depth": [None, 3, 5, 10, 20],
1038
- "min_samples_split": [2, 5, 10],
1039
- "min_samples_leaf": [1, 2, 4],
1040
- "class_weight": [None, "balanced"],
1041
- },
1042
- "SVM": {
1043
- "C": [0.01, 0.1, 1, 10, 100, 1000],
1044
- "gamma": [0.001, 0.01, 0.1, "scale", "auto"],
1045
- "kernel": ["linear", "rbf", "poly"],
1046
- },
1047
- "Logistic Regression": {
1048
- "C": [0.01, 0.1, 1, 10, 100],
1049
- "solver": ["liblinear", "saga", "newton-cg", "lbfgs"],
1050
- "penalty": ["l1", "l2"],
1051
- "max_iter": [100, 200, 300],
1052
- },
1053
- "Gradient Boosting": {
1054
- "n_estimators": [100, 200, 300, 400, 500],
1055
- "learning_rate": np.logspace(-3, 0, 4),
1056
- "max_depth": [3, 5, 7, 9],
1057
- "min_samples_split": [2, 5, 10],
1058
- },
1059
- "AdaBoost": {
1060
- "n_estimators": [50, 100, 200, 300, 500],
1061
- "learning_rate": np.logspace(-3, 0, 4),
1062
- },
1063
- "Lasso": {"C": np.logspace(-3, 1, 10), "max_iter": [100, 200, 300]},
1064
- "Ridge": {"C": np.logspace(-3, 1, 10), "max_iter": [100, 200, 300]},
1065
- "Elastic Net": {
1066
- "C": np.logspace(-3, 1, 10),
1067
- "l1_ratio": [0.1, 0.5, 0.9],
1068
- "max_iter": [100, 200, 300],
1069
- },
1070
- "XGBoost": {
1071
- "n_estimators": [100, 200],
1072
- "max_depth": [3, 5, 7],
1073
- "learning_rate": [0.01, 0.1, 0.2],
1074
- "subsample": [0.8, 1.0],
1075
- "colsample_bytree": [0.8, 1.0],
1076
- },
1077
- "Naive Bayes": {},
1078
- "LDA": {"solver": ["svd", "lsqr", "eigen"]},
1079
- }
1080
- # Default metrics if not provided
1081
- if metrics is None:
1082
- metrics = [
1083
- "accuracy",
1084
- "precision",
1085
- "recall",
1086
- "f1",
1087
- "roc_auc",
1088
- "mcc",
1089
- "specificity",
1090
- "balanced_accuracy",
1091
- "pr_auc",
1092
- ]
1093
-
1094
- results = {}
1095
-
1096
- # Validate each classifier with GridSearchCV
1097
- for name, clf in tqdm(
1098
- models.items(),
1099
- desc="for metric in metrics",
1100
- colour="green",
1101
- bar_format="{l_bar}{bar} {n_fmt}/{total_fmt}",
1102
- ):
1103
- print(f"\nValidating {name} on the validation dataset:")
1104
-
1105
- # Check if `predict_proba` method exists; if not, use CalibratedClassifierCV
1106
- # 没有predict_proba的分类器,使用 CalibratedClassifierCV 可以获得校准的概率估计。此外,为了使代码更灵活,我们可以在创建分类器
1107
- # 时检查 predict_proba 方法是否存在,如果不存在且用户希望计算 roc_auc 或 pr_auc,则启用 CalibratedClassifierCV
1108
- if not hasattr(clf, "predict_proba"):
1109
- print(
1110
- f"Using CalibratedClassifierCV for {name} due to lack of probability estimates."
1111
- )
1112
- calibrated_clf = CalibratedClassifierCV(clf, method="sigmoid", cv="prefit")
1113
- else:
1114
- calibrated_clf = clf
1115
- # Stratified K-Fold for cross-validation
1116
- skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
1117
-
1118
- # Create GridSearchCV object
1119
- gs = GridSearchCV(
1120
- estimator=calibrated_clf,
1121
- param_grid=param_grids[name],
1122
- scoring="roc_auc", # Optimize for ROC AUC
1123
- cv=skf, # Stratified K-Folds cross-validation
1124
- n_jobs=n_jobs,
1125
- verbose=1,
1126
- )
1127
-
1128
- # Fit the model using GridSearchCV
1129
- gs.fit(x_train_resampled, y_train_resampled)
1130
- # Best estimator from grid search
1131
- best_clf = gs.best_estimator_
1132
- # Make predictions on the validation set
1133
- y_pred = best_clf.predict(x_true_selected)
1134
- # Calculate probabilities for ROC AUC if possible
1135
- if hasattr(best_clf, "predict_proba"):
1136
- y_pred_proba = best_clf.predict_proba(x_true_selected)[:, 1]
1137
- elif hasattr(best_clf, "decision_function"):
1138
- # If predict_proba is not available, use decision_function (e.g., for SVM)
1139
- y_pred_proba = best_clf.decision_function(x_true_selected)
1140
- # Ensure y_pred_proba is within 0 and 1 bounds
1141
- y_pred_proba = (y_pred_proba - y_pred_proba.min()) / (
1142
- y_pred_proba.max() - y_pred_proba.min()
1143
- )
1144
- else:
1145
- y_pred_proba = None # No probability output for certain models
1146
-
1147
- # Calculate metrics
1148
- validation_scores = {}
1149
- for metric in metrics:
1150
- if metric == "accuracy":
1151
- validation_scores[metric] = accuracy_score(y_true, y_pred)
1152
- elif metric == "precision":
1153
- validation_scores[metric] = precision_score(
1154
- y_true, y_pred, average="weighted"
1155
- )
1156
- elif metric == "recall":
1157
- validation_scores[metric] = recall_score(
1158
- y_true, y_pred, average="weighted"
1159
- )
1160
- elif metric == "f1":
1161
- validation_scores[metric] = f1_score(y_true, y_pred, average="weighted")
1162
- elif metric == "roc_auc" and y_pred_proba is not None:
1163
- validation_scores[metric] = roc_auc_score(y_true, y_pred_proba)
1164
- elif metric == "mcc":
1165
- validation_scores[metric] = matthews_corrcoef(y_true, y_pred)
1166
- elif metric == "specificity":
1167
- tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
1168
- validation_scores[metric] = tn / (tn + fp) # Specificity calculation
1169
- elif metric == "balanced_accuracy":
1170
- validation_scores[metric] = balanced_accuracy_score(y_true, y_pred)
1171
- elif metric == "pr_auc" and y_pred_proba is not None:
1172
- precision, recall, _ = precision_recall_curve(y_true, y_pred_proba)
1173
- validation_scores[metric] = average_precision_score(
1174
- y_true, y_pred_proba
1175
- )
1176
-
1177
- # Calculate ROC curve
1178
- # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
1179
- if y_pred_proba is not None:
1180
- # fpr, tpr, roc_auc = dict(), dict(), dict()
1181
- fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
1182
- lower_ci, upper_ci = cal_auc_ci(y_true, y_pred_proba, verbose=False)
1183
- roc_auc = auc(fpr, tpr)
1184
- roc_info = {
1185
- "fpr": fpr.tolist(),
1186
- "tpr": tpr.tolist(),
1187
- "auc": roc_auc,
1188
- "ci95": (lower_ci, upper_ci),
1189
- }
1190
- # precision-recall curve
1191
- precision_, recall_, _ = precision_recall_curve(y_true, y_pred_proba)
1192
- avg_precision_ = average_precision_score(y_true, y_pred_proba)
1193
- pr_info = {
1194
- "precision": precision_,
1195
- "recall": recall_,
1196
- "avg_precision": avg_precision_,
1197
- }
1198
- else:
1199
- roc_info, pr_info = None, None
1200
- results[name] = {
1201
- "best_params": gs.best_params_,
1202
- "scores": validation_scores,
1203
- "roc_curve": roc_info,
1204
- "pr_curve": pr_info,
1205
- "confusion_matrix": confusion_matrix(y_true, y_pred),
1206
- }
1207
-
1208
- df_results = pd.DataFrame.from_dict(results, orient="index")
1209
-
1210
- return df_results
1211
-
1212
-
1213
- #! usage validate_features()
1214
- # Validate models using the validation dataset (X_val, y_val)
1215
- # validation_results = validate_features(X, y, X_val, y_val, common_features)
1216
-
1217
-
1218
- # # If you want to access validation scores
1219
- # print(validation_results)
1220
- def plot_validate_features(res_val):
1221
- """
1222
- plot the results of 'validate_features()'
1223
- """
1224
- colors = plot.get_color(len(ips.flatten(res_val["pr_curve"].index)))
1225
- if res_val.shape[0] > 5:
1226
- alpha = 0
1227
- figsize = [8, 10]
1228
- subplot_layout = [1, 2]
1229
- ncols = 2
1230
- bbox_to_anchor = [1.5, 0.6]
1231
- else:
1232
- alpha = 0.03
1233
- figsize = [10, 6]
1234
- subplot_layout = [1, 1]
1235
- ncols = 1
1236
- bbox_to_anchor = [1, 1]
1237
- nexttile = plot.subplot(figsize=figsize)
1238
- ax = nexttile(subplot_layout[0], subplot_layout[1])
1239
- for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
1240
- fpr = res_val["roc_curve"][model_name]["fpr"]
1241
- tpr = res_val["roc_curve"][model_name]["tpr"]
1242
- (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
1243
- mean_auc = res_val["roc_curve"][model_name]["auc"]
1244
- plot_roc_curve(
1245
- fpr,
1246
- tpr,
1247
- mean_auc,
1248
- lower_ci,
1249
- upper_ci,
1250
- model_name=model_name,
1251
- lw=1.5,
1252
- color=colors[i],
1253
- alpha=alpha,
1254
- ax=ax,
1255
- )
1256
- plot.figsets(
1257
- sp=2,
1258
- legend=dict(
1259
- loc="upper right",
1260
- ncols=ncols,
1261
- fontsize=8,
1262
- bbox_to_anchor=[1.5, 0.6],
1263
- markerscale=0.8,
1264
- ),
1265
- )
1266
- # plot.split_legend(ax,n=2, loc=["upper left", "lower left"],bbox=[[1,0.5],[1,0.5]],ncols=2,labelcolor="k",fontsize=8)
1267
-
1268
- ax = nexttile(subplot_layout[0], subplot_layout[1])
1269
- for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
1270
- plot_pr_curve(
1271
- recall=res_val["pr_curve"][model_name]["recall"],
1272
- precision=res_val["pr_curve"][model_name]["precision"],
1273
- avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
1274
- model_name=model_name,
1275
- color=colors[i],
1276
- lw=1.5,
1277
- alpha=alpha,
1278
- ax=ax,
1279
- )
1280
- plot.figsets(
1281
- sp=2,
1282
- legend=dict(loc="upper right", ncols=1, fontsize=8, bbox_to_anchor=[1.5, 0.5]),
1283
- )
1284
- # plot.split_legend(ax,n=2, loc=["upper left", "lower left"],bbox=[[1,0.5],[1,0.5]],ncols=2,labelcolor="k",fontsize=8)
1285
-
1286
-
1287
- def plot_validate_features_single(res_val, figsize=None):
1288
- if figsize is None:
1289
- nexttile = plot.subplot(len(ips.flatten(res_val["pr_curve"].index)), 3)
1290
- else:
1291
- nexttile = plot.subplot(
1292
- len(ips.flatten(res_val["pr_curve"].index)), 3, figsize=figsize
1293
- )
1294
- for model_name in ips.flatten(res_val["pr_curve"].index):
1295
- fpr = res_val["roc_curve"][model_name]["fpr"]
1296
- tpr = res_val["roc_curve"][model_name]["tpr"]
1297
- (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
1298
- mean_auc = res_val["roc_curve"][model_name]["auc"]
1299
-
1300
- # Plotting
1301
- plot_roc_curve(fpr, tpr, mean_auc, lower_ci, upper_ci,
1302
- model_name=model_name, ax=nexttile())
1303
- plot.figsets(title=model_name, sp=2)
1304
-
1305
- plot_pr_binary(
1306
- recall=res_val["pr_curve"][model_name]["recall"],
1307
- precision=res_val["pr_curve"][model_name]["precision"],
1308
- avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
1309
- model_name=model_name,
1310
- ax=nexttile(),
1311
- )
1312
- plot.figsets(title=model_name, sp=2)
1313
-
1314
- # plot cm
1315
- plot_cm(res_val["confusion_matrix"][model_name], ax=nexttile(), normalize=False)
1316
- plot.figsets(title=model_name, sp=2)
1317
-
1318
-
1319
- def cal_auc_ci(
1320
- y_true, y_pred, n_bootstraps=1000, ci=0.95, random_state=1, verbose=True
1321
- ):
1322
- y_true = np.asarray(y_true)
1323
- y_pred = np.asarray(y_pred)
1324
- bootstrapped_scores = []
1325
- if verbose:
1326
- print("auroc score:", roc_auc_score(y_true, y_pred))
1327
- rng = np.random.RandomState(random_state)
1328
- for i in range(n_bootstraps):
1329
- # bootstrap by sampling with replacement on the prediction indices
1330
- indices = rng.randint(0, len(y_pred), len(y_pred))
1331
- if len(np.unique(y_true[indices])) < 2:
1332
- # We need at least one positive and one negative sample for ROC AUC
1333
- # to be defined: reject the sample
1334
- continue
1335
- if isinstance(y_true, np.ndarray):
1336
- score = roc_auc_score(y_true[indices], y_pred[indices])
1337
- else:
1338
- score = roc_auc_score(y_true.iloc[indices], y_pred.iloc[indices])
1339
- bootstrapped_scores.append(score)
1340
- # print("Bootstrap #{} ROC area: {:0.3f}".format(i + 1, score))
1341
- sorted_scores = np.array(bootstrapped_scores)
1342
- sorted_scores.sort()
1343
-
1344
- # Computing the lower and upper bound of the 90% confidence interval
1345
- # You can change the bounds percentiles to 0.025 and 0.975 to get
1346
- # a 95% confidence interval instead.
1347
- confidence_lower = sorted_scores[int((1 - ci) * len(sorted_scores))]
1348
- confidence_upper = sorted_scores[int(ci * len(sorted_scores))]
1349
- if verbose:
1350
- print(
1351
- "Confidence interval for the score: [{:0.3f} - {:0.3}]".format(
1352
- confidence_lower, confidence_upper
1353
- )
1354
- )
1355
- return confidence_lower, confidence_upper
1356
-
1357
-
1358
- def plot_roc_curve(
1359
- fpr=None,
1360
- tpr=None,
1361
- mean_auc=None,
1362
- lower_ci=None,
1363
- upper_ci=None,
1364
- model_name=None,
1365
- color="#FF8F00",
1366
- lw=2,
1367
- alpha=0.1,
1368
- ci_display=True,
1369
- title="ROC Curve",
1370
- xlabel="1−Specificity",
1371
- ylabel="Sensitivity",
1372
- legend_loc="lower right",
1373
- diagonal_color="0.5",
1374
- figsize=(5, 5),
1375
- ax=None,
1376
- **kwargs,
1377
- ):
1378
- if ax is None:
1379
- fig, ax = plt.subplots(figsize=figsize)
1380
- if mean_auc is not None:
1381
- model_name = "ROC curve" if model_name is None else model_name
1382
- if ci_display:
1383
- label = f"{model_name} (AUC = {mean_auc:.3f})\n95% CI: {lower_ci:.3f} - {upper_ci:.3f}"
1384
- else:
1385
- label = f"{model_name} (AUC = {mean_auc:.3f})"
1386
- else:
1387
- label = None
1388
-
1389
- # Plot ROC curve and the diagonal reference line
1390
- ax.fill_between(fpr, tpr, alpha=alpha, color=color)
1391
- ax.plot([0, 1], [0, 1], color=diagonal_color, clip_on=False, linestyle="--")
1392
- ax.plot(fpr, tpr, color=color, lw=lw, label=label, clip_on=False, **kwargs)
1393
- # Setting plot limits, labels, and title
1394
- ax.set_xlim([-0.01, 1.0])
1395
- ax.set_ylim([0.0, 1.0])
1396
- ax.set_xlabel(xlabel)
1397
- ax.set_ylabel(ylabel)
1398
- ax.set_title(title)
1399
- ax.legend(loc=legend_loc)
1400
- return ax
1401
-
1402
-
1403
- # * usage: ml2ls.plot_roc_curve(fpr, tpr, mean_auc, lower_ci, upper_ci)
1404
- # for model_name in flatten(validation_results["roc_curve"].keys())[2:]:
1405
- # fpr = validation_results["roc_curve"][model_name]["fpr"]
1406
- # tpr = validation_results["roc_curve"][model_name]["tpr"]
1407
- # (lower_ci, upper_ci) = validation_results["roc_curve"][model_name]["ci95"]
1408
- # mean_auc = validation_results["roc_curve"][model_name]["auc"]
1409
-
1410
- # # Plotting
1411
- # ml2ls.plot_roc_curve(fpr, tpr, mean_auc, lower_ci, upper_ci)
1412
- # figsets(title=model_name)
1413
-
1414
- def plot_pr_curve(
1415
- recall=None,
1416
- precision=None,
1417
- avg_precision=None,
1418
- model_name=None,
1419
- lw=2,
1420
- figsize=[5, 5],
1421
- title="Precision-Recall Curve",
1422
- xlabel="Recall",
1423
- ylabel="Precision",
1424
- alpha=0.1,
1425
- color="#FF8F00",
1426
- legend_loc="lower left",
1427
- ax=None,
1428
- **kwargs,
1429
- ):
1430
- if ax is None:
1431
- fig, ax = plt.subplots(figsize=figsize)
1432
- model_name = "PR curve" if model_name is None else model_name
1433
- # Plot Precision-Recall curve
1434
- ax.plot(
1435
- recall,
1436
- precision,
1437
- lw=lw,
1438
- color=color,
1439
- label=(f"{model_name} (AP={avg_precision:.2f})"),
1440
- clip_on=False,
1441
- **kwargs,
1442
- )
1443
- # Fill area under the curve
1444
- ax.fill_between(recall, precision, alpha=alpha, color=color)
1445
-
1446
- # Customize axes
1447
- ax.set_title(title)
1448
- ax.set_xlabel(xlabel)
1449
- ax.set_ylabel(ylabel)
1450
- ax.set_xlim([-0.01, 1.0])
1451
- ax.set_ylim([0.0, 1.0])
1452
- ax.grid(False)
1453
- ax.legend(loc=legend_loc)
1454
- return ax
1455
-
1456
- # * usage: ml2ls.plot_pr_curve()
1457
- # for md_name in flatten(validation_results["pr_curve"].keys()):
1458
- # ml2ls.plot_pr_curve(
1459
- # recall=validation_results["pr_curve"][md_name]["recall"],
1460
- # precision=validation_results["pr_curve"][md_name]["precision"],
1461
- # avg_precision=validation_results["pr_curve"][md_name]["avg_precision"],
1462
- # model_name=md_name,
1463
- # lw=2,
1464
- # alpha=0.1,
1465
- # color="r",
1466
- # )
1467
-
1468
- def plot_pr_binary(
1469
- recall=None,
1470
- precision=None,
1471
- avg_precision=None,
1472
- model_name=None,
1473
- lw=2,
1474
- figsize=[5, 5],
1475
- title="Precision-Recall Curve",
1476
- xlabel="Recall",
1477
- ylabel="Precision",
1478
- alpha=0.1,
1479
- color="#FF8F00",
1480
- legend_loc="lower left",
1481
- ax=None,
1482
- show_avg_precision=False,
1483
- **kwargs,
1484
- ):
1485
- from scipy.interpolate import interp1d
1486
- if ax is None:
1487
- fig, ax = plt.subplots(figsize=figsize)
1488
- model_name = "Binary PR Curve" if model_name is None else model_name
1489
-
1490
- #* use sklearn bulitin function 'PrecisionRecallDisplay'?
1491
- # from sklearn.metrics import PrecisionRecallDisplay
1492
- # disp = PrecisionRecallDisplay(precision=precision,
1493
- # recall=recall,
1494
- # average_precision=avg_precision,**kwargs)
1495
- # disp.plot(ax=ax, name=model_name, color=color)
1496
-
1497
- # Plot Precision-Recall curve
1498
- ax.plot(
1499
- recall,
1500
- precision,
1501
- lw=lw,
1502
- color=color,
1503
- label=(f"{model_name} (AP={avg_precision:.2f})"),
1504
- clip_on=False,
1505
- **kwargs,
1506
- )
1507
-
1508
- # Fill area under the curve
1509
- ax.fill_between(recall, precision, alpha=alpha, color=color)
1510
- # Add F1 score iso-contours
1511
- f_scores = np.linspace(0.2, 0.8, num=4)
1512
- # for f_score in f_scores:
1513
- # x = np.linspace(0.01, 1)
1514
- # y = f_score * x / (2 * x - f_score)
1515
- # plt.plot(x[y >= 0], y[y >= 0], color="gray", alpha=1)
1516
- # plt.annotate(f"$f_1={f_score:0.1f}$", xy=(0.8, y[45] + 0.02))
1517
-
1518
- pr_boundary = interp1d(recall, precision, kind="linear", fill_value="extrapolate")
1519
- for f_score in f_scores:
1520
- x_vals = np.linspace(0.01, 1, 10000)
1521
- y_vals = f_score * x_vals / (2 * x_vals - f_score)
1522
- y_vals_clipped = np.minimum(y_vals, pr_boundary(x_vals))
1523
- y_vals_clipped = np.clip(y_vals_clipped, 1e-3, None) # Prevent going to zero
1524
- valid = y_vals_clipped < pr_boundary(x_vals)
1525
- valid_ = y_vals_clipped > 1e-3
1526
- valid = valid&valid_
1527
- x_vals = x_vals[valid]
1528
- y_vals_clipped = y_vals_clipped[valid]
1529
- if len(x_vals) > 0: # Ensure annotation is placed only if line segment exists
1530
- ax.plot(x_vals, y_vals_clipped, color="gray", alpha=1)
1531
- plt.annotate(f"$f_1={f_score:0.1f}$", xy=(0.8, y_vals_clipped[-int(len(y_vals_clipped)*0.35)] + 0.02))
1532
-
1533
-
1534
- # # Plot the average precision line
1535
- if show_avg_precision:
1536
- plt.axhline(
1537
- y=avg_precision,
1538
- color="red",
1539
- ls="--",
1540
- lw=lw,
1541
- label=f"Avg. precision={avg_precision:.2f}",
1542
- )
1543
- # Customize axes
1544
- ax.set_title(title)
1545
- ax.set_xlabel(xlabel)
1546
- ax.set_ylabel(ylabel)
1547
- ax.set_xlim([-0.01, 1.0])
1548
- ax.set_ylim([0.0, 1.0])
1549
- ax.grid(False)
1550
- ax.legend(loc=legend_loc)
1551
- return ax
1552
-
1553
- def plot_cm(
1554
- cm,
1555
- labels_name=None,
1556
- thresh=0.8,
1557
- axis_labels=None,
1558
- cmap="Reds",
1559
- normalize=True,
1560
- xlabel="Predicted Label",
1561
- ylabel="Actual Label",
1562
- fontsize=12,
1563
- figsize=[5, 5],
1564
- ax=None,
1565
- ):
1566
- if ax is None:
1567
- fig, ax = plt.subplots(figsize=figsize)
1568
-
1569
- cm_normalized = np.round(
1570
- cm.astype("float") / cm.sum(axis=1)[:, np.newaxis] * 100, 2
1571
- )
1572
- cm_value = cm_normalized if normalize else cm.astype("int")
1573
- # Plot the heatmap
1574
- cax = ax.imshow(cm_normalized, interpolation="nearest", cmap=cmap)
1575
- plt.colorbar(cax, ax=ax, fraction=0.046, pad=0.04)
1576
- cax.set_clim(0, 100)
1577
-
1578
- # Define tick labels based on provided labels
1579
- num_local = np.arange(len(labels_name)) if labels_name is not None else range(2)
1580
- if axis_labels is None:
1581
- axis_labels = labels_name if labels_name is not None else ["No", "Yes"]
1582
- ax.set_xticks(num_local)
1583
- ax.set_xticklabels(axis_labels)
1584
- ax.set_yticks(num_local)
1585
- ax.set_yticklabels(axis_labels)
1586
- ax.set_ylabel(ylabel)
1587
- ax.set_xlabel(xlabel)
1588
-
1589
- # Add TN, FP, FN, TP annotations specifically for binary classification (2x2 matrix)
1590
- if labels_name is None or len(labels_name) == 2:
1591
- # True Negative (TN), False Positive (FP), False Negative (FN), and True Positive (TP)
1592
- # Predicted
1593
- # 0 | 1
1594
- # ----------------
1595
- # 0 | TN | FP
1596
- # Actual ----------------
1597
- # 1 | FN | TP
1598
- tn_label = "TN"
1599
- fp_label = "FP"
1600
- fn_label = "FN"
1601
- tp_label = "TP"
1602
-
1603
- # Adjust positions slightly for TN, FP, FN, TP labels
1604
- ax.text(
1605
- 0,
1606
- 0,
1607
- (
1608
- f"{tn_label}:{cm_normalized[0, 0]:.2f}%"
1609
- if normalize
1610
- else f"{tn_label}:{cm_value[0, 0]}"
1611
- ),
1612
- ha="center",
1613
- va="center",
1614
- color="white" if cm_normalized[0, 0] > thresh * 100 else "black",
1615
- fontsize=fontsize,
1616
- )
1617
- ax.text(
1618
- 1,
1619
- 0,
1620
- (
1621
- f"{fp_label}:{cm_normalized[0, 1]:.2f}%"
1622
- if normalize
1623
- else f"{fp_label}:{cm_value[0, 1]}"
1624
- ),
1625
- ha="center",
1626
- va="center",
1627
- color="white" if cm_normalized[0, 1] > thresh * 100 else "black",
1628
- fontsize=fontsize,
1629
- )
1630
- ax.text(
1631
- 0,
1632
- 1,
1633
- (
1634
- f"{fn_label}:{cm_normalized[1, 0]:.2f}%"
1635
- if normalize
1636
- else f"{fn_label}:{cm_value[1, 0]}"
1637
- ),
1638
- ha="center",
1639
- va="center",
1640
- color="white" if cm_normalized[1, 0] > thresh * 100 else "black",
1641
- fontsize=fontsize,
1642
- )
1643
- ax.text(
1644
- 1,
1645
- 1,
1646
- (
1647
- f"{tp_label}:{cm_normalized[1, 1]:.2f}%"
1648
- if normalize
1649
- else f"{tp_label}:{cm_value[1, 1]}"
1650
- ),
1651
- ha="center",
1652
- va="center",
1653
- color="white" if cm_normalized[1, 1] > thresh * 100 else "black",
1654
- fontsize=fontsize,
1655
- )
1656
- else:
1657
- # Annotate cells with normalized percentage values
1658
- for i in range(len(labels_name)):
1659
- for j in range(len(labels_name)):
1660
- val = cm_normalized[i, j]
1661
- color = "white" if val > thresh * 100 else "black"
1662
- ax.text(
1663
- j,
1664
- i,
1665
- f"{val:.2f}%",
1666
- ha="center",
1667
- va="center",
1668
- color=color,
1669
- fontsize=fontsize,
1670
- )
1671
-
1672
- plot.figsets(ax=ax, boxloc="none")
1673
- return ax
1674
-
1675
-
1676
- def rank_models(
1677
- cv_test_scores,
1678
- rm_outlier=False,
1679
- metric_weights=None,
1680
- plot_=True,
1681
- ):
1682
- """
1683
- Selects the best model based on a multi-metric scoring approach, with outlier handling, optional visualization,
1684
- and additional performance metrics.
1685
-
1686
- Parameters:
1687
- - cv_test_scores (pd.DataFrame): DataFrame with cross-validation results across multiple metrics.
1688
- Assumes columns are 'Classifier', 'accuracy', 'precision', 'recall', 'f1', 'roc_auc'.
1689
- - metric_weights (dict): Dictionary specifying weights for each metric (e.g., {'accuracy': 0.2, 'precision': 0.3, ...}).
1690
- If None, default weights are applied equally across available metrics.
1691
- a. equal_weights(standard approch): 所有的metrics同等重要
1692
- e.g., {"accuracy": 0.2, "precision": 0.2, "recall": 0.2, "f1": 0.2, "roc_auc": 0.2}
1693
- b. accuracy_focosed: classification correctness (e.g., in balanced datasets), accuracy might be weighted more heavily.
1694
- e.g., {"accuracy": 0.4, "precision": 0.2, "recall": 0.2, "f1": 0.1, "roc_auc": 0.1}
1695
- c. Precision and Recall Emphasis: In cases where false positives and false negatives are particularly important (such as
1696
- in medical applications or fraud detection), precision and recall may be weighted more heavily.
1697
- e.g., {"accuracy": 0.2, "precision": 0.3, "recall": 0.3, "f1": 0.1, "roc_auc": 0.1}
1698
- d. F1-Focused: When balance between precision and recall is crucial (e.g., in imbalanced datasets)
1699
- e.g., {"accuracy": 0.2, "precision": 0.2, "recall": 0.2, "f1": 0.3, "roc_auc": 0.1}
1700
- e. ROC-AUC Emphasis: In some cases, ROC AUC may be prioritized, particularly in classification tasks where class imbalance
1701
- is present, as ROC AUC accounts for the model's performance across all classification thresholds.
1702
- e.g., {"accuracy": 0.1, "precision": 0.2, "recall": 0.2, "f1": 0.3, "roc_auc": 0.3}
1703
-
1704
- - normalize (bool): Whether to normalize scores of each metric to range [0, 1].
1705
- - visualize (bool): If True, generates visualizations (e.g., bar plot, radar chart).
1706
- - outlier_threshold (float): The threshold to detect outliers using the IQR method. Default is 1.5.
1707
- - cv_folds (int): The number of cross-validation folds used.
1708
-
1709
- Returns:
1710
- - best_model (str): Name of the best model based on the combined metric scores.
1711
- - scored_df (pd.DataFrame): DataFrame with an added 'combined_score' column used for model selection.
1712
- - visualizations (dict): A dictionary containing visualizations if `visualize=True`.
1713
- """
1714
- from sklearn.preprocessing import MinMaxScaler
1715
- import seaborn as sns
1716
- import matplotlib.pyplot as plt
1717
- from py2ls import plot
1718
-
1719
- # Check for missing metrics and set default weights if not provided
1720
- available_metrics = cv_test_scores.columns[1:] # Exclude 'Classifier' column
1721
- if metric_weights is None:
1722
- metric_weights = {
1723
- metric: 1 / len(available_metrics) for metric in available_metrics
1724
- } # Equal weight if not specified
1725
- elif metric_weights == "a":
1726
- metric_weights = {
1727
- "accuracy": 0.2,
1728
- "precision": 0.2,
1729
- "recall": 0.2,
1730
- "f1": 0.2,
1731
- "roc_auc": 0.2,
1732
- }
1733
- elif metric_weights == "b":
1734
- metric_weights = {
1735
- "accuracy": 0.4,
1736
- "precision": 0.2,
1737
- "recall": 0.2,
1738
- "f1": 0.1,
1739
- "roc_auc": 0.1,
1740
- }
1741
- elif metric_weights == "c":
1742
- metric_weights = {
1743
- "accuracy": 0.2,
1744
- "precision": 0.3,
1745
- "recall": 0.3,
1746
- "f1": 0.1,
1747
- "roc_auc": 0.1,
1748
- }
1749
- elif metric_weights == "d":
1750
- metric_weights = {
1751
- "accuracy": 0.2,
1752
- "precision": 0.2,
1753
- "recall": 0.2,
1754
- "f1": 0.3,
1755
- "roc_auc": 0.1,
1756
- }
1757
- elif metric_weights == "e":
1758
- metric_weights = {
1759
- "accuracy": 0.1,
1760
- "precision": 0.2,
1761
- "recall": 0.2,
1762
- "f1": 0.3,
1763
- "roc_auc": 0.3,
1764
- }
1765
- else:
1766
- metric_weights = {
1767
- metric: 1 / len(available_metrics) for metric in available_metrics
1768
- }
1769
-
1770
- # Normalize weights if they don’t sum to 1
1771
- total_weight = sum(metric_weights.values())
1772
- metric_weights = {
1773
- metric: weight / total_weight for metric, weight in metric_weights.items()
1774
- }
1775
- if rm_outlier:
1776
- cv_test_scores_ = ips.df_outlier(cv_test_scores)
1777
- else:
1778
- cv_test_scores_ = cv_test_scores
1779
-
1780
- # Normalize the scores of metrics if normalize is True
1781
- scaler = MinMaxScaler()
1782
- normalized_scores = pd.DataFrame(
1783
- scaler.fit_transform(cv_test_scores_[available_metrics]),
1784
- columns=available_metrics,
1785
- )
1786
- cv_test_scores_ = pd.concat(
1787
- [cv_test_scores_[["Classifier"]], normalized_scores], axis=1
1788
- )
1789
-
1790
- # Calculate weighted scores for each model
1791
- cv_test_scores_["combined_score"] = sum(
1792
- cv_test_scores_[metric] * weight for metric, weight in metric_weights.items()
1793
- )
1794
- top_models = cv_test_scores_.sort_values(by="combined_score", ascending=False)
1795
- cv_test_scores = cv_test_scores.loc[top_models.index]
1796
- top_models.reset_index(drop=True, inplace=True)
1797
- cv_test_scores.reset_index(drop=True, inplace=True)
1798
-
1799
- if plot_:
1800
-
1801
- def generate_bar_plot(ax, cv_test_scores):
1802
- ax = plot.plotxy(
1803
- y="Classifier", x="combined_score", data=cv_test_scores, kind="bar"
1804
- )
1805
- plt.title("Classifier Performance")
1806
- plt.tight_layout()
1807
- return plt
1808
-
1809
- nexttile = plot.subplot(2, 2, figsize=[10, 7])
1810
- generate_bar_plot(nexttile(), top_models.dropna())
1811
- plot.radar(
1812
- ax=nexttile(projection="polar"),
1813
- data=cv_test_scores.set_index("Classifier"),
1814
- ylim=[0.5, 1],
1815
- color=plot.get_color(10),
1816
- alpha=0.05,
1817
- circular=1,
1818
- )
1819
- return cv_test_scores
1820
-
1821
-
1822
- # # Example Usage:
1823
- # metric_weights = {
1824
- # "accuracy": 0.2,
1825
- # "precision": 0.3,
1826
- # "recall": 0.2,
1827
- # "f1": 0.2,
1828
- # "roc_auc": 0.1,
1829
- # }
1830
- # cv_test_scores = res["cv_test_scores"].copy()
1831
- # best_model = rank_models(
1832
- # cv_test_scores, metric_weights=metric_weights, normalize=True, plot_=True
1833
- # )
1834
-
1835
- # figsave("classifier_performance.pdf")
1836
-
1837
-
1838
- def predict(
1839
- x_train: pd.DataFrame,
1840
- y_train: pd.Series,
1841
- x_true: pd.DataFrame = None,
1842
- y_true: Optional[pd.Series] = None,
1843
- common_features: set = None,
1844
- purpose: str = "classification", # 'classification' or 'regression'
1845
- cls: Optional[Dict[str, Any]] = None,
1846
- metrics: Optional[List[str]] = None,
1847
- random_state: int = 1,
1848
- smote: bool = False,
1849
- n_jobs: int = -1,
1850
- plot_: bool = True,
1851
- dir_save: str = "./",
1852
- test_size: float = 0.2, # specific only when x_true is None
1853
- cv_folds: int = 5, # more cv_folds 得更加稳定,auc可能更低
1854
- cv_level: str = "l", # "s":'low',"m":'medium',"l":"high"
1855
- class_weight: str = "balanced",
1856
- verbose: bool = False,
1857
- ) -> pd.DataFrame:
1858
- """
1859
- 第一种情况是内部拆分,第二种是直接预测,第三种是外部验证。
1860
- Usage:
1861
- (1). predict(x_train, y_train,...) 对 x_train 进行拆分训练/测试集,并在测试集上进行验证.
1862
- predict 函数会根据 test_size 参数,将 x_train 和 y_train 拆分出内部测试集。然后模型会在拆分出的训练集上进行训练,并在测试集上验证效果。
1863
- (2). predict(x_train, y_train, x_true,...)使用 x_train 和 y_train 训练并对 x_true 进行预测
1864
- 由于传入了 x_true,函数会跳过 x_train 的拆分,直接使用全部的 x_train 和 y_train 进行训练。然后对 x_true 进行预测,但由于没有提供 y_true,
1865
- 因此无法与真实值进行对比。
1866
- (3). predict(x_train, y_train, x_true, y_true,...)使用 x_train 和 y_train 训练,并验证 x_true 与真实标签 y_true.
1867
- predict 函数会在 x_train 和 y_train 上进行训练,并将 x_true 作为测试集。由于提供了 y_true,函数可以将预测结果与 y_true 进行对比,从而
1868
- 计算验证指标,完成对 x_true 的真正验证。
1869
- trains and validates a variety of machine learning models for both classification and regression tasks.
1870
- It supports hyperparameter tuning with grid search and includes additional features like cross-validation,
1871
- feature scaling, and handling of class imbalance through SMOTE.
1872
-
1873
- Parameters:
1874
- - x_train (pd.DataFrame):Training feature data, structured with each row as an observation and each column as a feature.
1875
- - y_train (pd.Series):Target variable for the training dataset.
1876
- - x_true (pd.DataFrame, optional):Test feature data. If not provided, the function splits x_train based on test_size.
1877
- - y_true (pd.Series, optional):Test target values. If not provided, y_train is split into training and testing sets.
1878
- - common_features (set, optional):Specifies a subset of features common across training and test data.
1879
- - purpose (str, default = "classification"):Defines whether the task is "classification" or "regression". Determines which
1880
- metrics and models are applied.
1881
- - cls (dict, optional):Dictionary to specify custom classifiers/regressors. Defaults to a set of common models if not provided.
1882
- - metrics (list, optional):List of evaluation metrics (like accuracy, F1 score) used for model evaluation.
1883
- - random_state (int, default = 1):Random seed to ensure reproducibility.
1884
- - smote (bool, default = False):Applies Synthetic Minority Oversampling Technique (SMOTE) to address class imbalance if enabled.
1885
- - n_jobs (int, default = -1):Number of parallel jobs for computation. Set to -1 to use all available cores.
1886
- - plot_ (bool, default = True):If True, generates plots of the model evaluation metrics.
1887
- - test_size (float, default = 0.2):Test data proportion if x_true is not provided.
1888
- - cv_folds (int, default = 5):Number of cross-validation folds.
1889
- - cv_level (str, default = "l"):Sets the detail level of cross-validation. "s" for low, "m" for medium, and "l" for high.
1890
- - class_weight (str, default = "balanced"):Balances class weights in classification tasks.
1891
- - verbose (bool, default = False):If True, prints detailed output during model training.
1892
- - dir_save (str, default = "./"):Directory path to save plot outputs and results.
1893
-
1894
- Key Steps in the Function:
1895
- Model Initialization: Depending on purpose, initializes either classification or regression models.
1896
- Feature Selection: Ensures training and test sets have matching feature columns.
1897
- SMOTE Application: Balances classes if smote is enabled and the task is classification.
1898
- Cross-Validation and Hyperparameter Tuning: Utilizes GridSearchCV for model tuning based on cv_level.
1899
- Evaluation and Plotting: Outputs evaluation metrics like AUC, confusion matrices, and optional plotting of performance metrics.
1900
- """
1901
- from tqdm import tqdm
1902
- from sklearn.ensemble import (
1903
- RandomForestClassifier,
1904
- RandomForestRegressor,
1905
- ExtraTreesClassifier,
1906
- ExtraTreesRegressor,
1907
- BaggingClassifier,
1908
- BaggingRegressor,
1909
- AdaBoostClassifier,
1910
- AdaBoostRegressor,
1911
- )
1912
- from sklearn.svm import SVC, SVR
1913
- from sklearn.tree import DecisionTreeRegressor
1914
- from sklearn.linear_model import (
1915
- LogisticRegression,
1916
- ElasticNet,
1917
- ElasticNetCV,
1918
- LinearRegression,
1919
- Lasso,
1920
- RidgeClassifierCV,
1921
- Perceptron,
1922
- SGDClassifier,
1923
- )
1924
- from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
1925
- from sklearn.naive_bayes import GaussianNB, BernoulliNB
1926
- from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
1927
- import xgboost as xgb
1928
- import lightgbm as lgb
1929
- import catboost as cb
1930
- from sklearn.neural_network import MLPClassifier, MLPRegressor
1931
- from sklearn.model_selection import GridSearchCV, StratifiedKFold, KFold
1932
- from sklearn.discriminant_analysis import (
1933
- LinearDiscriminantAnalysis,
1934
- QuadraticDiscriminantAnalysis,
1935
- )
1936
- from sklearn.preprocessing import PolynomialFeatures
1937
-
1938
- # 拼写检查
1939
- purpose = ips.strcmp(purpose, ["classification", "regression"])[0]
1940
- print(f"{purpose} processing...")
1941
- # Default models or regressors if not provided
1942
- if purpose == "classification":
1943
- model_ = {
1944
- "Random Forest": RandomForestClassifier(
1945
- random_state=random_state, class_weight=class_weight
1946
- ),
1947
- # SVC (Support Vector Classification)
1948
- "SVM": SVC(
1949
- kernel="rbf",
1950
- probability=True,
1951
- class_weight=class_weight,
1952
- random_state=random_state,
1953
- ),
1954
- # fit the best model without enforcing sparsity, which means it does not directly perform feature selection.
1955
- "Logistic Regression": LogisticRegression(
1956
- class_weight=class_weight, random_state=random_state
1957
- ),
1958
- # Logistic Regression with L1 Regularization (Lasso)
1959
- "Lasso Logistic Regression": LogisticRegression(
1960
- penalty="l1", solver="saga", random_state=random_state
1961
- ),
1962
- "Gradient Boosting": GradientBoostingClassifier(random_state=random_state),
1963
- "XGBoost": xgb.XGBClassifier(
1964
- eval_metric="logloss",
1965
- random_state=random_state,
1966
- ),
1967
- "KNN": KNeighborsClassifier(n_neighbors=5),
1968
- "Naive Bayes": GaussianNB(),
1969
- "Linear Discriminant Analysis": LinearDiscriminantAnalysis(),
1970
- "AdaBoost": AdaBoostClassifier(
1971
- algorithm="SAMME", random_state=random_state
1972
- ),
1973
- # "LightGBM": lgb.LGBMClassifier(random_state=random_state, class_weight=class_weight),
1974
- "CatBoost": cb.CatBoostClassifier(verbose=0, random_state=random_state),
1975
- "Extra Trees": ExtraTreesClassifier(
1976
- random_state=random_state, class_weight=class_weight
1977
- ),
1978
- "Bagging": BaggingClassifier(random_state=random_state),
1979
- "Neural Network": MLPClassifier(max_iter=500, random_state=random_state),
1980
- "DecisionTree": DecisionTreeClassifier(),
1981
- "Quadratic Discriminant Analysis": QuadraticDiscriminantAnalysis(),
1982
- "Ridge": RidgeClassifierCV(
1983
- class_weight=class_weight, store_cv_results=True
1984
- ),
1985
- "Perceptron": Perceptron(random_state=random_state),
1986
- "Bernoulli Naive Bayes": BernoulliNB(),
1987
- "SGDClassifier": SGDClassifier(random_state=random_state),
1988
- }
1989
- elif purpose == "regression":
1990
- model_ = {
1991
- "Random Forest": RandomForestRegressor(random_state=random_state),
1992
- "SVM": SVR(), # SVR (Support Vector Regression)
1993
- # "Lasso": Lasso(random_state=random_state), # 它和LassoCV相同(必须要提供alpha参数),
1994
- "LassoCV": LassoCV(
1995
- cv=cv_folds, random_state=random_state
1996
- ), # LassoCV自动找出最适alpha,优于Lasso
1997
- "Gradient Boosting": GradientBoostingRegressor(random_state=random_state),
1998
- "XGBoost": xgb.XGBRegressor(eval_metric="rmse", random_state=random_state),
1999
- "Linear Regression": LinearRegression(),
2000
- "Lasso": Lasso(random_state=random_state),
2001
- "AdaBoost": AdaBoostRegressor(random_state=random_state),
2002
- # "LightGBM": lgb.LGBMRegressor(random_state=random_state),
2003
- "CatBoost": cb.CatBoostRegressor(verbose=0, random_state=random_state),
2004
- "Extra Trees": ExtraTreesRegressor(random_state=random_state),
2005
- "Bagging": BaggingRegressor(random_state=random_state),
2006
- "Neural Network": MLPRegressor(max_iter=500, random_state=random_state),
2007
- "ElasticNet": ElasticNet(random_state=random_state),
2008
- "Ridge": Ridge(),
2009
- "KNN": KNeighborsRegressor(),
2010
- }
2011
- # indicate cls:
2012
- if ips.run_once_within(30): # 10 min
2013
- print(f"supported models: {list(model_.keys())}")
2014
- if cls is None:
2015
- models = model_
2016
- else:
2017
- if not isinstance(cls, list):
2018
- cls = [cls]
2019
- models = {}
2020
- for cls_ in cls:
2021
- cls_ = ips.strcmp(cls_, list(model_.keys()))[0]
2022
- models[cls_] = model_[cls_]
2023
- if "LightGBM" in models:
2024
- x_train = ips.df_special_characters_cleaner(x_train)
2025
- x_true = (
2026
- ips.df_special_characters_cleaner(x_true) if x_true is not None else None
2027
- )
2028
-
2029
- if isinstance(y_train, str) and y_train in x_train.columns:
2030
- y_train_col_name = y_train
2031
- y_train = x_train[y_train]
2032
- # y_train = ips.df_encoder(pd.DataFrame(y_train), method="dummy")
2033
- x_train = x_train.drop(y_train_col_name, axis=1)
2034
- # else:
2035
- # y_train = ips.df_encoder(pd.DataFrame(y_train), method="dummy").values.ravel()
2036
- y_train=pd.DataFrame(y_train)
2037
- y_train_=ips.df_encoder(y_train, method="dummy")
2038
- is_binary = False if y_train_.shape[1] >1 else True
2039
- print(is_binary)
2040
- if is_binary:
2041
- y_train = ips.df_encoder(pd.DataFrame(y_train), method="dummy").values.ravel()
2042
- if x_true is None:
2043
- x_train, x_true, y_train, y_true = train_test_split(
2044
- x_train,
2045
- y_train,
2046
- test_size=test_size,
2047
- random_state=random_state,
2048
- stratify=y_train if purpose == "classification" else None,
2049
- )
2050
- if isinstance(y_train, str) and y_train in x_train.columns:
2051
- y_train_col_name = y_train
2052
- y_train = x_train[y_train]
2053
- y_train = ips.df_encoder(pd.DataFrame(y_train), method="dummy")
2054
- x_train = x_train.drop(y_train_col_name, axis=1)
2055
- else:
2056
- y_train = ips.df_encoder(
2057
- pd.DataFrame(y_train), method="dummy"
2058
- ).values.ravel()
2059
-
2060
- if y_true is not None:
2061
- if isinstance(y_true, str) and y_true in x_true.columns:
2062
- y_true_col_name = y_true
2063
- y_true = x_true[y_true]
2064
- # y_true = ips.df_encoder(pd.DataFrame(y_true), method="dummy")
2065
- y_true = pd.DataFrame(y_true)
2066
- x_true = x_true.drop(y_true_col_name, axis=1)
2067
- # else:
2068
- # y_true = ips.df_encoder(pd.DataFrame(y_true), method="dummy").values.ravel()
2069
-
2070
- # to convert the 2D to 1D: 2D column-vector format (like [[1], [0], [1], ...]) instead of a 1D array ([1, 0, 1, ...]
2071
-
2072
- # y_train=y_train.values.ravel() if y_train is not None else None
2073
- # y_true=y_true.values.ravel() if y_true is not None else None
2074
- y_train = (
2075
- y_train.ravel() if isinstance(y_train, np.ndarray) else y_train.values.ravel()
2076
- )
2077
- print(len(y_train),len(y_true))
2078
- y_true = y_true.ravel() if isinstance(y_true, np.ndarray) else y_true.values.ravel()
2079
- print(len(y_train),len(y_true))
2080
- # Ensure common features are selected
2081
- if common_features is not None:
2082
- x_train, x_true = x_train[common_features], x_true[common_features]
2083
- else:
2084
- share_col_names = ips.shared(x_train.columns, x_true.columns, verbose=verbose)
2085
- x_train, x_true = x_train[share_col_names], x_true[share_col_names]
2086
-
2087
- x_train, x_true = ips.df_scaler(x_train), ips.df_scaler(x_true)
2088
- x_train, x_true = ips.df_encoder(x_train, method="dummy"), ips.df_encoder(x_true, method="dummy")
2089
- # Handle class imbalance using SMOTE (only for classification)
2090
- if (
2091
- smote
2092
- and purpose == "classification"
2093
- and y_train.value_counts(normalize=True).max() < 0.8
2094
- ):
2095
- from imblearn.over_sampling import SMOTE
2096
-
2097
- smote_sampler = SMOTE(random_state=random_state)
2098
- x_train, y_train = smote_sampler.fit_resample(x_train, y_train)
2099
-
2100
- # Hyperparameter grids for tuning
2101
- if cv_level in ["low", "simple", "s", "l"]:
2102
- param_grids = {
2103
- "Random Forest": (
2104
- {
2105
- "n_estimators": [100], # One basic option
2106
- "max_depth": [None, 10],
2107
- "min_samples_split": [2],
2108
- "min_samples_leaf": [1],
2109
- "class_weight": [None],
2110
- }
2111
- if purpose == "classification"
2112
- else {
2113
- "n_estimators": [100], # One basic option
2114
- "max_depth": [None, 10],
2115
- "min_samples_split": [2],
2116
- "min_samples_leaf": [1],
2117
- "max_features": [None],
2118
- "bootstrap": [True], # Only one option for simplicity
2119
- }
2120
- ),
2121
- "SVM": {
2122
- "C": [1],
2123
- "gamma": ["scale"],
2124
- "kernel": ["rbf"],
2125
- },
2126
- "Lasso": {
2127
- "alpha": [0.1],
2128
- },
2129
- "LassoCV": {
2130
- "alphas": [[0.1]],
2131
- },
2132
- "Logistic Regression": {
2133
- "C": [1],
2134
- "solver": ["lbfgs"],
2135
- "penalty": ["l2"],
2136
- "max_iter": [500],
2137
- },
2138
- "Gradient Boosting": {
2139
- "n_estimators": [100],
2140
- "learning_rate": [0.1],
2141
- "max_depth": [3],
2142
- "min_samples_split": [2],
2143
- "subsample": [0.8],
2144
- },
2145
- "XGBoost": {
2146
- "n_estimators": [100],
2147
- "max_depth": [3],
2148
- "learning_rate": [0.1],
2149
- "subsample": [0.8],
2150
- "colsample_bytree": [0.8],
2151
- },
2152
- "KNN": (
2153
- {
2154
- "n_neighbors": [3],
2155
- "weights": ["uniform"],
2156
- "algorithm": ["auto"],
2157
- "p": [2],
2158
- }
2159
- if purpose == "classification"
2160
- else {
2161
- "n_neighbors": [3],
2162
- "weights": ["uniform"],
2163
- "metric": ["euclidean"],
2164
- "leaf_size": [30],
2165
- "p": [2],
2166
- }
2167
- ),
2168
- "Naive Bayes": {
2169
- "var_smoothing": [1e-9],
2170
- },
2171
- "SVR": {
2172
- "C": [1],
2173
- "gamma": ["scale"],
2174
- "kernel": ["rbf"],
2175
- },
2176
- "Linear Regression": {
2177
- "fit_intercept": [True],
2178
- },
2179
- "Extra Trees": {
2180
- "n_estimators": [100],
2181
- "max_depth": [None, 10],
2182
- "min_samples_split": [2],
2183
- "min_samples_leaf": [1],
2184
- },
2185
- "CatBoost": {
2186
- "iterations": [100],
2187
- "learning_rate": [0.1],
2188
- "depth": [3],
2189
- "l2_leaf_reg": [1],
2190
- },
2191
- "LightGBM": {
2192
- "n_estimators": [100],
2193
- "num_leaves": [31],
2194
- "max_depth": [10],
2195
- "min_data_in_leaf": [20],
2196
- "min_gain_to_split": [0.01],
2197
- "scale_pos_weight": [10],
2198
- },
2199
- "Bagging": {
2200
- "n_estimators": [50],
2201
- "max_samples": [0.7],
2202
- "max_features": [0.7],
2203
- },
2204
- "Neural Network": {
2205
- "hidden_layer_sizes": [(50,)],
2206
- "activation": ["relu"],
2207
- "solver": ["adam"],
2208
- "alpha": [0.0001],
2209
- },
2210
- "Decision Tree": {
2211
- "max_depth": [None, 10],
2212
- "min_samples_split": [2],
2213
- "min_samples_leaf": [1],
2214
- "criterion": ["gini"],
2215
- },
2216
- "AdaBoost": {
2217
- "n_estimators": [50],
2218
- "learning_rate": [0.5],
2219
- },
2220
- "Linear Discriminant Analysis": {
2221
- "solver": ["svd"],
2222
- "shrinkage": [None],
2223
- },
2224
- "Quadratic Discriminant Analysis": {
2225
- "reg_param": [0.0],
2226
- "priors": [None],
2227
- "tol": [1e-4],
2228
- },
2229
- "Ridge": (
2230
- {"class_weight": [None, "balanced"]}
2231
- if purpose == "classification"
2232
- else {
2233
- "alpha": [0.1, 1, 10],
2234
- }
2235
- ),
2236
- "Perceptron": {
2237
- "alpha": [1e-3],
2238
- "penalty": ["l2"],
2239
- "max_iter": [1000],
2240
- "eta0": [1.0],
2241
- },
2242
- "Bernoulli Naive Bayes": {
2243
- "alpha": [0.1, 1, 10],
2244
- "binarize": [0.0],
2245
- "fit_prior": [True],
2246
- },
2247
- "SGDClassifier": {
2248
- "eta0": [0.01],
2249
- "loss": ["hinge"],
2250
- "penalty": ["l2"],
2251
- "alpha": [1e-3],
2252
- "max_iter": [1000],
2253
- "tol": [1e-3],
2254
- "random_state": [random_state],
2255
- "learning_rate": ["constant"],
2256
- },
2257
- }
2258
- elif cv_level in ["high", "advanced", "h"]:
2259
- param_grids = {
2260
- "Random Forest": (
2261
- {
2262
- "n_estimators": [100, 200, 500, 700, 1000],
2263
- "max_depth": [None, 3, 5, 10, 15, 20, 30],
2264
- "min_samples_split": [2, 5, 10, 20],
2265
- "min_samples_leaf": [1, 2, 4],
2266
- "class_weight": (
2267
- [None, "balanced"] if purpose == "classification" else {}
2268
- ),
2269
- }
2270
- if purpose == "classification"
2271
- else {
2272
- "n_estimators": [100, 200, 500, 700, 1000],
2273
- "max_depth": [None, 3, 5, 10, 15, 20, 30],
2274
- "min_samples_split": [2, 5, 10, 20],
2275
- "min_samples_leaf": [1, 2, 4],
2276
- "max_features": [
2277
- "auto",
2278
- "sqrt",
2279
- "log2",
2280
- ], # Number of features to consider when looking for the best split
2281
- "bootstrap": [
2282
- True,
2283
- False,
2284
- ], # Whether bootstrap samples are used when building trees
2285
- }
2286
- ),
2287
- "SVM": {
2288
- "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
2289
- "gamma": ["scale", "auto", 0.001, 0.01, 0.1],
2290
- "kernel": ["linear", "rbf", "poly"],
2291
- },
2292
- "Logistic Regression": {
2293
- "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
2294
- "solver": ["liblinear", "saga", "newton-cg", "lbfgs"],
2295
- "penalty": ["l1", "l2", "elasticnet"],
2296
- "max_iter": [100, 200, 300, 500],
2297
- },
2298
- "Lasso": {
2299
- "alpha": [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
2300
- "max_iter": [500, 1000, 2000, 5000],
2301
- "tol": [1e-4, 1e-5, 1e-6],
2302
- "selection": ["cyclic", "random"],
2303
- },
2304
- "LassoCV": {
2305
- "alphas": [[0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0]],
2306
- "max_iter": [500, 1000, 2000, 5000],
2307
- "cv": [3, 5, 10],
2308
- "tol": [1e-4, 1e-5, 1e-6],
2309
- },
2310
- "Gradient Boosting": {
2311
- "n_estimators": [100, 200, 300, 400, 500, 700, 1000],
2312
- "learning_rate": [0.001, 0.01, 0.1, 0.2, 0.3, 0.5],
2313
- "max_depth": [3, 5, 7, 9, 15],
2314
- "min_samples_split": [2, 5, 10, 20],
2315
- "subsample": [0.8, 1.0],
2316
- },
2317
- "XGBoost": {
2318
- "n_estimators": [100, 200, 500, 700],
2319
- "max_depth": [3, 5, 7, 10],
2320
- "learning_rate": [0.01, 0.1, 0.2, 0.3],
2321
- "subsample": [0.8, 1.0],
2322
- "colsample_bytree": [0.8, 0.9, 1.0],
2323
- },
2324
- "KNN": (
2325
- {
2326
- "n_neighbors": [1, 3, 5, 10, 15, 20],
2327
- "weights": ["uniform", "distance"],
2328
- "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
2329
- "p": [1, 2], # 1 for Manhattan, 2 for Euclidean distance
2330
- }
2331
- if purpose == "classification"
2332
- else {
2333
- "n_neighbors": [3, 5, 7, 9, 11], # Number of neighbors
2334
- "weights": [
2335
- "uniform",
2336
- "distance",
2337
- ], # Weight function used in prediction
2338
- "metric": [
2339
- "euclidean",
2340
- "manhattan",
2341
- "minkowski",
2342
- ], # Distance metric
2343
- "leaf_size": [
2344
- 20,
2345
- 30,
2346
- 40,
2347
- 50,
2348
- ], # Leaf size for KDTree or BallTree algorithms
2349
- "p": [
2350
- 1,
2351
- 2,
2352
- ], # Power parameter for the Minkowski metric (1 = Manhattan, 2 = Euclidean)
2353
- }
2354
- ),
2355
- "Naive Bayes": {
2356
- "var_smoothing": [1e-10, 1e-9, 1e-8, 1e-7],
2357
- },
2358
- "AdaBoost": {
2359
- "n_estimators": [50, 100, 200, 300, 500],
2360
- "learning_rate": [0.001, 0.01, 0.1, 0.5, 1.0],
2361
- },
2362
- "SVR": {
2363
- "C": [0.01, 0.1, 1, 10, 100, 1000],
2364
- "gamma": [0.001, 0.01, 0.1, "scale", "auto"],
2365
- "kernel": ["linear", "rbf", "poly"],
2366
- },
2367
- "Linear Regression": {
2368
- "fit_intercept": [True, False],
2369
- },
2370
- "Lasso": {
2371
- "alpha": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
2372
- "max_iter": [1000, 2000], # Higher iteration limit for fine-tuning
2373
- },
2374
- "Extra Trees": {
2375
- "n_estimators": [100, 200, 500, 700, 1000],
2376
- "max_depth": [None, 5, 10, 15, 20, 30],
2377
- "min_samples_split": [2, 5, 10, 20],
2378
- "min_samples_leaf": [1, 2, 4],
2379
- },
2380
- "CatBoost": {
2381
- "iterations": [100, 200, 500],
2382
- "learning_rate": [0.001, 0.01, 0.1, 0.2],
2383
- "depth": [3, 5, 7, 10],
2384
- "l2_leaf_reg": [1, 3, 5, 7, 10],
2385
- "border_count": [32, 64, 128],
2386
- },
2387
- "LightGBM": {
2388
- "n_estimators": [100, 200, 500, 700, 1000],
2389
- "learning_rate": [0.001, 0.01, 0.1, 0.2],
2390
- "num_leaves": [31, 50, 100, 200],
2391
- "max_depth": [-1, 5, 10, 20, 30],
2392
- "min_child_samples": [5, 10, 20],
2393
- "subsample": [0.8, 1.0],
2394
- "colsample_bytree": [0.8, 0.9, 1.0],
2395
- },
2396
- "Neural Network": {
2397
- "hidden_layer_sizes": [(50,), (100,), (100, 50), (200, 100)],
2398
- "activation": ["relu", "tanh", "logistic"],
2399
- "solver": ["adam", "sgd", "lbfgs"],
2400
- "alpha": [0.0001, 0.001, 0.01],
2401
- "learning_rate": ["constant", "adaptive"],
2402
- },
2403
- "Decision Tree": {
2404
- "max_depth": [None, 5, 10, 20, 30],
2405
- "min_samples_split": [2, 5, 10, 20],
2406
- "min_samples_leaf": [1, 2, 5, 10],
2407
- "criterion": ["gini", "entropy"],
2408
- "splitter": ["best", "random"],
2409
- },
2410
- "Linear Discriminant Analysis": {
2411
- "solver": ["svd", "lsqr", "eigen"],
2412
- "shrinkage": [
2413
- None,
2414
- "auto",
2415
- 0.1,
2416
- 0.5,
2417
- 1.0,
2418
- ], # shrinkage levels for 'lsqr' and 'eigen'
2419
- },
2420
- "Ridge": (
2421
- {"class_weight": [None, "balanced"]}
2422
- if purpose == "classification"
2423
- else {
2424
- "alpha": [0.1, 1, 10, 100, 1000],
2425
- "solver": ["auto", "svd", "cholesky", "lsqr", "lbfgs"],
2426
- "fit_intercept": [
2427
- True,
2428
- False,
2429
- ], # Whether to calculate the intercept
2430
- "normalize": [
2431
- True,
2432
- False,
2433
- ], # If True, the regressors X will be normalized
2434
- }
2435
- ),
2436
- }
2437
- else: # median level
2438
- param_grids = {
2439
- "Random Forest": (
2440
- {
2441
- "n_estimators": [100, 200, 500],
2442
- "max_depth": [None, 10, 20, 30],
2443
- "min_samples_split": [2, 5, 10],
2444
- "min_samples_leaf": [1, 2, 4],
2445
- "class_weight": [None, "balanced"],
2446
- }
2447
- if purpose == "classification"
2448
- else {
2449
- "n_estimators": [100, 200, 500],
2450
- "max_depth": [None, 10, 20, 30],
2451
- "min_samples_split": [2, 5, 10],
2452
- "min_samples_leaf": [1, 2, 4],
2453
- "max_features": [
2454
- "auto",
2455
- "sqrt",
2456
- "log2",
2457
- ], # Number of features to consider when looking for the best split
2458
- "bootstrap": [
2459
- True,
2460
- False,
2461
- ], # Whether bootstrap samples are used when building trees
2462
- }
2463
- ),
2464
- "SVM": {
2465
- "C": [0.1, 1, 10, 100], # Regularization strength
2466
- "gamma": ["scale", "auto"], # Common gamma values
2467
- "kernel": ["rbf", "linear", "poly"],
2468
- },
2469
- "Logistic Regression": {
2470
- "C": [0.1, 1, 10, 100], # Regularization strength
2471
- "solver": ["lbfgs", "liblinear", "saga"], # Common solvers
2472
- "penalty": ["l2"], # L2 penalty is most common
2473
- "max_iter": [
2474
- 500,
2475
- 1000,
2476
- 2000,
2477
- ], # Increased max_iter for better convergence
2478
- },
2479
- "Lasso": {
2480
- "alpha": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
2481
- "max_iter": [500, 1000, 2000],
2482
- },
2483
- "LassoCV": {
2484
- "alphas": [[0.001, 0.01, 0.1, 1.0, 10.0, 100.0]],
2485
- "max_iter": [500, 1000, 2000],
2486
- },
2487
- "Gradient Boosting": {
2488
- "n_estimators": [100, 200, 500],
2489
- "learning_rate": [0.01, 0.1, 0.2],
2490
- "max_depth": [3, 5, 7],
2491
- "min_samples_split": [2, 5, 10],
2492
- "subsample": [0.8, 1.0],
2493
- },
2494
- "XGBoost": {
2495
- "n_estimators": [100, 200, 500],
2496
- "max_depth": [3, 5, 7],
2497
- "learning_rate": [0.01, 0.1, 0.2],
2498
- "subsample": [0.8, 1.0],
2499
- "colsample_bytree": [0.8, 1.0],
2500
- },
2501
- "KNN": (
2502
- {
2503
- "n_neighbors": [3, 5, 7, 10],
2504
- "weights": ["uniform", "distance"],
2505
- "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
2506
- "p": [1, 2],
2507
- }
2508
- if purpose == "classification"
2509
- else {
2510
- "n_neighbors": [3, 5, 7, 9, 11], # Number of neighbors
2511
- "weights": [
2512
- "uniform",
2513
- "distance",
2514
- ], # Weight function used in prediction
2515
- "metric": [
2516
- "euclidean",
2517
- "manhattan",
2518
- "minkowski",
2519
- ], # Distance metric
2520
- "leaf_size": [
2521
- 20,
2522
- 30,
2523
- 40,
2524
- 50,
2525
- ], # Leaf size for KDTree or BallTree algorithms
2526
- "p": [
2527
- 1,
2528
- 2,
2529
- ], # Power parameter for the Minkowski metric (1 = Manhattan, 2 = Euclidean)
2530
- }
2531
- ),
2532
- "Naive Bayes": {
2533
- "var_smoothing": [1e-9, 1e-8, 1e-7],
2534
- },
2535
- "SVR": {
2536
- "C": [0.1, 1, 10, 100],
2537
- "gamma": ["scale", "auto"],
2538
- "kernel": ["rbf", "linear"],
2539
- },
2540
- "Linear Regression": {
2541
- "fit_intercept": [True, False],
2542
- },
2543
- "Lasso": {
2544
- "alpha": [0.1, 1.0, 10.0],
2545
- "max_iter": [1000, 2000], # Sufficient iterations for convergence
2546
- },
2547
- "Extra Trees": {
2548
- "n_estimators": [100, 200, 500],
2549
- "max_depth": [None, 10, 20, 30],
2550
- "min_samples_split": [2, 5, 10],
2551
- "min_samples_leaf": [1, 2, 4],
2552
- },
2553
- "CatBoost": {
2554
- "iterations": [100, 200],
2555
- "learning_rate": [0.01, 0.1],
2556
- "depth": [3, 6, 10],
2557
- "l2_leaf_reg": [1, 3, 5, 7],
2558
- },
2559
- "LightGBM": {
2560
- "n_estimators": [100, 200, 500],
2561
- "learning_rate": [0.01, 0.1],
2562
- "num_leaves": [31, 50, 100],
2563
- "max_depth": [-1, 10, 20],
2564
- "min_data_in_leaf": [20], # Minimum samples in each leaf
2565
- "min_gain_to_split": [0.01], # Minimum gain to allow a split
2566
- "scale_pos_weight": [10], # Address class imbalance
2567
- },
2568
- "Bagging": {
2569
- "n_estimators": [10, 50, 100],
2570
- "max_samples": [0.5, 0.7, 1.0],
2571
- "max_features": [0.5, 0.7, 1.0],
2572
- },
2573
- "Neural Network": {
2574
- "hidden_layer_sizes": [(50,), (100,), (100, 50)],
2575
- "activation": ["relu", "tanh"],
2576
- "solver": ["adam", "sgd"],
2577
- "alpha": [0.0001, 0.001],
2578
- },
2579
- "Decision Tree": {
2580
- "max_depth": [None, 10, 20],
2581
- "min_samples_split": [2, 10],
2582
- "min_samples_leaf": [1, 4],
2583
- "criterion": ["gini", "entropy"],
2584
- },
2585
- "AdaBoost": {
2586
- "n_estimators": [50, 100],
2587
- "learning_rate": [0.5, 1.0],
2588
- },
2589
- "Linear Discriminant Analysis": {
2590
- "solver": ["svd", "lsqr", "eigen"],
2591
- "shrinkage": [None, "auto"],
2592
- },
2593
- "Quadratic Discriminant Analysis": {
2594
- "reg_param": [0.0, 0.1, 0.5, 1.0], # Regularization parameter
2595
- "priors": [None, [0.5, 0.5], [0.3, 0.7]], # Class priors
2596
- "tol": [
2597
- 1e-4,
2598
- 1e-3,
2599
- 1e-2,
2600
- ], # Tolerance value for the convergence of the algorithm
2601
- },
2602
- "Perceptron": {
2603
- "alpha": [1e-4, 1e-3, 1e-2], # Regularization parameter
2604
- "penalty": ["l2", "l1", "elasticnet"], # Regularization penalty
2605
- "max_iter": [1000, 2000], # Maximum number of iterations
2606
- "eta0": [1.0, 0.1], # Learning rate for gradient descent
2607
- "tol": [1e-3, 1e-4, 1e-5], # Tolerance for stopping criteria
2608
- "random_state": [random_state], # Random state for reproducibility
2609
- },
2610
- "Bernoulli Naive Bayes": {
2611
- "alpha": [0.1, 1.0, 10.0], # Additive (Laplace) smoothing parameter
2612
- "binarize": [
2613
- 0.0,
2614
- 0.5,
2615
- 1.0,
2616
- ], # Threshold for binarizing the input features
2617
- "fit_prior": [
2618
- True,
2619
- False,
2620
- ], # Whether to learn class prior probabilities
2621
- },
2622
- "SGDClassifier": {
2623
- "eta0": [0.01, 0.1, 1.0],
2624
- "loss": [
2625
- "hinge",
2626
- "log",
2627
- "modified_huber",
2628
- "squared_hinge",
2629
- "perceptron",
2630
- ], # Loss function
2631
- "penalty": ["l2", "l1", "elasticnet"], # Regularization penalty
2632
- "alpha": [1e-4, 1e-3, 1e-2], # Regularization strength
2633
- "l1_ratio": [0.15, 0.5, 0.85], # L1 ratio for elasticnet penalty
2634
- "max_iter": [1000, 2000], # Maximum number of iterations
2635
- "tol": [1e-3, 1e-4], # Tolerance for stopping criteria
2636
- "random_state": [random_state], # Random state for reproducibility
2637
- "learning_rate": [
2638
- "constant",
2639
- "optimal",
2640
- "invscaling",
2641
- "adaptive",
2642
- ], # Learning rate schedule
2643
- },
2644
- "Ridge": (
2645
- {"class_weight": [None, "balanced"]}
2646
- if purpose == "classification"
2647
- else {
2648
- "alpha": [0.1, 1, 10, 100],
2649
- "solver": [
2650
- "auto",
2651
- "svd",
2652
- "cholesky",
2653
- "lsqr",
2654
- ], # Solver for optimization
2655
- }
2656
- ),
2657
- }
2658
-
2659
- results = {}
2660
- # Use StratifiedKFold for classification and KFold for regression
2661
- cv = (
2662
- StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_state)
2663
- if purpose == "classification"
2664
- else KFold(n_splits=cv_folds, shuffle=True, random_state=random_state)
2665
- )
2666
-
2667
- # Train and validate each model
2668
- for name, clf in tqdm(
2669
- models.items(),
2670
- desc="models",
2671
- colour="green",
2672
- bar_format="{l_bar}{bar} {n_fmt}/{total_fmt}",
2673
- ):
2674
- if verbose:
2675
- print(f"\nTraining and validating {name}:")
2676
-
2677
- # Grid search with KFold or StratifiedKFold
2678
- gs = GridSearchCV(
2679
- clf,
2680
- param_grid=param_grids.get(name, {}),
2681
- scoring=(
2682
- "roc_auc" if purpose == "classification" else "neg_mean_squared_error"
2683
- ),
2684
- cv=cv,
2685
- n_jobs=n_jobs,
2686
- verbose=verbose,
2687
- )
2688
- gs.fit(x_train, y_train)
2689
- best_clf = gs.best_estimator_
2690
- # make sure x_train and x_test has the same name
2691
- x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
2692
- y_pred = best_clf.predict(x_true)
2693
-
2694
- # y_pred_proba
2695
- if hasattr(best_clf, "predict_proba"):
2696
- y_pred_proba = best_clf.predict_proba(x_true)[:, 1]
2697
- elif hasattr(best_clf, "decision_function"):
2698
- # If predict_proba is not available, use decision_function (e.g., for SVM)
2699
- y_pred_proba = best_clf.decision_function(x_true)
2700
- # Ensure y_pred_proba is within 0 and 1 bounds
2701
- y_pred_proba = (y_pred_proba - y_pred_proba.min()) / (
2702
- y_pred_proba.max() - y_pred_proba.min()
2703
- )
2704
- else:
2705
- y_pred_proba = None # No probability output for certain models
2706
-
2707
- validation_scores = {}
2708
- if y_true is not None:
2709
- validation_scores = cal_metrics(
2710
- y_true,
2711
- y_pred,
2712
- y_pred_proba=y_pred_proba,
2713
- is_binary=is_binary,
2714
- purpose=purpose,
2715
- average="weighted",
2716
- )
2717
-
2718
- # Calculate ROC curve
2719
- # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
2720
- if y_pred_proba is not None:
2721
- # fpr, tpr, roc_auc = dict(), dict(), dict()
2722
- fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
2723
- lower_ci, upper_ci = cal_auc_ci(y_true, y_pred_proba, verbose=False)
2724
- roc_auc = auc(fpr, tpr)
2725
- roc_info = {
2726
- "fpr": fpr.tolist(),
2727
- "tpr": tpr.tolist(),
2728
- "auc": roc_auc,
2729
- "ci95": (lower_ci, upper_ci),
2730
- }
2731
- # precision-recall curve
2732
- precision_, recall_, _ = precision_recall_curve(y_true, y_pred_proba)
2733
- avg_precision_ = average_precision_score(y_true, y_pred_proba)
2734
- pr_info = {
2735
- "precision": precision_,
2736
- "recall": recall_,
2737
- "avg_precision": avg_precision_,
2738
- }
2739
- else:
2740
- roc_info, pr_info = None, None
2741
- if purpose == "classification":
2742
- results[name] = {
2743
- "best_clf": gs.best_estimator_,
2744
- "best_params": gs.best_params_,
2745
- "auc_indiv": [
2746
- gs.cv_results_[f"split{i}_test_score"][gs.best_index_]
2747
- for i in range(cv_folds)
2748
- ],
2749
- "scores": validation_scores,
2750
- "roc_curve": roc_info,
2751
- "pr_curve": pr_info,
2752
- "confusion_matrix": confusion_matrix(y_true, y_pred),
2753
- "predictions": y_pred.tolist(),
2754
- "predictions_proba": (
2755
- y_pred_proba.tolist() if y_pred_proba is not None else None
2756
- ),
2757
- }
2758
- else: # "regression"
2759
- results[name] = {
2760
- "best_clf": gs.best_estimator_,
2761
- "best_params": gs.best_params_,
2762
- "scores": validation_scores, # e.g., neg_MSE, R², etc.
2763
- "predictions": y_pred.tolist(),
2764
- "predictions_proba": (
2765
- y_pred_proba.tolist() if y_pred_proba is not None else None
2766
- ),
2767
- }
2768
-
2769
- else:
2770
- results[name] = {
2771
- "best_clf": gs.best_estimator_,
2772
- "best_params": gs.best_params_,
2773
- "scores": validation_scores,
2774
- "predictions": y_pred.tolist(),
2775
- "predictions_proba": (
2776
- y_pred_proba.tolist() if y_pred_proba is not None else None
2777
- ),
2778
- }
2779
-
2780
- # Convert results to DataFrame
2781
- df_results = pd.DataFrame.from_dict(results, orient="index")
2782
-
2783
- # sort
2784
- if y_true is not None and purpose == "classification":
2785
- df_scores = pd.DataFrame(
2786
- df_results["scores"].tolist(), index=df_results["scores"].index
2787
- ).sort_values(by="roc_auc", ascending=False)
2788
- df_results = df_results.loc[df_scores.index]
2789
-
2790
- if plot_:
2791
- from datetime import datetime
2792
-
2793
- now_ = datetime.now().strftime("%y%m%d_%H%M%S")
2794
- nexttile = plot.subplot(figsize=[12, 10])
2795
- plot.heatmap(df_scores, kind="direct", ax=nexttile())
2796
- plot.figsets(xangle=30)
2797
- if dir_save:
2798
- ips.figsave(dir_save + f"scores_sorted_heatmap{now_}.pdf")
2799
- if df_scores.shape[0] > 1: # draw cluster
2800
- plot.heatmap(df_scores, kind="direct", cluster=True)
2801
- plot.figsets(xangle=30)
2802
- if dir_save:
2803
- ips.figsave(dir_save + f"scores_clus{now_}.pdf")
2804
- if all([plot_, y_true is not None, purpose == "classification"]):
2805
- try:
2806
- if len(models) > 3:
2807
- plot_validate_features(df_results)
2808
- else:
2809
- plot_validate_features_single(df_results, figsize=(12, 4 * len(models)))
2810
- if dir_save:
2811
- ips.figsave(dir_save + f"validate_features{now_}.pdf")
2812
- except Exception as e:
2813
- print(f"Error: 在画图的过程中出现了问题:{e}")
2814
- return df_results
2815
-
2816
-
2817
- def cal_metrics(
2818
- y_true, y_pred, y_pred_proba=None, is_binary=True,purpose="regression", average="weighted"
2819
- ):
2820
- """
2821
- Calculate regression or classification metrics based on the purpose.
2822
-
2823
- Parameters:
2824
- - y_true: Array of true values.
2825
- - y_pred: Array of predicted labels for classification or predicted values for regression.
2826
- - y_pred_proba: Array of predicted probabilities for classification (optional).
2827
- - purpose: str, "regression" or "classification".
2828
- - average: str, averaging method for multi-class classification ("binary", "micro", "macro", "weighted", etc.).
2829
-
2830
- Returns:
2831
- - validation_scores: dict of computed metrics.
2832
- """
2833
- from sklearn.metrics import (
2834
- mean_squared_error,
2835
- mean_absolute_error,
2836
- mean_absolute_percentage_error,
2837
- explained_variance_score,
2838
- r2_score,
2839
- mean_squared_log_error,
2840
- accuracy_score,
2841
- precision_score,
2842
- recall_score,
2843
- f1_score,
2844
- roc_auc_score,
2845
- matthews_corrcoef,
2846
- confusion_matrix,
2847
- balanced_accuracy_score,
2848
- average_precision_score,
2849
- precision_recall_curve,
2850
- )
2851
-
2852
- validation_scores = {}
2853
-
2854
- if purpose == "regression":
2855
- y_true = np.asarray(y_true)
2856
- y_true = y_true.ravel()
2857
- y_pred = np.asarray(y_pred)
2858
- y_pred = y_pred.ravel()
2859
- # Regression metrics
2860
- validation_scores = {
2861
- "mse": mean_squared_error(y_true, y_pred),
2862
- "rmse": np.sqrt(mean_squared_error(y_true, y_pred)),
2863
- "mae": mean_absolute_error(y_true, y_pred),
2864
- "r2": r2_score(y_true, y_pred),
2865
- "mape": mean_absolute_percentage_error(y_true, y_pred),
2866
- "explained_variance": explained_variance_score(y_true, y_pred),
2867
- "mbd": np.mean(y_pred - y_true), # Mean Bias Deviation
2868
- }
2869
- # Check if MSLE can be calculated
2870
- if np.all(y_true >= 0) and np.all(y_pred >= 0): # Ensure no negative values
2871
- validation_scores["msle"] = mean_squared_log_error(y_true, y_pred)
2872
- else:
2873
- validation_scores["msle"] = "Cannot be calculated due to negative values"
2874
-
2875
- elif purpose == "classification":
2876
- # Classification metrics
2877
- validation_scores = {
2878
- "accuracy": accuracy_score(y_true, y_pred),
2879
- "precision": precision_score(y_true, y_pred, average=average),
2880
- "recall": recall_score(y_true, y_pred, average=average),
2881
- "f1": f1_score(y_true, y_pred, average=average),
2882
- "mcc": matthews_corrcoef(y_true, y_pred),
2883
- "specificity": None,
2884
- "balanced_accuracy": balanced_accuracy_score(y_true, y_pred),
2885
- }
2886
-
2887
- # Confusion matrix to calculate specificity
2888
- if is_binary:
2889
- tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
2890
- else:
2891
- cm=onfusion_matrix(y_true, y_pred)
2892
- validation_scores["specificity"] = (
2893
- tn / (tn + fp) if (tn + fp) > 0 else 0
2894
- ) # Specificity calculation
2895
-
2896
- if y_pred_proba is not None:
2897
- # Calculate ROC-AUC
2898
- validation_scores["roc_auc"] = roc_auc_score(y_true, y_pred_proba)
2899
- # PR-AUC (Precision-Recall AUC) calculation
2900
- validation_scores["pr_auc"] = average_precision_score(y_true, y_pred_proba)
2901
- else:
2902
- raise ValueError(
2903
- "Invalid purpose specified. Choose 'regression' or 'classification'."
2904
- )
2905
-
2906
- return validation_scores