py2ls 0.2.4.3__py3-none-any.whl → 0.2.4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/ml2ls.py ADDED
@@ -0,0 +1,1094 @@
1
+ from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier,BaggingClassifier
2
+ from sklearn.svm import SVC
3
+ from sklearn.calibration import CalibratedClassifierCV
4
+ from sklearn.model_selection import GridSearchCV,StratifiedKFold
5
+ from sklearn.linear_model import LassoCV, LogisticRegression, Lasso, Ridge,RidgeClassifierCV, ElasticNet
6
+ from sklearn.feature_selection import RFE
7
+ from sklearn.naive_bayes import GaussianNB
8
+ from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
9
+ import xgboost as xgb # Make sure you have xgboost installed
10
+
11
+ from sklearn.model_selection import train_test_split, cross_val_score
12
+ from sklearn.metrics import (accuracy_score, precision_score, recall_score,
13
+ f1_score, roc_auc_score, confusion_matrix,
14
+ matthews_corrcoef,roc_curve,auc,
15
+ balanced_accuracy_score,precision_recall_curve,average_precision_score)
16
+ from imblearn.over_sampling import SMOTE
17
+ from sklearn.pipeline import Pipeline
18
+ from collections import defaultdict
19
+ from sklearn.preprocessing import StandardScaler
20
+ from typing import Dict, Any, Optional,List
21
+ import numpy as np
22
+ import pandas as pd
23
+ from . import ips
24
+ from . import plot
25
+ import matplotlib.pyplot as plt
26
+ import seaborn as sns
27
+ plt.style.use("paper")
28
+ import logging
29
+ import warnings
30
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
31
+ logger = logging.getLogger()
32
+
33
+ # Ignore specific warnings (UserWarning in this case)
34
+ warnings.filterwarnings("ignore", category=UserWarning)
35
+ from sklearn.tree import DecisionTreeClassifier
36
+ from sklearn.neighbors import KNeighborsClassifier
+ from sklearn.inspection import permutation_importance  # used by features_knn and features_bagging
37
+
38
+ def features_knn(X_train: pd.DataFrame, y_train: pd.Series, knn_params: dict) -> pd.DataFrame:
39
+ """
40
+ KNN (K-Nearest Neighbors): a distance-based classifier that assigns labels based on the majority label of the nearest neighbors.
41
+ When to use:
42
+ Effective for small to medium datasets with a low number of features.
43
+ KNN does not provide feature importances directly; they can be approximated via permutation importance or similar methods.
44
+ Recommended use: datasets with low feature dimensionality and well-separated clusters.
45
+
46
+ Fits KNeighborsClassifier and ranks features by permutation importance.
47
+ """
48
+ knn = KNeighborsClassifier(**knn_params)
49
+ knn.fit(X_train, y_train)
50
+ importances = permutation_importance(knn, X_train, y_train, n_repeats=30, random_state=1, scoring="accuracy")
51
+ return pd.DataFrame({"feature": X_train.columns, "importance": importances.importances_mean}).sort_values(by="importance", ascending=False)
52
+
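+ #* usage sketch (illustrative): permutation-based feature ranking with features_knn.
+ #  Assumes a feature DataFrame `X` and a label Series `y`; the knn params mirror the knn_defaults used in get_features().
+ # X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=1)
+ # knn_rank = features_knn(X_tr, y_tr, {"n_neighbors": 5})
+ # print(knn_rank.head(10))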
53
+ #! 1. Linear and Regularized Regression Methods
54
+ # 1.1 Lasso
55
+ def features_lasso(X_train: pd.DataFrame, y_train: pd.Series, lasso_params: dict) -> pd.DataFrame:
56
+ """
57
+ Lasso (Least Absolute Shrinkage and Selection Operator):
58
+ A regularized linear regression method that uses L1 penalty to shrink coefficients, effectively
59
+ performing feature selection by zeroing out less important ones.
60
+ """
61
+ lasso = LassoCV(**lasso_params)
62
+ lasso.fit(X_train, y_train)
63
+ # Get non-zero coefficients and their corresponding features
64
+ coefficients = lasso.coef_
65
+ importance_df = pd.DataFrame({
66
+ "feature": X_train.columns,
67
+ "importance": np.abs(coefficients)
68
+ })
69
+ return importance_df[importance_df["importance"] > 0].sort_values(by="importance", ascending=False)
70
+
71
+ # 1.2 Ridge regression
72
+ def features_ridge(X_train: pd.DataFrame, y_train: pd.Series, ridge_params: dict) -> pd.DataFrame:
73
+ """
74
+ Ridge Regression: A linear regression technique that applies L2 regularization, reducing coefficient
75
+ magnitudes to avoid overfitting, especially with multicollinearity among features.
76
+ """
77
+ from sklearn.linear_model import RidgeCV
78
+ ridge = RidgeCV(**ridge_params)
79
+ ridge.fit(X_train, y_train)
80
+
81
+ # Get the coefficients
82
+ coefficients = ridge.coef_
83
+
84
+ # Create a DataFrame to hold feature importance
85
+ importance_df = pd.DataFrame({
86
+ "feature": X_train.columns,
87
+ "importance": np.abs(coefficients)
88
+ })
89
+ return importance_df[importance_df["importance"] > 0].sort_values(by="importance", ascending=False)
90
+
91
+ # 1.3 Elastic Net(Enet)
92
+ def features_enet(X_train: pd.DataFrame, y_train: pd.Series, enet_params: dict) -> pd.DataFrame:
93
+ """
94
+ Elastic Net (Enet): Combines L1 and L2 penalties (lasso and ridge) in a linear model, beneficial
95
+ when features are highly correlated or for datasets with more features than samples.
96
+ """
97
+ from sklearn.linear_model import ElasticNetCV
98
+ enet = ElasticNetCV(**enet_params)
99
+ enet.fit(X_train, y_train)
100
+ # Get the coefficients
101
+ coefficients = enet.coef_
102
+ # Create a DataFrame to hold feature importance
103
+ importance_df = pd.DataFrame({
104
+ "feature": X_train.columns,
105
+ "importance": np.abs(coefficients)
106
+ })
107
+ return importance_df[importance_df["importance"] > 0].sort_values(by="importance", ascending=False)
108
+ # 1.4 Partial Least Squares Regression for Generalized Linear Models (plsRglm): Combines regression and
109
+ # feature reduction, useful for high-dimensional data with correlated features, such as genomics.
110
+
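+ #* usage sketch (illustrative): running the three regularized selectors (1.1-1.3) with the same alpha grid
+ #  that get_features() uses by default; assumes a standardized feature DataFrame X_train and a numeric y_train.
+ # params = {"alphas": np.logspace(-4, 4, 100), "cv": 10}
+ # lasso_imp = features_lasso(X_train, y_train, params)
+ # ridge_imp = features_ridge(X_train, y_train, params)
+ # enet_imp = features_enet(X_train, y_train, params)
+ # print(lasso_imp.head(), ridge_imp.head(), enet_imp.head(), sep="\n")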
111
+ #! 2.Generalized Linear Models and Extensions
112
+ # 2.1
113
+
114
+ #!3.Tree-Based and Ensemble Methods
115
+ # 3.1 Random Forest(RF)
116
+ def features_rf(X_train: pd.DataFrame, y_train: pd.Series, rf_params: dict) -> pd.DataFrame:
117
+ """
118
+ An ensemble of decision trees that combines predictions from multiple trees for classification or
119
+ regression, effective with high-dimensional, complex datasets.
120
+ when to use:
121
+ Handles high-dimensional data well.
122
+ Robust to overfitting due to averaging of multiple trees.
123
+ Provides feature importance, which can help in understanding the influence of different genes.
124
+ Fit Random Forest and return sorted feature importances.
125
+ Recommended Use: Great for classification problems, especially when you have many features (genes).
126
+ """
127
+ rf = RandomForestClassifier(**rf_params)
128
+ rf.fit(X_train, y_train)
129
+ return pd.DataFrame({"feature": X_train.columns, "importance": rf.featuress_}).sort_values(by="importance", ascending=False)
130
+ # 3.2 Gradient Boosting Trees
131
+ def features_gradient_boosting(X_train: pd.DataFrame, y_train: pd.Series, gb_params: dict) -> pd.DataFrame:
132
+ """
133
+ An ensemble of decision trees that combines predictions from multiple trees for classification or regression, effective with
134
+ high-dimensional, complex datasets.
135
+ Gradient Boosting
136
+ Strengths:
137
+ High predictive accuracy and works well for both classification and regression.
138
+ Can handle a mixture of numerical and categorical features.
139
+ Recommended Use:
140
+ Effective for complex relationships and when you need a powerful predictive model.
141
+ Fit Gradient Boosting classifier and return sorted feature importances.
142
+ Recommended Use: Effective for complex datasets with many features (genes).
143
+ """
144
+ gb = GradientBoostingClassifier(**gb_params)
145
+ gb.fit(X_train, y_train)
146
+ return pd.DataFrame({"feature": X_train.columns, "importance": gb.feature_importances_}).sort_values(by="importance", ascending=False)
147
+ # 3.3 XGBoost
148
+ def features_xgb(X_train: pd.DataFrame, y_train: pd.Series, xgb_params: dict) -> pd.DataFrame:
149
+ """
150
+ XGBoost: An advanced gradient boosting technique, faster and more efficient than GBM, with excellent predictive performance on structured data.
151
+ """
152
+ import xgboost as xgb
153
+ xgb_model = xgb.XGBClassifier(**xgb_params)
154
+ xgb_model.fit(X_train, y_train)
155
+ return pd.DataFrame({"feature": X_train.columns, "importance": xgb_model.feature_importances_}).sort_values(by="importance", ascending=False)
156
+ # 3.4.decision tree
157
+ def features_decision_tree(X_train: pd.DataFrame, y_train: pd.Series, dt_params: dict) -> pd.DataFrame:
158
+ """
159
+ A single decision tree classifier effective for identifying key decision boundaries in data.
160
+ when to use:
161
+ Good for capturing non-linear patterns.
162
+ Provides feature importance scores for each feature, though it may overfit on small datasets.
163
+ Efficient for low to medium-sized datasets, where interpretability of decisions is key.
164
+ Recommended Use: Useful for interpretable feature importance analysis in smaller or balanced datasets.
165
+
166
+ Fits DecisionTreeClassifier and returns sorted feature importances.
167
+ """
168
+ dt = DecisionTreeClassifier(**dt_params)
169
+ dt.fit(X_train, y_train)
170
+ return pd.DataFrame({"feature": X_train.columns, "importance": dt.feature_importances_}).sort_values(by="importance", ascending=False)
171
+ # 3.5 bagging
172
+ def features_bagging(X_train: pd.DataFrame, y_train: pd.Series, bagging_params: dict) -> pd.DataFrame:
173
+ """
174
+ A bagging ensemble of classifiers, often used with weak learners like decision trees, to reduce variance.
175
+ when to use:
176
+ Helps reduce overfitting, especially on high-variance models.
177
+ Effective when the dataset has numerous features and may benefit from ensemble stability.
178
+ Recommended Use: Beneficial for high-dimensional or noisy datasets needing ensemble stability.
179
+
180
+ Fits BaggingClassifier and returns averaged feature importances from underlying estimators if available.
181
+ """
182
+ bagging = BaggingClassifier(**bagging_params)
183
+ bagging.fit(X_train, y_train)
184
+
185
+ # Calculate feature importance by averaging importances across estimators, if feature_importances_ is available.
186
+ if hasattr(bagging.estimators_[0], "feature_importances_"):
187
+ importances = np.mean([estimator.feature_importances_ for estimator in bagging.estimators_], axis=0)
188
+ return pd.DataFrame({"feature": X_train.columns, "importance": importances}).sort_values(by="importance", ascending=False)
189
+ else:
190
+ # If the base estimator does not support feature importances, fallback to permutation importance.
191
+ importances = permutation_importance(bagging, X_train, y_train, n_repeats=30, random_state=1, scoring="accuracy")
192
+ return pd.DataFrame({"feature": X_train.columns, "importance": importances.importances_mean}).sort_values(by="importance", ascending=False)
193
+
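+ #* usage sketch (illustrative): comparing the rankings produced by the tree-based selectors above.
+ #  Assumes a DataFrame X_train / Series y_train; the parameter dicts mirror the defaults used in get_features().
+ # rf_imp = features_rf(X_train, y_train, {"n_estimators": 100, "random_state": 1})
+ # gb_imp = features_gradient_boosting(X_train, y_train, {"n_estimators": 100, "random_state": 1})
+ # xgb_imp = features_xgb(X_train, y_train, {"n_estimators": 100, "eval_metric": "logloss", "random_state": 1})
+ # top10 = {name: df.head(10)["feature"].tolist()
+ #          for name, df in {"rf": rf_imp, "gb": gb_imp, "xgb": xgb_imp}.items()}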
194
+ #! 4.Support Vector Machines
195
+ def features_svm(X_train: pd.DataFrame, y_train: pd.Series, rfe_params: dict) -> np.ndarray:
196
+ """
197
+ Suitable for classification tasks where the number of features is much larger than the number of samples.
198
+ 1. Effective in high-dimensional spaces and with clear margin of separation.
199
+ 2. Works well for both linear and non-linear classification (using kernel functions).
200
+ Select features using RFE with SVM.When combined with SVM, RFE selects features that are most critical for the decision boundary,
201
+ helping reduce the dataset to a more manageable size without losing much predictive power.
202
+ SVM (Support Vector Machines),supports various kernels (linear, rbf, poly, and sigmoid), is good at handling high-dimensional
203
+ data and finding an optimal decision boundary between classes, especially when using the right kernel.
204
+ kernel: ["linear", "rbf", "poly", "sigmoid"]
205
+ 'linear': simplest kernel that attempts to separate data by drawing a straight line (or hyperplane) between classes. It is effective
206
+ when the data is linearly separable, meaning the classes can be well divided by a straight boundary.
207
+ Advantages:
208
+ - Computationally efficient for large datasets.
209
+ - Works well when the number of features is high, which is common in genomic data where you may have thousands of genes
210
+ as features.
211
+ 'rbf': a nonlinear kernel that maps the input data into a higher-dimensional space to find a decision boundary. It works well for
212
+ data that is not linearly separable in its original space.
213
+ Advantages:
214
+ - Handles nonlinear relationships between features and classes
215
+ - Often better than a linear kernel when there is no clear linear decision boundary in the data.
216
+ 'poly': Polynomial Kernel: computes similarity between data points based on polynomial functions of the input features. It can model
217
+ interactions between features to a certain degree, depending on the polynomial degree chosen.
218
+ Advantages:
219
+ - Allows modeling of feature interactions.
220
+ - Can fit more complex relationships compared to linear models.
221
+ 'sigmoid': similar to the activation function in neural networks, and it works well when the data follows an S-shaped decision boundary.
222
+ Advantages:
223
+ - Can approximate the behavior of neural networks.
224
+ - Use case: It’s not as widely used as the RBF or linear kernel but can be explored when there is some evidence of non-linear
225
+ S-shaped relationships.
226
+ """
227
+ # SVM (Support Vector Machines)
228
+ svc = SVC(kernel=rfe_params["kernel"]) # ["linear", "rbf", "poly", "sigmoid"]
229
+ # RFE(Recursive Feature Elimination)
230
+ selector = RFE(svc, n_features_to_select=rfe_params["n_features_to_select"])
231
+ selector.fit(X_train, y_train)
232
+ return X_train.columns[selector.support_]
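+ #* usage sketch (illustrative): RFE with a linear-kernel SVM, mirroring the rfe_defaults in get_features().
+ #  A linear kernel is the safe choice here because RFE needs coef_ (or feature_importances_) from the estimator.
+ # svm_features = features_svm(X_train, y_train, {"kernel": "linear", "n_features_to_select": 10})
+ # print(list(svm_features))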
233
+ #! 5.Bayesian and Probabilistic Methods
234
+ def features_naive_bayes(X_train: pd.DataFrame, y_train: pd.Series) -> list:
235
+ """
236
+ Naive Bayes: A probabilistic classifier based on Bayes' theorem, assuming independence between features, simple and fast, especially
237
+ effective for text classification and other high-dimensional data.
238
+ """
239
+ from sklearn.naive_bayes import GaussianNB
240
+ nb = GaussianNB()
241
+ nb.fit(X_train, y_train)
242
+ probabilities = nb.predict_proba(X_train)
243
+ return X_train.columns[np.argsort(probabilities.max(axis=1))[:X_train.shape[1] // 2]]
244
+ #! 6.Linear Discriminant Analysis (LDA)
245
+ def features_lda(X_train: pd.DataFrame, y_train: pd.Series) -> pd.DataFrame:
246
+ """
247
+ Linear Discriminant Analysis (LDA): Projects data onto a lower-dimensional space to maximize class separability, often used as a dimensionality
248
+ reduction technique before classification on high-dimensional data.
249
+ """
250
+ from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
251
+ lda = LinearDiscriminantAnalysis()
252
+ lda.fit(X_train, y_train)
253
+ coef = lda.coef_.flatten()
254
+ # Create a DataFrame to hold feature importance
255
+ importance_df = pd.DataFrame({
256
+ "feature": X_train.columns,
257
+ "importance": np.abs(coef)
258
+ })
259
+
260
+ return importance_df[importance_df["importance"] > 0].sort_values(by="importance", ascending=False)
261
+
262
+ def features_adaboost(X_train: pd.DataFrame, y_train: pd.Series, adaboost_params: dict) -> pd.DataFrame:
263
+ """
264
+ AdaBoost
265
+ Strengths:
266
+ Combines multiple weak learners to create a strong classifier.
267
+ Focuses on examples that are hard to classify, improving overall performance.
268
+ Recommended Use:
269
+ Can be effective for boosting weak classifiers in a genomics context.
270
+ Fit AdaBoost classifier and return sorted feature importances.
271
+ Recommended Use: Great for classification problems with a large number of features (genes).
272
+ """
273
+ ada = AdaBoostClassifier(**adaboost_params)
274
+ ada.fit(X_train, y_train)
275
+ return pd.DataFrame({"feature": X_train.columns, "importance": ada.feature_importances_}).sort_values(by="importance", ascending=False)
276
+
277
+ import torch
278
+ import torch.nn as nn
279
+ import torch.optim as optim
280
+ from torch.utils.data import DataLoader, TensorDataset
281
+ from skorch import NeuralNetClassifier # sklearn compatible
282
+
283
+ class DNNClassifier(nn.Module):
284
+ def __init__(self, input_dim, hidden_dim=128, output_dim=2, dropout_rate=0.5):
285
+ super(DNNClassifier, self).__init__()
286
+
287
+ self.hidden_layer1 = nn.Sequential(
288
+ nn.Linear(input_dim, hidden_dim),
289
+ nn.ReLU(),
290
+ nn.Dropout(dropout_rate),
291
+ nn.Linear(hidden_dim, hidden_dim),
292
+ nn.ReLU()
293
+ )
294
+
295
+ self.hidden_layer2 = nn.Sequential(
296
+ nn.Linear(hidden_dim, hidden_dim),
297
+ nn.ReLU(),
298
+ nn.Dropout(dropout_rate)
299
+ )
300
+
301
+ # Adding a residual connection between hidden layers
302
+ self.residual = nn.Linear(input_dim, hidden_dim)
303
+
304
+ self.output_layer = nn.Sequential(
305
+ nn.Linear(hidden_dim, output_dim),
306
+ nn.Softmax(dim=1)
307
+ )
308
+
309
+ def forward(self, x):
310
+ residual = self.residual(x)
311
+ x = self.hidden_layer1(x)
312
+ x = x + residual # Residual connection
313
+ x = self.hidden_layer2(x)
314
+ x = self.output_layer(x)
315
+ return x
316
+
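+ #* usage sketch (illustrative): wrapping DNNClassifier with skorch so it behaves like an sklearn estimator.
+ #  Assumes float32 features and int64 labels. Note that the module ends in Softmax while skorch's default
+ #  NLLLoss expects log-probabilities, so swapping nn.Softmax for nn.LogSoftmax (or returning raw logits and
+ #  setting criterion=nn.CrossEntropyLoss) is advisable before serious use.
+ # net = NeuralNetClassifier(
+ #     DNNClassifier,
+ #     module__input_dim=X_train.shape[1],
+ #     module__hidden_dim=128,
+ #     module__output_dim=2,
+ #     max_epochs=20,
+ #     lr=1e-3,
+ #     iterator_train__shuffle=True,
+ # )
+ # net.fit(X_train.to_numpy(dtype=np.float32), y_train.to_numpy(dtype=np.int64))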
317
+ def validate_classifier(clf, X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.DataFrame, y_test: pd.Series, metrics: list=["accuracy", "precision", "recall", "f1", "roc_auc"] , cv_folds: int=5) -> dict:
318
+ """
319
+ Perform cross-validation for a given classifier and return average scores for specified metrics on training data.
320
+ Then fit the best model on the full training data and evaluate it on the test set.
321
+
322
+ Parameters:
323
+ - clf: The classifier to be validated.
324
+ - X_train: Training features.
325
+ - y_train: Training labels.
326
+ - X_test: Test features.
327
+ - y_test: Test labels.
328
+ - metrics: List of metrics to evaluate (e.g., ['accuracy', 'roc_auc']).
329
+ - cv_folds: Number of cross-validation folds.
330
+
331
+ Returns:
332
+ - results: Dictionary containing average cv_train_scores and cv_test_scores.
333
+ """
334
+ cv_train_scores = {metric: [] for metric in metrics}
335
+ skf = StratifiedKFold(n_splits=cv_folds)
336
+ # Perform cross-validation
337
+ for metric in metrics:
338
+ try:
339
+ if metric == "roc_auc" and len(set(y_train)) == 2:
340
+ scores = cross_val_score(clf, X_train, y_train, cv=skf, scoring="roc_auc")
341
+ cv_train_scores[metric] = np.nanmean(scores) if not np.isnan(scores).all() else float('nan')
342
+ else:
343
+ score = cross_val_score(clf, X_train, y_train, cv=skf, scoring=metric)
344
+ cv_train_scores[metric] = score.mean()
345
+ except Exception as e:
346
+ cv_train_scores[metric] = float('nan')
347
+ clf.fit(X_train, y_train)
348
+
349
+ # Evaluate on the test set
350
+ cv_test_scores = {}
351
+ for metric in metrics:
352
+ if metric == "roc_auc" and len(set(y_test)) == 2:
353
+ try:
354
+ y_prob=clf.predict_proba(X_test)[:, 1]
355
+ cv_test_scores[metric] = roc_auc_score(y_test,y_prob)
356
+ except AttributeError:
357
+ cv_test_scores[metric]=float('nan')
358
+ else:
359
+ score_func = globals().get(f'{metric}_score') # Fetching the appropriate scoring function
360
+ if score_func:
361
+ try:
362
+ y_pred = clf.predict(X_test)
363
+ cv_test_scores[metric] = score_func(y_test, y_pred)
364
+ except Exception as e:
365
+ cv_test_scores[metric] = float('nan')
366
+
367
+ # Combine results
368
+ results = {
369
+ 'cv_train_scores': cv_train_scores,
370
+ 'cv_test_scores': cv_test_scores
371
+ }
372
+ return results
373
+
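+ #* usage sketch (illustrative): cross-validating a single classifier on an existing train/test split.
+ #  Assumes X_tr/X_te DataFrames and y_tr/y_te Series from train_test_split().
+ # clf = RandomForestClassifier(n_estimators=100, random_state=1)
+ # scores = validate_classifier(clf, X_tr, y_tr, X_te, y_te, metrics=["accuracy", "f1", "roc_auc"], cv_folds=5)
+ # print(scores["cv_train_scores"])
+ # print(scores["cv_test_scores"])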
374
+ def get_classifiers(
375
+ random_state=1,
376
+ cls=[
377
+ "lasso",
378
+ "ridge",
379
+ "Elastic Net(Enet)",
380
+ "gradient Boosting",
381
+ "Random forest (rf)",
382
+ "XGBoost (xgb)",
383
+ "Support Vector Machine(svm)",
384
+ "naive bayes",
385
+ "Linear Discriminant Analysis (lda)",
386
+ "adaboost","DecisionTree","KNeighbors","Bagging"
387
+ ],
388
+ ):
389
+ from sklearn.ensemble import (
390
+ RandomForestClassifier,
391
+ GradientBoostingClassifier,
392
+ AdaBoostClassifier,
393
+ BaggingClassifier
394
+ )
395
+ from sklearn.svm import SVC
396
+ from sklearn.linear_model import LogisticRegression, Lasso, RidgeClassifierCV, ElasticNet
397
+ from sklearn.naive_bayes import GaussianNB
398
+ from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
399
+ import xgboost as xgb
400
+ from sklearn.tree import DecisionTreeClassifier
401
+ from sklearn.neighbors import KNeighborsClassifier
402
+ res_cls = {}
403
+ classifiers_all = {
404
+ "Lasso": LogisticRegression(penalty='l1', solver='saga', random_state=random_state),
405
+ "Ridge": RidgeClassifierCV(),
406
+ "Elastic Net (Enet)": ElasticNet(random_state=random_state),
407
+ "Gradient Boosting": GradientBoostingClassifier(random_state=random_state),
408
+ "Random Forest (RF)": RandomForestClassifier(random_state=random_state),
409
+ "XGBoost (XGB)": xgb.XGBClassifier(random_state=random_state),
410
+ "Support Vector Machine (SVM)": SVC(kernel="rbf", probability=True),
411
+ "Naive Bayes": GaussianNB(),
412
+ "Linear Discriminant Analysis (LDA)": LinearDiscriminantAnalysis(),
413
+ "AdaBoost": AdaBoostClassifier(random_state=random_state, algorithm="SAMME"),
414
+ "DecisionTree":DecisionTreeClassifier(),
415
+ "KNeighbors": KNeighborsClassifier(n_neighbors=5),
416
+ "Bagging": BaggingClassifier(),
417
+ }
418
+ print("Using default classifiers:")
419
+ for cls_name in cls:
420
+ cls_name = ips.strcmp(cls_name, list(classifiers_all.keys()))[0]
421
+ res_cls[cls_name] = classifiers_all[cls_name]
422
+ print(f"- {cls_name}")
423
+ return res_cls
424
+
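+ #* usage sketch (illustrative): requesting a subset of classifiers by (fuzzy-matched) name.
+ #  The names below come from the function's own default `cls` list.
+ # clfs = get_classifiers(random_state=1, cls=["Random forest (rf)", "Support Vector Machine(svm)", "naive bayes"])
+ # for name, model in clfs.items():
+ #     print(name, model)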
425
+ def get_features(
426
+ X: pd.DataFrame,
427
+ y: pd.Series,
428
+ test_size: float = 0.2,
429
+ random_state: int = 1,
430
+ n_features: int = 10,
431
+ rf_params: Optional[Dict] = None,
432
+ rfe_params: Optional[Dict] = None,
433
+ lasso_params: Optional[Dict] = None,
434
+ ridge_params: Optional[Dict] = None,
435
+ enet_params: Optional[Dict] = None,
436
+ gb_params: Optional[Dict] = None,
437
+ adaboost_params: Optional[Dict] = None,
438
+ xgb_params: Optional[Dict] = None,
439
+ dt_params: Optional[Dict] = None,
440
+ bagging_params: Optional[Dict] = None,
441
+ knn_params: Optional[Dict] = None,
442
+ cls: list=[
443
+ "lasso",
444
+ "ridge",
445
+ "Elastic Net(Enet)",
446
+ "gradient Boosting",
447
+ "Random forest (rf)",
448
+ "XGBoost (xgb)",
449
+ "Support Vector Machine(svm)",
450
+ "naive bayes",
451
+ "Linear Discriminant Analysis (lda)",
452
+ "adaboost","DecisionTree","KNeighbors","Bagging"
453
+ ],
454
+ metrics: Optional[List[str]] = None,
455
+ cv_folds: int = 5,
456
+ strict:bool=False,
457
+ n_shared: int = 2,  # a feature counts as "common" once at least n_shared selection methods agree on it
458
+ use_selected_features: bool = True,
459
+ ) -> dict:
460
+ """
461
+ Master function to perform feature selection and validate classifiers.
462
+ """
463
+ # Split data into training and test sets
464
+ X_train, X_test, y_train, y_test = train_test_split(
465
+ X, y, test_size=test_size, random_state=random_state
466
+ )
467
+ # Standardize features
468
+ scaler = StandardScaler()
469
+ X_train_scaled = scaler.fit_transform(X_train)
470
+ X_test_scaled = scaler.transform(X_test)
471
+
472
+ # Convert back to DataFrame for consistency
473
+ X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns)
474
+ X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns)
475
+
476
+ rf_defaults = {"n_estimators": 100, "random_state": random_state}
477
+ rfe_defaults = {"kernel": "linear", "n_features_to_select": n_features}
478
+ lasso_defaults = {"alphas": np.logspace(-4, 4, 100), "cv": 10}
479
+ ridge_defaults = {"alphas": np.logspace(-4, 4, 100), "cv": 10}
480
+ enet_defaults = {"alphas": np.logspace(-4, 4, 100), "cv": 10}
481
+ xgb_defaults = {"n_estimators": 100, "use_label_encoder": False, "eval_metric": "logloss", "random_state": random_state}
482
+ gb_defaults = {"n_estimators": 100, "random_state": random_state}
483
+ adaboost_defaults = {"n_estimators": 50, "random_state": random_state}
484
+ dt_defaults = {"max_depth": None, "random_state": random_state}
485
+ bagging_defaults = {"n_estimators": 50, "random_state": random_state}
486
+ knn_defaults = {"n_neighbors": 5}
487
+ rf_params, rfe_params = rf_params or rf_defaults, rfe_params or rfe_defaults
488
+ lasso_params, ridge_params = lasso_params or lasso_defaults, ridge_params or ridge_defaults
489
+ enet_params, xgb_params = enet_params or enet_defaults, xgb_params or xgb_defaults
490
+ gb_params, adaboost_params = gb_params or gb_defaults, adaboost_params or adaboost_defaults
491
+ dt_params = dt_params or dt_defaults
492
+ bagging_params = bagging_params or bagging_defaults
493
+ knn_params = knn_params or knn_defaults
494
+
495
+ cls_ = ["lasso",'ridge','Elastic Net(Enet)',"Gradient Boosting","Random Forest (rf)",
496
+ 'XGBoost (xgb)','Support Vector Machine(svm)','Naive Bayes','Linear Discriminant Analysis (lda)','adaboost']
497
+ cls=[ips.strcmp(i,cls_)[0] for i in cls]
498
+
499
+ # Lasso Feature Selection
500
+ lasso_importances = features_lasso(X_train, y_train, lasso_params) if 'lasso'in cls else pd.DataFrame()
501
+ lasso_selected_features= lasso_importances.head(n_features)["feature"].values if 'lasso'in cls else []
502
+ # Ridge
503
+ ridge_importances=features_ridge(X_train, y_train,ridge_params) if 'ridge'in cls else pd.DataFrame()
504
+ selected_ridge_features= ridge_importances.head(n_features)["feature"].values if 'ridge'in cls else []
505
+ # Elastic Net
506
+ enet_importances=features_enet(X_train, y_train,enet_params) if 'Enet'in cls else pd.DataFrame()
507
+ selected_enet_features= enet_importances.head(n_features)["feature"].values if 'Enet'in cls else []
508
+ # Random Forest Feature Importance
509
+ rf_importances = features_rf(X_train, y_train, rf_params) if 'Random Forest'in cls else pd.DataFrame()
510
+ top_rf_features = rf_importances.head(n_features)["feature"].values if 'Random Forest'in cls else []
511
+ # Gradient Boosting Feature Importance
512
+ gb_importances = features_gradient_boosting(X_train, y_train, gb_params) if 'Gradient Boosting'in cls else pd.DataFrame()
513
+ top_gb_features = gb_importances.head(n_features)["feature"].values if 'Gradient Boosting'in cls else []
514
+ # xgb
515
+ xgb_importances = features_xgb(X_train, y_train,xgb_params) if 'xgb'in cls else pd.DataFrame()
516
+ top_xgb_features = xgb_importances.head(n_features)["feature"].values if 'xgb'in cls else []
517
+
518
+ # SVM with RFE
519
+ selected_svm_features = features_svm(X_train, y_train, rfe_params) if 'svm'in cls else []
520
+ # Naive Bayes
521
+ selected_naive_bayes_features=features_naive_bayes(X_train, y_train) if 'Naive Bayes'in cls else []
522
+ # lda: linear discriminant analysis
523
+ lda_importances=features_lda(X_train, y_train) if 'lda'in cls else pd.DataFrame()
524
+ selected_lda_features= lda_importances.head(n_features)["feature"].values if 'lda'in cls else []
525
+ # AdaBoost Feature Importance
526
+ adaboost_importances = features_adaboost(X_train, y_train, adaboost_params) if 'AdaBoost'in cls else pd.DataFrame()
527
+ top_adaboost_features = adaboost_importances.head(n_features)["feature"].values if 'AdaBoost'in cls else []
528
+ # Decision Tree Feature Importance
529
+ dt_importances = features_decision_tree(X_train, y_train, dt_params) if 'Decision Tree' in cls else pd.DataFrame()
530
+ top_dt_features = dt_importances.head(n_features)["feature"].values if 'Decision Tree' in cls else []
531
+ # Bagging Feature Importance
532
+ bagging_importances = features_bagging(X_train, y_train, bagging_params) if 'Bagging' in cls else pd.DataFrame()
533
+ top_bagging_features = bagging_importances.head(n_features)["feature"].values if 'Bagging' in cls else []
534
+ # KNN Feature Importance via Permutation
535
+ knn_importances = features_knn(X_train, y_train, knn_params) if 'KNN' in cls else pd.DataFrame()
536
+ top_knn_features = knn_importances.head(n_features)["feature"].values if 'KNN' in cls else []
537
+
538
+ #! Find common features
539
+ common_features = ips.shared(lasso_selected_features,selected_ridge_features, selected_enet_features,
540
+ top_rf_features,top_gb_features,top_xgb_features,
541
+ selected_svm_features, selected_naive_bayes_features,selected_lda_features,
542
+ top_adaboost_features,top_dt_features, top_bagging_features, top_knn_features,
543
+ strict=strict,
544
+ n_shared=n_shared
545
+ )
546
+
547
+ # Use selected features or all features for model validation
548
+ X_train_selected = X_train[list(common_features)] if use_selected_features else X_train
549
+ X_test_selected = X_test[list(common_features)] if use_selected_features else X_test
550
+
551
+ if metrics is None:
552
+ metrics = ["accuracy", "precision", "recall", "f1", "roc_auc"]
553
+
554
+ # Prepare results DataFrame for selected features
555
+ features_df = pd.DataFrame({
556
+ 'type':
557
+ ['Lasso'] * len(lasso_selected_features)+
558
+ ['Ridge'] * len(selected_ridge_features)+
559
+ ['Random Forest'] * len(top_rf_features) +
560
+ ['Gradient Boosting'] * len(top_gb_features)+
561
+ ["Enet"]*len(selected_enet_features)+
562
+ ['xgb'] * len(top_xgb_features)+
563
+ ['SVM'] * len(selected_svm_features) +
564
+ ['Naive Bayes'] * len(selected_naive_bayes_features)+
565
+ ['Linear Discriminant Analysis'] * len(selected_lda_features)+
566
+ ['AdaBoost'] * len(top_adaboost_features)+
567
+ ['Decision Tree'] * len(top_dt_features) +
568
+ ['Bagging'] * len(top_bagging_features) +
569
+ ['KNN'] * len(top_knn_features),
570
+ 'feature': np.concatenate([lasso_selected_features,selected_ridge_features,
571
+ top_rf_features,top_gb_features,selected_enet_features,top_xgb_features,
572
+ selected_svm_features,selected_naive_bayes_features,
573
+ selected_lda_features,top_adaboost_features,top_dt_features,
574
+ top_bagging_features, top_knn_features
575
+ ])
576
+ })
577
+
578
+ #! Validate each trained classifier
579
+ classifiers=get_classifiers(random_state=random_state,cls=cls)
580
+ cv_train_results,cv_test_results = [],[]
581
+ for name, clf in classifiers.items():
582
+ if not X_train_selected.empty:
583
+ cv_scores=validate_classifier(clf,
584
+ X_train_selected,
585
+ y_train,
586
+ X_test_selected,
587
+ y_test,
588
+ metrics=metrics,
589
+ cv_folds=cv_folds)
590
+
591
+ cv_train_score_df = pd.DataFrame(cv_scores["cv_train_scores"], index=[name])
592
+ cv_test_score_df = pd.DataFrame(cv_scores["cv_test_scores"], index=[name])
593
+ cv_train_results.append(cv_train_score_df)
594
+ cv_test_results.append(cv_test_score_df)
595
+ if all([cv_train_results, cv_test_results]):
596
+ cv_train_results_df = pd.concat(cv_train_results).reset_index().rename(columns={'index': 'Classifier'})
597
+ cv_test_results_df = pd.concat(cv_test_results).reset_index().rename(columns={'index': 'Classifier'})
598
+ #! Store results in the main results dictionary
599
+ results = {
600
+ "selected_features": features_df,
601
+ "cv_train_scores": cv_train_results_df,
602
+ "cv_test_scores": cv_test_results_df,
603
+ "common_features": list(common_features),
604
+ }
605
+ else:
606
+ results = {
607
+ "selected_features": pd.DataFrame(),
608
+ "cv_train_scores": pd.DataFrame(),
609
+ "cv_test_scores": pd.DataFrame(),
610
+ "common_features": [],
611
+ }
612
+ print(f"Warning: 没有找到共同的genes, when n_shared={n_shared}")
613
+ return results
614
+ #! usage:
615
+ # # Get features and common features
616
+ # results = get_features(X, y)
617
+ # common_features = results["common_features"]
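+ # # A fuller call (illustrative; the selector subset and n_shared value are examples):
+ # results = get_features(X, y, n_features=10,
+ #                        cls=["lasso", "ridge", "Random forest (rf)", "XGBoost (xgb)"],
+ #                        n_shared=2, strict=False)
+ # results["cv_test_scores"]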
618
+ def validate_features(
619
+ X_train: pd.DataFrame,
620
+ y_train: pd.Series,
621
+ X_true: pd.DataFrame,
622
+ y_true: pd.Series,
623
+ common_features:set=None,
624
+ classifiers: Optional[Dict[str, Any]] = None,
625
+ metrics: Optional[list] = None,
626
+ random_state: int = 1,
627
+ smote: bool = False,
628
+ plot_: bool = True,
629
+ class_weight: str = "balanced",
630
+ ) -> dict:
631
+ """
632
+ Validate classifiers using selected features on the validation dataset.
633
+
634
+ Parameters:
635
+ - X_train (pd.DataFrame): Training feature dataset.
636
+ - y_train (pd.Series): Training target variable.
637
+ - X_true (pd.DataFrame): Validation feature dataset.
638
+ - y_true (pd.Series): Validation target variable.
639
+ - common_features (set): Set of common features to use for validation.
640
+ - classifiers (dict, optional): Dictionary of classifiers to validate.
641
+ - metrics (list, optional): List of metrics to compute.
642
+ - random_state (int): Random state for reproducibility.
643
+ - plot_ (bool): Option to plot metrics (to be implemented if needed).
644
+ - class_weight (str or dict): Class weights to handle imbalance.
645
+
646
+ """
647
+
648
+ # Ensure common features are selected
649
+ common_features = ips.shared(common_features,
650
+ X_train.columns,
651
+ X_true.columns,
652
+ strict=True)
653
+
654
+ # Filter the training and validation datasets for the common features
655
+ X_train_selected = X_train[common_features]
656
+ X_true_selected = X_true[common_features]
657
+
658
+ if not X_true_selected.index.equals(y_true.index):
659
+ raise ValueError("Index mismatch between validation features and target. Ensure data alignment.")
660
+
661
+ y_true= y_true.loc[X_true_selected.index]
662
+
663
+ # Handle class imbalance using SMOTE
664
+ if smote:
665
+ if y_train.value_counts(normalize=True).max() < 0.8: # Threshold to decide if data is imbalanced
666
+ smote = SMOTE(random_state=random_state)
667
+ X_train_resampled, y_train_resampled = smote.fit_resample(
668
+ X_train_selected, y_train
669
+ )
670
+ else:
671
+ # skip SMOTE
672
+ X_train_resampled, y_train_resampled = X_train_selected, y_train
673
+ else:
674
+ X_train_resampled, y_train_resampled = X_train_selected, y_train
675
+
676
+ # Default classifiers if not provided
677
+ if classifiers is None:
678
+ classifiers = {
679
+ "Random Forest": RandomForestClassifier(
680
+ class_weight=class_weight, random_state=random_state
681
+ ),
682
+ "SVM": SVC(probability=True, class_weight=class_weight),
683
+ "Logistic Regression": LogisticRegression(
684
+ class_weight=class_weight, random_state=random_state
685
+ ),
686
+ "Gradient Boosting": GradientBoostingClassifier(random_state=random_state),
687
+ "AdaBoost": AdaBoostClassifier(random_state=random_state, algorithm="SAMME"),
688
+ "Lasso": LogisticRegression(penalty='l1', solver='saga', random_state=random_state),
689
+ "Ridge": LogisticRegression(penalty='l2', solver='saga', random_state=random_state),
690
+ "Elastic Net": LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5, random_state=random_state),
691
+ "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
692
+ "Naive Bayes": GaussianNB(),
693
+ "LDA": LinearDiscriminantAnalysis()
694
+ }
695
+
696
+ # Hyperparameter grids for tuning
697
+ param_grids = {
698
+ "Random Forest": {
699
+ 'n_estimators': [100, 200, 300, 400, 500],
700
+ 'max_depth': [None, 3, 5, 10, 20],
701
+ 'min_samples_split': [2, 5, 10],
702
+ 'min_samples_leaf': [1, 2, 4],
703
+ 'class_weight': [None, 'balanced']
704
+ },
705
+ "SVM": {
706
+ 'C': [0.01, 0.1, 1, 10, 100, 1000],
707
+ 'gamma': [0.001, 0.01, 0.1, 'scale', 'auto'],
708
+ 'kernel': ['linear', 'rbf', 'poly']
709
+ },
710
+ "Logistic Regression": {
711
+ 'C': [0.01, 0.1, 1, 10, 100],
712
+ 'solver': ['liblinear', 'saga', 'newton-cg', 'lbfgs'],
713
+ 'penalty': ['l1', 'l2'],
714
+ 'max_iter': [100, 200, 300]
715
+ },
716
+ "Gradient Boosting": {
717
+ 'n_estimators': [100, 200, 300, 400, 500],
718
+ 'learning_rate': np.logspace(-3, 0, 4),
719
+ 'max_depth': [3, 5, 7, 9],
720
+ 'min_samples_split': [2, 5, 10]
721
+ },
722
+ "AdaBoost": {
723
+ 'n_estimators': [50, 100, 200, 300, 500],
724
+ 'learning_rate': np.logspace(-3, 0, 4)
725
+ },
726
+ "Lasso": {
727
+ 'C': np.logspace(-3, 1, 10),
728
+ 'max_iter': [100, 200, 300]
729
+ },
730
+ "Ridge": {
731
+ 'C': np.logspace(-3, 1, 10),
732
+ 'max_iter': [100, 200, 300]
733
+ },
734
+ "Elastic Net": {
735
+ 'C': np.logspace(-3, 1, 10),
736
+ 'l1_ratio': [0.1, 0.5, 0.9],
737
+ 'max_iter': [100, 200, 300]
738
+ },
739
+ "XGBoost": {
740
+ 'n_estimators': [100, 200],
741
+ 'max_depth': [3, 5, 7],
742
+ 'learning_rate': [0.01, 0.1, 0.2],
743
+ 'subsample': [0.8, 1.0],
744
+ 'colsample_bytree': [0.8, 1.0]
745
+ },
746
+ "Naive Bayes": {},
747
+ "LDA": {
748
+ 'solver': ['svd', 'lsqr', 'eigen']
749
+ }
750
+ }
751
+ # Default metrics if not provided
752
+ if metrics is None:
753
+ metrics = ["accuracy", "precision", "recall", "f1", "roc_auc", "mcc", "specificity", "balanced_accuracy", "pr_auc"]
754
+
755
+ results = {}
756
+
757
+ # Validate each classifier with GridSearchCV
758
+ for name, clf in classifiers.items():
759
+ print(f"\nValidating {name} on the validation dataset:")
760
+
761
+ # Check if `predict_proba` method exists; if not, use CalibratedClassifierCV
762
+ # For classifiers without predict_proba, CalibratedClassifierCV provides calibrated probability estimates.
763
+ # The check below enables it only when predict_proba is missing, so probability-based metrics (roc_auc, pr_auc) stay available.
764
+ if not hasattr(clf, "predict_proba"):
765
+ print(f"Using CalibratedClassifierCV for {name} due to lack of probability estimates.")
766
+ calibrated_clf = CalibratedClassifierCV(clf, method='sigmoid', cv='prefit')
767
+ else:
768
+ calibrated_clf = clf
769
+ # Stratified K-Fold for cross-validation
770
+ skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
771
+
772
+ # Create GridSearchCV object
773
+ gs = GridSearchCV(
774
+ estimator= calibrated_clf,
775
+ param_grid=param_grids[name],
776
+ scoring="roc_auc", # Optimize for ROC AUC
777
+ cv=skf, # Stratified K-Folds cross-validation
778
+ n_jobs=-1,
779
+ verbose=1,
780
+ )
781
+
782
+ # Fit the model using GridSearchCV
783
+ gs.fit(X_train_resampled, y_train_resampled)
784
+ # Best estimator from grid search
785
+ best_clf = gs.best_estimator_
786
+ # Make predictions on the validation set
787
+ y_pred = best_clf.predict(X_true_selected)
788
+ # Calculate probabilities for ROC AUC if possible
789
+ if hasattr(best_clf, "predict_proba"):
790
+ y_pred_proba = best_clf.predict_proba(X_true_selected)[:, 1]
791
+ elif hasattr(best_clf, "decision_function"):
792
+ # If predict_proba is not available, use decision_function (e.g., for SVM)
793
+ y_pred_proba = best_clf.decision_function(X_true_selected)
794
+ # Ensure y_pred_proba is within 0 and 1 bounds
795
+ y_pred_proba = (y_pred_proba - y_pred_proba.min()) / (y_pred_proba.max() - y_pred_proba.min())
796
+ else:
797
+ y_pred_proba = None # No probability output for certain models
798
+
799
+ # Calculate metrics
800
+ validation_scores = {}
801
+ for metric in metrics:
802
+ if metric == "accuracy":
803
+ validation_scores[metric] = accuracy_score(y_true, y_pred)
804
+ elif metric == "precision":
805
+ validation_scores[metric] = precision_score(y_true, y_pred, average='weighted')
806
+ elif metric == "recall":
807
+ validation_scores[metric] = recall_score(y_true, y_pred, average='weighted')
808
+ elif metric == "f1":
809
+ validation_scores[metric] = f1_score(y_true, y_pred, average='weighted')
810
+ elif metric == "roc_auc" and y_pred_proba is not None:
811
+ validation_scores[metric] = roc_auc_score(y_true, y_pred_proba)
812
+ elif metric == "mcc":
813
+ validation_scores[metric] = matthews_corrcoef(y_true, y_pred)
814
+ elif metric == "specificity":
815
+ tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
816
+ validation_scores[metric] = tn / (tn + fp) # Specificity calculation
817
+ elif metric == "balanced_accuracy":
818
+ validation_scores[metric] = balanced_accuracy_score(y_true, y_pred)
819
+ elif metric == "pr_auc" and y_pred_proba is not None:
820
+ precision, recall, _ = precision_recall_curve(y_true, y_pred_proba)
821
+ validation_scores[metric] = average_precision_score(y_true, y_pred_proba)
822
+
823
+ # Calculate ROC curve
824
+ #https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
825
+ if y_pred_proba is not None:
826
+ # fpr, tpr, roc_auc = dict(), dict(), dict()
827
+ fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
828
+ lower_ci, upper_ci = cal_auc_ci(y_true, y_pred_proba)
829
+ roc_auc=auc(fpr, tpr)
830
+ roc_info={
831
+ "fpr": fpr.tolist(),
832
+ "tpr": tpr.tolist(),
833
+ "auc":roc_auc,
834
+ "ci95":(lower_ci, upper_ci)
835
+ }
836
+ # precision-recall curve
837
+ precision_, recall_, _ = precision_recall_curve(y_true, y_pred_proba)
838
+ avg_precision_ = average_precision_score(y_true, y_pred_proba)
839
+ pr_info = {"precision": precision_,
840
+ "recall":recall_,
841
+ "avg_precision":avg_precision_
842
+ }
843
+ else:
844
+ roc_info,pr_info=None,None
845
+ results[name] = {
846
+ "best_params": gs.best_params_,
847
+ "scores": validation_scores,
848
+ "roc_curve": roc_info,
849
+ "pr_curve": pr_info,
850
+ "confusion_matrix": confusion_matrix(y_true, y_pred),
851
+ }
852
+
853
+ df_results = pd.DataFrame.from_dict(results, orient="index")
854
+
855
+ return df_results
856
+
857
+ #! usage validate_features()
858
+ # Validate classifiers using the validation dataset (X_val, y_val)
859
+ # validation_results = validate_features(X, y, X_val, y_val, common_features)
860
+
861
+ # # If you want to access validation scores
862
+ # print(validation_results)
863
+
864
+
865
+ def cal_auc_ci(y_true, y_pred, n_bootstraps=1000, ci=0.95,random_state=1):
866
+ y_true = np.asarray(y_true)
867
+ y_pred = np.asarray(y_pred)
868
+ bootstrapped_scores = []
869
+ print("auroc score:", roc_auc_score(y_true, y_pred))
870
+ rng = np.random.RandomState(random_state)
871
+ for i in range(n_bootstraps):
872
+ # bootstrap by sampling with replacement on the prediction indices
873
+ indices = rng.randint(0, len(y_pred), len(y_pred))
874
+ if len(np.unique(y_true[indices])) < 2:
875
+ # We need at least one positive and one negative sample for ROC AUC
876
+ # to be defined: reject the sample
877
+ continue
878
+ if isinstance(y_true, np.ndarray):
879
+ score = roc_auc_score(y_true[indices], y_pred[indices])
880
+ else:
881
+ score = roc_auc_score(y_true.iloc[indices], y_pred.iloc[indices])
882
+ bootstrapped_scores.append(score)
883
+ # print("Bootstrap #{} ROC area: {:0.3f}".format(i + 1, score))
884
+ sorted_scores = np.array(bootstrapped_scores)
885
+ sorted_scores.sort()
886
+
887
+ # Compute the two-sided confidence interval from the bootstrap distribution
888
+ # (e.g. ci=0.95 uses the 2.5th and 97.5th percentiles).
889
+ alpha = 1 - ci
890
+ confidence_lower = sorted_scores[int((alpha / 2) * len(sorted_scores))]
891
+ confidence_upper = sorted_scores[int((1 - alpha / 2) * len(sorted_scores))]
892
+ print(
893
+ "Confidence interval for the score: [{:0.3f} - {:0.3}]".format(
894
+ confidence_lower, confidence_upper
895
+ )
896
+ )
897
+ return confidence_lower, confidence_upper
898
+
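+ #* usage sketch (illustrative): bootstrap CI for the AUC of a fitted classifier `clf` on a held-out set.
+ # y_prob = clf.predict_proba(X_test)[:, 1]
+ # lower_ci, upper_ci = cal_auc_ci(y_test, y_prob, n_bootstraps=1000, ci=0.95)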
899
+ def plot_roc_curve(
900
+ fpr=None,
901
+ tpr=None,
902
+ mean_auc=None,
903
+ lower_ci=None,
904
+ upper_ci=None,
905
+ color="#FF8F00",
906
+ lw=2,
907
+ alpha=0.1,
908
+ ci_display=True,
909
+ title="ROC Curve",
910
+ xlabel="1−Specificity",
911
+ ylabel="Sensitivity",
912
+ legend_loc="lower right",
913
+ diagonal_color="0.5",
914
+ figsize=(5, 5),
915
+ ax=None,
916
+ **kwargs
917
+ ):
918
+ if ax is None:
919
+ fig, ax = plt.subplots(figsize=figsize)
920
+ if mean_auc is not None:
921
+ if ci_display:
922
+ label = (
923
+ f"ROC curve (AUC = {mean_auc:.3f})\n95% CI: {lower_ci:.3f} - {upper_ci:.3f}"
924
+ )
925
+ else:
926
+ label = f"ROC curve (AUC = {mean_auc:.3f})"
927
+ else:
928
+ label = None
929
+
930
+ # Plot ROC curve and the diagonal reference line
931
+ ax.fill_between(fpr, tpr, alpha=alpha, color=color)
932
+ ax.plot([0, 1], [0, 1], color=diagonal_color, linestyle="--")
933
+ ax.plot(fpr, tpr, color=color, lw=lw, label=label,**kwargs)
934
+ # Setting plot limits, labels, and title
935
+ ax.set_xlim([-0.01, 1.0])
936
+ ax.set_ylim([0.0, 1.0])
937
+ ax.set_xlabel(xlabel)
938
+ ax.set_ylabel(ylabel)
939
+ ax.set_title(title)
940
+ ax.legend(loc=legend_loc)
941
+ return ax
942
+ #* usage: ml2ls.plot_roc_curve(fpr, tpr, mean_auc, lower_ci, upper_ci)
943
+ # for model_name in flatten(validation_results["roc_curve"].keys())[2:]:
944
+ # fpr = validation_results["roc_curve"][model_name]["fpr"]
945
+ # tpr = validation_results["roc_curve"][model_name]["tpr"]
946
+ # (lower_ci, upper_ci) = validation_results["roc_curve"][model_name]["ci95"]
947
+ # mean_auc = validation_results["roc_curve"][model_name]["auc"]
948
+
949
+ # # Plotting
950
+ # ml2ls.plot_roc_curve(fpr, tpr, mean_auc, lower_ci, upper_ci)
951
+ # figsets(title=model_name)
952
+
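+ #* usage sketch (illustrative): building the ROC inputs directly from held-out predictions
+ #  (instead of reading them back from validation_results).
+ # fpr, tpr, _ = roc_curve(y_test, y_prob)
+ # lower_ci, upper_ci = cal_auc_ci(y_test, y_prob)
+ # plot_roc_curve(fpr, tpr, mean_auc=auc(fpr, tpr), lower_ci=lower_ci, upper_ci=upper_ci)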
953
+ def plot_pr_curve(
954
+ recall=None,
955
+ precision=None,
956
+ avg_precision=None,
957
+ model_name=None,
958
+ lw=2,
959
+ figsize=[5, 5],
960
+ title="Precision-Recall Curve",
961
+ xlabel="Recall",
962
+ ylabel="Precision",
963
+ alpha=0.1,
964
+ color="#FF8F00",
965
+ legend_loc="lower left",
966
+ ax=None,
967
+ **kwargs
968
+ ):
969
+ if ax is None:
970
+ fig, ax = plt.subplots(figsize=figsize)
971
+
972
+ # Plot Precision-Recall curve
973
+ ax.plot(recall,
974
+ precision,
975
+ lw=lw,
976
+ color=color,
977
+ label=( f"PR curve (AUC={avg_precision:.2f})"),
978
+ **kwargs)
979
+ # Fill area under the curve
980
+ ax.fill_between(recall, precision, alpha=alpha, color=color)
981
+
982
+ # Customize axes
983
+ ax.set_title(title)
984
+ ax.set_xlabel(xlabel)
985
+ ax.set_ylabel(ylabel)
986
+ ax.set_xlim([-0.01, 1.0])
987
+ ax.set_ylim([0.0, 1.0])
988
+ ax.grid(False)
989
+ ax.legend(loc=legend_loc)
990
+ return ax
991
+ #* usage: ml2ls.plot_pr_curve()
992
+ # for md_name in flatten(validation_results["pr_curve"].keys()):
993
+ # ml2ls.plot_pr_curve(
994
+ # recall=validation_results["pr_curve"][md_name]["recall"],
995
+ # precision=validation_results["pr_curve"][md_name]["precision"],
996
+ # avg_precision=validation_results["pr_curve"][md_name]["avg_precision"],
997
+ # model_name=md_name,
998
+ # lw=2,
999
+ # alpha=0.1,
1000
+ # color="r",
1001
+ # )
1002
+
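+ #* usage sketch (illustrative): computing the precision-recall inputs directly from held-out predictions.
+ # precision_, recall_, _ = precision_recall_curve(y_test, y_prob)
+ # plot_pr_curve(recall=recall_, precision=precision_,
+ #               avg_precision=average_precision_score(y_test, y_prob))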
1003
+ def plot_cm(
1004
+ cm,
1005
+ labels_name=None,
1006
+ thresh=0.8,
1007
+ axis_labels=None,
1008
+ cmap="Reds",
1009
+ normalize=True,
1010
+ xlabel="Predicted Label",
1011
+ ylabel="Actual Label",
1012
+ fontsize=12,
1013
+ figsize=[5, 5],
1014
+ ax=None,
1015
+ ):
1016
+ if ax is None:
1017
+ fig, ax = plt.subplots(figsize=figsize)
1018
+
1019
+ cm_normalized = np.round(cm.astype("float") / cm.sum(axis=1)[:, np.newaxis] * 100, 2)
1020
+ cm_value = cm_normalized if normalize else cm.astype("int")
1021
+ # Plot the heatmap
1022
+ cax = ax.imshow(cm_normalized, interpolation="nearest", cmap=cmap)
1023
+ plt.colorbar(cax, ax=ax, fraction=0.046, pad=0.04)
1024
+ cax.set_clim(0, 100)
1025
+
1026
+ # Define tick labels based on provided labels
1027
+ num_local = np.arange(len(labels_name)) if labels_name is not None else range(2)
1028
+ if axis_labels is None:
1029
+ axis_labels = labels_name if labels_name is not None else ["No","Yes"]
1030
+ ax.set_xticks(num_local)
1031
+ ax.set_xticklabels(axis_labels)
1032
+ ax.set_yticks(num_local)
1033
+ ax.set_yticklabels(axis_labels)
1034
+ ax.set_ylabel(ylabel)
1035
+ ax.set_xlabel(xlabel)
1036
+ plot.figsets(ax=ax, xtickloc="tl", boxloc="none")
1037
+
1038
+ # Add TN, FP, FN, TP annotations specifically for binary classification (2x2 matrix)
1039
+ if labels_name is None or len(labels_name) == 2:
1040
+ # True Negative (TN), False Positive (FP), False Negative (FN), and True Positive (TP)
1041
+ # Predicted
1042
+ # 0 | 1
1043
+ # ----------------
1044
+ # 0 | TN | FP
1045
+ # Actual ----------------
1046
+ # 1 | FN | TP
1047
+ tn_label = "TN"
1048
+ fp_label = "FP"
1049
+ fn_label = "FN"
1050
+ tp_label = "TP"
1051
+
1052
+ # Adjust positions slightly for TN, FP, FN, TP labels
1053
+ ax.text(0,0,
1054
+ f"{tn_label}:{cm_normalized[0, 0]:.2f}%" if normalize else f"{tn_label}:{cm_value[0, 0]}",
1055
+ ha="center",
1056
+ va="center",
1057
+ color="white" if cm_normalized[0, 0] > thresh * 100 else "black",
1058
+ fontsize=fontsize,
1059
+ )
1060
+ ax.text(1,0,
1061
+ f"{fp_label}:{cm_normalized[0, 1]:.2f}%" if normalize else f"{tn_label}:{cm_value[0, 1]}",
1062
+ ha="center",
1063
+ va="center",
1064
+ color="white" if cm_normalized[0, 1] > thresh * 100 else "black",
1065
+ fontsize=fontsize,
1066
+ )
1067
+ ax.text(0,1,
1068
+ f"{fn_label}:{cm_normalized[1, 0]:.2f}%" if normalize else f"{tn_label}:{cm_value[1, 0]}",
1069
+ ha="center",
1070
+ va="center",
1071
+ color="white" if cm_normalized[1, 0] > thresh * 100 else "black",
1072
+ fontsize=fontsize,
1073
+ )
1074
+ ax.text(1,1,
1075
+ f"{tp_label}:{cm_normalized[1, 1]:.2f}%" if normalize else f"{tn_label}:{cm_value[1, 1]}",
1076
+ ha="center",
1077
+ va="center",
1078
+ color="white" if cm_normalized[1, 1] > thresh * 100 else "black",
1079
+ fontsize=fontsize,
1080
+ )
1081
+ else:
1082
+ # Annotate cells with normalized percentage values
1083
+ for i in range(len(labels_name)):
1084
+ for j in range(len(labels_name)):
1085
+ val = cm_normalized[i, j]
1086
+ color = "white" if val > thresh * 100 else "black"
1087
+ ax.text(j,i,
1088
+ f"{val:.2f}%",
1089
+ ha="center",
1090
+ va="center",
1091
+ color=color,
1092
+ fontsize=fontsize,
1093
+ )
1094
+ return ax
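+ #* usage sketch (illustrative): plotting a confusion matrix for a fitted classifier `clf`;
+ #  the class names are placeholders.
+ # y_pred = clf.predict(X_test)
+ # cm = confusion_matrix(y_test, y_pred)
+ # plot_cm(cm, labels_name=["Control", "Case"], normalize=True)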