py2ls 0.2.4.22__py3-none-any.whl → 0.2.4.24__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.
- py2ls/.DS_Store +0 -0
- py2ls/.git/.DS_Store +0 -0
- py2ls/.git/index +0 -0
- py2ls/.git/objects/.DS_Store +0 -0
- py2ls/.git/refs/.DS_Store +0 -0
- py2ls/data/.DS_Store +0 -0
- py2ls/data/styles/.DS_Store +0 -0
- py2ls/ips.py +213 -195
- py2ls/ml2ls.py +774 -66
- {py2ls-0.2.4.22.dist-info → py2ls-0.2.4.24.dist-info}/METADATA +1 -1
- {py2ls-0.2.4.22.dist-info → py2ls-0.2.4.24.dist-info}/RECORD +12 -9
- {py2ls-0.2.4.22.dist-info → py2ls-0.2.4.24.dist-info}/WHEEL +0 -0
py2ls/ml2ls.py
CHANGED
@@ -5,7 +5,6 @@ from sklearn.ensemble import (
     BaggingClassifier,
 )
 from sklearn.svm import SVC, SVR
-from sklearn.calibration import CalibratedClassifierCV
 from sklearn.model_selection import GridSearchCV, StratifiedKFold
 from sklearn.linear_model import (
     LassoCV,
@@ -16,12 +15,7 @@ from sklearn.linear_model import (
     RidgeClassifierCV,
     ElasticNet,
 )
-
-from sklearn.naive_bayes import GaussianNB
-from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
-import xgboost as xgb  # Make sure you have xgboost installed
-
-from sklearn.model_selection import train_test_split, cross_val_score
+
 from sklearn.metrics import (
     accuracy_score,
     precision_score,
@@ -36,18 +30,12 @@ from sklearn.metrics import (
     precision_recall_curve,
     average_precision_score,
 )
-from imblearn.over_sampling import SMOTE
-from sklearn.pipeline import Pipeline
-from collections import defaultdict
-from sklearn.preprocessing import StandardScaler, OneHotEncoder
 from typing import Dict, Any, Optional, List, Union
 import numpy as np
 import pandas as pd
 from . import ips
 from . import plot
 import matplotlib.pyplot as plt
-import seaborn as sns
-
 plt.style.use(str(ips.get_cwd()) + "/data/styles/stylelib/paper.mplstyle")
 import logging
 import warnings
@@ -314,6 +302,8 @@ def features_svm(
     - Use case: It’s not as widely used as the RBF or linear kernel but can be explored when there is some evidence of non-linear
       S-shaped relationships.
     """
+    from sklearn.feature_selection import RFE
+    from sklearn.svm import SVC
     # SVM (Support Vector Machines)
     svc = SVC(kernel=rfe_params["kernel"])  # ["linear", "rbf", "poly", "sigmoid"]
     # RFE(Recursive Feature Elimination)
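For context, the two imports added above drive recursive feature elimination inside `features_svm`. A minimal, self-contained sketch of that pattern (illustrative only; the dataset and feature counts are invented):

```python
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.svm import SVC

X, y = make_classification(n_samples=200, n_features=10, n_informative=4, random_state=0)
# RFE refits the estimator repeatedly, dropping the weakest features each round;
# a linear kernel is used here so that coef_ is available for ranking
selector = RFE(SVC(kernel="linear"), n_features_to_select=4).fit(X, y)
print(selector.support_)  # boolean mask of the retained features
```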
@@ -450,6 +440,7 @@ def validate_classifier(
     Returns:
     - results: Dictionary containing average cv_train_scores and cv_test_scores.
     """
+    from sklearn.model_selection import cross_val_score
     cv_train_scores = {metric: [] for metric in metrics}
     skf = StratifiedKFold(n_splits=cv_folds)
     # Perform cross-validation
@@ -982,6 +973,8 @@ def validate_features(
 
     """
     from tqdm import tqdm
+    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
+    from sklearn.calibration import CalibratedClassifierCV
 
     # Ensure common features are selected
     common_features = ips.shared(
@@ -1001,6 +994,7 @@ def validate_features(
 
     # Handle class imbalance using SMOTE
     if smote:
+        from imblearn.over_sampling import SMOTE
         if (
             y_train.value_counts(normalize=True).max() < 0.8
         ):  # Threshold to decide if data is imbalanced
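The import moved above only runs when `smote=True` and the class-ratio check fires. A minimal sketch of what SMOTE oversampling does (illustrative only; the synthetic dataset is invented):

```python
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=300, weights=[0.9, 0.1], random_state=0)
print(Counter(y))                                    # roughly 9:1, i.e. clearly imbalanced
X_res, y_res = SMOTE(random_state=0).fit_resample(X, y)
print(Counter(y_res))                                # classes balanced with synthetic minority samples
```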
@@ -2096,7 +2090,116 @@ def rank_models(
     # )
 
     # figsave("classifier_performance.pdf")
+def rank_models_reg(df, ascending=False):
+    """
+    Sorts models based on MSE, RMSE, MAE, and R² with custom priority logic.
 
+    Parameters:
+        df (pd.DataFrame): DataFrame containing the regression metrics.
+        ascending (bool): Whether to sort in ascending order of ranking score.
+
+    Returns:
+        pd.DataFrame: Sorted DataFrame with an added "Ranking_Score" column.
+    """
+    # Define weights for the 4 metrics
+    weights = {
+        "mse": -1,  # Lower is better
+        "rmse": -1,  # Lower is better
+        "mae": -1,  # Lower is better
+        "r2": 1,  # Higher is better
+    }
+
+    # Normalize the selected metrics
+    df = df.copy()  # Work on a copy of the DataFrame
+    for metric, weight in weights.items():
+        if metric in df.columns:
+            if weight > 0:  # Higher is better; normalize 0-1
+                df[metric + "_normalized"] = (df[metric] - df[metric].min()) / (
+                    df[metric].max() - df[metric].min()
+                )
+            else:  # Lower is better; reverse normalize 0-1
+                df[metric + "_normalized"] = (df[metric].max() - df[metric]) / (
+                    df[metric].max() - df[metric].min()
+                )
+
+    # Calculate ranking score as a weighted sum
+    df["Ranking_Score"] = sum(
+        df[metric + "_normalized"] * abs(weights[metric])
+        for metric in weights.keys()
+        if metric + "_normalized" in df.columns
+    )
+
+    # Sort models based on the ranking score
+    sorted_df = df.sort_values(by="Ranking_Score", ascending=ascending)
+    return sorted_df
+
+models_support = {
+    "classification": {
+        "Random Forest": "Tree-Based",
+        "SVM": "Kernel-Based",
+        "Logistic Regression": "Linear",
+        "Lasso Logistic Regression": "Linear",
+        "Gradient Boosting": "Tree-Based",
+        "XGBoost": "Tree-Based",
+        "KNN": "Instance-Based",
+        "Naive Bayes": "Probabilistic",
+        "Linear Discriminant Analysis": "Linear",
+        "AdaBoost": "Tree-Based",
+        "CatBoost": "Tree-Based",
+        "Extra Trees": "Tree-Based",
+        "Bagging": "Tree-Based",
+        "Neural Network": "Neural Network",
+        "DecisionTree": "Tree-Based",
+        "Quadratic Discriminant Analysis": "Probabilistic",
+        "Ridge": "Linear",
+        "Perceptron": "Linear",
+        "Bernoulli Naive Bayes": "Probabilistic",
+        "SGDClassifier": "Linear",
+    },
+    "regression": {
+        "Linear Regression": "Linear",
+        "Ridge": "Linear",
+        "RidgeCV": "Linear",
+        "TheilSenRegressor": "Linear",
+        "HuberRegressor": "Linear",
+        "PoissonRegressor": "Linear",
+        "LassoCV": "Linear",
+        "Bagging": "Tree-Based",
+        "ElasticNet": "Linear",
+        "Random Forest": "Tree-Based",
+        "Gradient Boosting": "Tree-Based",
+        "XGBoost": "Tree-Based",
+        "CatBoost": "Tree-Based",
+        "Extra Trees": "Tree-Based",
+        "SVM": "Kernel-Based",
+        "KNN": "Instance-Based",
+        "Neural Network": "Neural Network",
+        "AdaBoost": "Linear",
+    },
+}
+def select_top_models(models, categories, n_top_models, n_models_per_category=1):
+    """
+    models = list_sort
+    purpose = "regression"
+    categories = models_support[purpose]
+    n_top_models = 3
+    select_top_models(models, categories, n_top_models)
+    """
+    selected = {}
+    result = []
+    for model in models:
+        category = categories.get(model, "Unknown")
+        if category not in selected:
+            selected[category] = 0  # Initialize counter for the category
+
+        if selected[category] < n_models_per_category:  # Allow additional models up to the limit
+            selected[category] += 1
+            result.append(model)
+
+        if len(result) == n_top_models:  # Stop when the desired number of models is reached
+            break
+
+    return result
 
 def predict(
     x_train: pd.DataFrame,
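Illustrative sketch, not part of the package diff: how the new `rank_models_reg`, `models_support`, and `select_top_models` helpers might be combined. The metric values and model names below are invented.

```python
import pandas as pd
from py2ls.ml2ls import rank_models_reg, select_top_models, models_support

# Hypothetical regression metrics for three fitted models
scores = pd.DataFrame(
    {"mse": [4.2, 3.1, 5.0], "rmse": [2.05, 1.76, 2.24],
     "mae": [1.6, 1.3, 1.9], "r2": [0.71, 0.78, 0.65]},
    index=["Ridge", "Random Forest", "KNN"],
)

ranked = rank_models_reg(scores, ascending=False)   # adds "Ranking_Score", best model first
top = select_top_models(
    models=list(ranked.index),
    categories=models_support["regression"],        # model name -> family mapping defined above
    n_top_models=2,
    n_models_per_category=1,                        # at most one model per family
)
print(top)  # e.g. ['Random Forest', 'Ridge']
```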
@@ -2104,11 +2207,17 @@ def predict(
     x_true: pd.DataFrame = None,
     y_true: Optional[pd.Series] = None,
     backward: bool = False,  # backward_regression
+    backward_thr:float = 0.05,# pval thr,only works when backward is True
     common_features: set = None,
     purpose: str = "classification",  # 'classification' or 'regression'
     cls: Optional[Dict[str, Any]] = None,
     metrics: Optional[List[str]] = None,
-
+    stack:bool=True,# run stacking
+    stacking_cv:bool=False,# stacking cross_validate, default(False),keep it simple
+    vote:bool=True,# run voting
+    voting:str="hard", # only for classification purporse of voting
+    n_top_models:int=5, #for stacking models
+    n_models_per_category:int=1, #for stacking models,可以允许同一个类别2种模型
     smote: bool = False,
     n_jobs: int = -1,
     plot_: bool = True,
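A hedged usage sketch (not from the package) of the new ensemble-related keywords on `predict()`. It assumes `y_train` is the second parameter, as the function body suggests, and uses invented synthetic data.

```python
import pandas as pd
from sklearn.datasets import make_classification
from py2ls import ml2ls

X, y = make_classification(n_samples=200, n_features=8, random_state=1)
df = pd.DataFrame(X, columns=[f"f{i}" for i in range(8)])
df["outcome"] = y
df_train, df_test = df.iloc[:150], df.iloc[150:]

res = ml2ls.predict(
    x_train=df_train,
    y_train="outcome",                      # resolved to the column inside predict()
    x_true=df_test.drop(columns="outcome"),
    y_true=df_test["outcome"],
    purpose="classification",
    backward=True, backward_thr=0.05,       # p-value cut-off for backward_regression
    stack=True, stacking_cv=False,          # stack the top-ranked models
    vote=True, voting="soft",               # also build a voting ensemble
    n_top_models=5, n_models_per_category=1,
)
```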
@@ -2117,6 +2226,7 @@ def predict(
     cv_folds: int = 5,  # more cv_folds 得更加稳定,auc可能更低
     cv_level: str = "l",  # "s":'low',"m":'medium',"l":"high"
     class_weight: str = "balanced",
+    random_state: int = 1,
     verbose: bool = False,
 ) -> pd.DataFrame:
     """
@@ -2184,10 +2294,17 @@ def predict(
         RidgeClassifierCV,
         Perceptron,
         SGDClassifier,
+        RidgeCV,
+        Ridge,
+        TheilSenRegressor,
+        HuberRegressor,
+        PoissonRegressor,
+
     )
+    from sklearn.compose import TransformedTargetRegressor
     from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
     from sklearn.naive_bayes import GaussianNB, BernoulliNB
-    from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
+    from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor,StackingClassifier,StackingRegressor
     import xgboost as xgb
     import lightgbm as lgb
     import catboost as cb
@@ -2198,6 +2315,7 @@ def predict(
         QuadraticDiscriminantAnalysis,
     )
     from sklearn.preprocessing import PolynomialFeatures
+    from sklearn.model_selection import train_test_split
 
     # 拼写检查
     purpose = ips.strcmp(purpose, ["classification", "regression"])[0]
@@ -2261,7 +2379,6 @@ def predict(
             "Gradient Boosting": GradientBoostingRegressor(random_state=random_state),
             "XGBoost": xgb.XGBRegressor(eval_metric="rmse", random_state=random_state),
             "Linear Regression": LinearRegression(),
-            "Lasso": Lasso(random_state=random_state),
             "AdaBoost": AdaBoostRegressor(random_state=random_state),
             # "LightGBM": lgb.LGBMRegressor(random_state=random_state),
             "CatBoost": cb.CatBoostRegressor(verbose=0, random_state=random_state),
@@ -2271,10 +2388,10 @@ def predict(
             "ElasticNet": ElasticNet(random_state=random_state),
             "Ridge": Ridge(),
             "KNN": KNeighborsRegressor(),
+            "TheilSen":TheilSenRegressor(),
+            "Huber":HuberRegressor(),
+            "Poisson":PoissonRegressor()
         }
-    # indicate cls:
-    if ips.run_once_within(30):  # 10 min
-        print(f"supported models: {list(model_.keys())}")
     if cls is None:
         models = model_
     else:
|
|
2290
2407
|
ips.df_special_characters_cleaner(x_true) if x_true is not None else None
|
2291
2408
|
)
|
2292
2409
|
|
2410
|
+
# indicate cls:
|
2411
|
+
if ips.run_once_within(30): # 10 min
|
2412
|
+
print(f"processing: {list(models.keys())}")
|
2413
|
+
|
2293
2414
|
if isinstance(y_train, str) and y_train in x_train.columns:
|
2294
2415
|
y_train_col_name = y_train
|
2295
2416
|
y_train = x_train[y_train]
|
@@ -2311,7 +2432,7 @@ def predict(
 
     # Perform backward feature selection
     if backward:
-        selected_features = backward_regression(x_train, y_train,
+        selected_features = backward_regression(x_train, y_train, thr=backward_thr)
         x_train = x_train[selected_features]
 
     if x_true is None:
@@ -2391,10 +2512,22 @@ def predict(
     if isinstance(y_train, np.ndarray):
         y_train = ips.df_encoder(data=pd.DataFrame(y_train), method="label")
         y_train = np.asarray(y_train)
-    if
-
-
+    if y_true is not None:
+        if isinstance(y_train, np.ndarray):
+            y_true = ips.df_encoder(data=pd.DataFrame(y_true), method="label")
+            y_true = np.asarray(y_true)
     # Hyperparameter grids for tuning
+    param_grid_common_xgb = {
+        'learning_rate': [0.01, 0.1, 0.2, 0.3],
+        'max_depth': [3, 5, 7, 10],
+        'n_estimators': [50, 100, 200, 300],
+        'subsample': [0.6, 0.8, 1.0],
+        'colsample_bytree': [0.6, 0.8, 1.0],
+        'gamma': [0, 0.1, 0.2, 0.5],
+        'min_child_weight': [1, 5, 10],
+        'reg_alpha': [0, 0.1, 0.5, 1],  # L1 regularization term
+        'reg_lambda': [1, 1.5, 2],  # L2 regularization term
+    }
     if cv_level in ["low", "simple", "s", "l"]:
         param_grids = {
             "Random Forest": (
@@ -2416,8 +2549,8 @@ def predict(
                 }
             ),
             "SVM": {
-                "C": [1],
-                "gamma": ["scale"],
+                "C": [0.1, 1, 10],
+                "gamma": ["scale", 0.1, 1],
                 "kernel": ["rbf"],
             },
             "Lasso": {
@@ -2439,12 +2572,17 @@ def predict(
                 "min_samples_split": [2],
                 "subsample": [0.8],
             },
-            "XGBoost":
-
-
-
-
-
+            "XGBoost":{
+                'learning_rate': [0.01],
+                'max_depth': [3],
+                'n_estimators': [50],
+                'subsample': [0.6],
+                'colsample_bytree': [0.6],
+                'gamma': [0, 0.1],
+                'min_child_weight': [1],
+                'reg_alpha': [0, 0.1],
+                'reg_lambda': [1],
+                'objective': ['binary:logistic'] if purpose == "classification" else ['reg:squarederror']
             },
             "KNN": (
                 {
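For orientation, a grid like the single-point "low" level above is consumed by GridSearchCV in the usual way; a minimal sketch of that pattern (illustrative only, not the package's exact call):

```python
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, StratifiedKFold
import xgboost as xgb

X, y = make_classification(n_samples=200, random_state=0)
# One value per parameter, as in the "low" cv_level above, keeps the search to a single fit per fold
grid = {"learning_rate": [0.01], "max_depth": [3], "n_estimators": [50],
        "objective": ["binary:logistic"]}
gs = GridSearchCV(xgb.XGBClassifier(eval_metric="logloss"),
                  param_grid=grid, scoring="roc_auc",
                  cv=StratifiedKFold(n_splits=5), n_jobs=-1)
gs.fit(X, y)
print(gs.best_params_)
```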
@@ -2551,6 +2689,14 @@ def predict(
                 "random_state": [random_state],
                 "learning_rate": ["constant"],
             },
+            "TheilSen":{'max_iter': [100],
+                        'tol': [1e-4],
+                        'n_subsamples': [100+x_train.shape[1]]},
+            "Huber":{'epsilon': [1.35],
+                     'alpha': [0.1],
+                     'max_iter': [100],},
+            "Poisson":{'alpha': [0.1],
+                       'max_iter': [100],}
         }
     elif cv_level in ["high", "advanced", "h"]:
         param_grids = {
@@ -2612,12 +2758,30 @@ def predict(
                 "subsample": [0.8, 1.0],
             },
             "XGBoost": {
-
-
-
-
-
-
+                'learning_rate': [0.01, 0.1, 0.2, 0.3],
+                'max_depth': [3, 5, 7, 10],
+                'n_estimators': [50, 100, 200, 300],
+                'subsample': [0.6, 0.8, 1.0],
+                'gamma': [0, 0.1, 0.2, 0.5],
+                'min_child_weight': [1, 5, 10],
+                'reg_alpha': [0, 0.1, 0.5, 1],
+                'reg_lambda': [1, 1.5, 2],
+                **{
+                    'objective': ['binary:logistic', 'multi:softmax', 'multi:softprob'],
+                }} if purpose== "classification"
+            else{
+                'learning_rate': [0.01, 0.1, 0.2, 0.3],
+                'max_depth': [3, 5, 7, 10],
+                'n_estimators': [50, 100, 200, 300],
+                'subsample': [0.6, 0.8, 1.0],
+                'colsample_bytree': [0.6, 0.8, 1.0],
+                'gamma': [0, 0.1, 0.2, 0.5],
+                'min_child_weight': [1, 5, 10],
+                'reg_alpha': [0, 0.1, 0.5, 1],
+                'reg_lambda': [1, 1.5, 2],
+                **{
+                    'objective': ['reg:squarederror', 'reg:squaredlogerror', 'reg:gamma'],
+                }},
             "KNN": (
                 {
                     "n_neighbors": [1, 3, 5, 10, 15, 20],
@@ -2730,6 +2894,14 @@ def predict(
                     ],  # If True, the regressors X will be normalized
                 }
             ),
+            "TheilSen":{'max_iter': [100, 200, 300],
+                        'tol': [1e-4, 1e-3, 1e-2],
+                        'n_subsamples': [100+x_train.shape[1], 200+x_train.shape[1], 300+x_train.shape[1]]},
+            "Huber":{'epsilon': [1.35, 1.5, 2.0],
+                     'alpha': [0.1, 1.0, 10.0],
+                     'max_iter': [100, 200, 300],},
+            "Poisson":{'alpha': [0.1, 1.0, 10.0],
+                       'max_iter': [100, 200, 300],}
         }
     else:  # median level
         param_grids = {
@@ -2789,12 +2961,30 @@ def predict(
                 "subsample": [0.8, 1.0],
             },
             "XGBoost": {
-
-
-
-
-
-
+                'learning_rate': [0.01, 0.1],
+                'max_depth': [3, 5],
+                'n_estimators': [50, 100],
+                'subsample': [0.6, 0.8],
+                'gamma': [0, 0.1],
+                'min_child_weight': [1, 5],
+                'reg_alpha': [0, 0.1],
+                'reg_lambda': [1,],
+                **{
+                    'objective': ['binary:logistic', 'multi:softmax'],
+                }} if purpose== "classification"
+            else{
+                'learning_rate': [0.01, 0.1],
+                'max_depth': [3, 5,],
+                'n_estimators': [50, 100],
+                'subsample': [0.6, 0.8],
+                'colsample_bytree': [0.6, 0.8],
+                'gamma': [0, 0.1],
+                'min_child_weight': [1, 5],
+                'reg_alpha': [0, 0.1],
+                'reg_lambda': [1, 1.5],
+                **{
+                    'objective': ['reg:squarederror', 'reg:squaredlogerror'],
+                }},
             "KNN": (
                 {
                     "n_neighbors": [3, 5, 7, 10],
@@ -2951,6 +3141,14 @@ def predict(
                     ],  # Solver for optimization
                 }
             ),
+            "TheilSen":{'max_iter': [100, 200],
+                        'tol': [1e-4, 1e-3],
+                        'n_subsamples': [100+x_train.shape[1], 200+x_train.shape[1]]},
+            "Huber":{'epsilon': [1.35, 1.5],
+                     'alpha': [0.1, 1.0],
+                     'max_iter': [100, 200],},
+            "Poisson":{'alpha': [0.1, 1.0],
+                       'max_iter': [100, 200],}
         }
 
     results = {}
@@ -3191,12 +3389,18 @@ def predict(
     # Convert results to DataFrame
     df_results = pd.DataFrame.from_dict(results, orient="index")
     # sort
-    if y_true is not None
-
-
-
+    if y_true is not None:
+        if purpose == "classification":
+            df_scores = pd.DataFrame(
+                df_results["scores"].tolist(), index=df_results["scores"].index
+            ).sort_values(by="roc_auc", ascending=False)
+        elif purpose=='regression':
+            df_scores = rank_models_reg(
+                pd.DataFrame(df_results["scores"].tolist(), index=df_results["scores"].index),
+                ascending=False)
         df_results = df_results.loc[df_scores.index]
 
+    if y_true is not None and purpose == "classification":
         if plot_:
             from datetime import datetime
 
@@ -3214,18 +3418,503 @@ def predict(
             plot.figsets(xangle=30)
             if dir_save:
                 ips.figsave(dir_save + f"scores_clus{now_}.pdf")
+    # if all([plot_, y_true is not None, purpose == "classification"]):
+    #     # try:
+    #     if len(models) > 3:
+    #         plot_validate_features(df_results, is_binary=is_binary)
+    #     else:
+    #         plot_validate_features_single(df_results, is_binary=is_binary)
+    #     if dir_save:
+    #         ips.figsave(dir_save + f"validate_features{now_}.pdf")
+    #     # except Exception as e:
+    #     #     print(f"Error: 在画图的过程中出现了问题:{e}")
+    if stack:
+        #! stacking classifier/regressor
+        from sklearn.metrics import make_scorer, accuracy_score
+        from sklearn.model_selection import cross_val_score
+
+        #* n_top_models防止超过index
+        n_top_models = min(n_top_models, df_results.shape[0])
+
+        #* 选择出排名靠前的n个, estimators
+        models_selecte = select_top_models(models=list(df_results.index),
+                                           categories=models_support[purpose],
+                                           n_top_models=n_top_models,
+                                           n_models_per_category=n_models_per_category)
+        top_models = df_results.loc[models_selecte]["best_clf"]
+        base_estimators = []
+        for i, j in top_models.to_dict().items():
+            base_estimators.append((i, j))
+        if stacking_cv:
+            print(f" ⤵ stacking_cv is processing...")
+            #* 定义几个象征性的final_estimator
+            # 备选的几种
+            if purpose == "classification":
+                kadt_estimators=["XGBoost","SVM","Logistic Regression","Neural Network"]
+            else:
+                kadt_estimators=["XGBoost","LassoCV"]
+            final_estimators={}
+            for name in kadt_estimators:
+                param_grid=param_grids.get(name, {})
+                print(param_grid)
+                if is_binary:
+                    gs = GridSearchCV(
+                        model_[name],
+                        param_grid=param_grid,
+                        scoring=(
+                            "roc_auc"
+                            if purpose == "classification"
+                            else "neg_mean_squared_error"
+                        ),
+                        cv=cv,
+                        n_jobs=n_jobs,
+                        verbose=verbose,
+                    )
+                else:
+                    gs = GridSearchCV(
+                        model_[name],
+                        param_grid=param_grid,
+                        scoring=(
+                            "roc_auc_ovr"
+                            if purpose == "classification"
+                            else "neg_mean_squared_error"
+                        ),
+                        cv=cv,
+                        n_jobs=n_jobs,
+                        verbose=verbose,
+                    )
+                # Fit GridSearchCV
+                gs.fit(x_train, y_train)
+                final_estimators[name]=gs.best_estimator_
+
+            #* Set up cross-validation and performance evaluation
+            scorer = make_scorer(accuracy_score)
+            cv_results = []
+
+            #*Cross-validate stacking models with different final estimators
+            for final_name, final_estimator in final_estimators.items():
+                print(f"Evaluating Stacking Classifier with {final_name} as final estimator...")
+                if purpose == "classification":
+                    stacking_model = StackingClassifier(estimators=base_estimators, final_estimator=final_estimator,cv=cv)
+                else:
+                    stacking_model = StackingRegressor(estimators=base_estimators, final_estimator=final_estimator, cv=cv)
+
+                scores = cross_val_score(stacking_model, x_train, y_train, cv=StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_state), scoring=scorer)
+
+                # Store the result
+                cv_results.append({
+                    'final_estimator':final_estimator,
+                    'Final Estimator': final_name,
+                    'Mean Accuracy': np.mean(scores),
+                    'Standard Deviation': np.std(scores)
+                })
+
+            #* Convert the results into a DataFrame for easy comparison
+            cv_results_df = pd.DataFrame(cv_results)
+
+            #* Sort and display the best model
+            cv_results_df = cv_results_df.sort_values(by='Mean Accuracy', ascending=False)
+
+
+            # Optionally: Select the final estimator that gives the best performance
+            best_final_estimator = cv_results_df.iloc[0]['final_estimator']
+            print(f"Best final estimator based on cross-validation: {best_final_estimator}")
+        else:
+            print(f" ⤵ trying to find the best_final_estimator for stacking...")
+            if purpose=="classification":
+                best_final_estimator = LogisticRegression(class_weight=class_weight,
+                                                          random_state=random_state,
+                                                          max_iter=1000)
+            else:
+                best_final_estimator = RidgeCV(cv=5)
+        print(f"⤵ the best best_final_estimator: {best_final_estimator}")
+        #! apply stacking
+        if purpose == "classification":
+            print(f" ⤵ StackingClassifier...")
+            stacking_model = StackingClassifier(estimators=base_estimators,
+                                                final_estimator=best_final_estimator,
+                                                cv=cv)
+        else:
+            print(f" ⤵ StackingRegressor...")
+            stacking_model = StackingRegressor(estimators=base_estimators,
+                                               final_estimator=best_final_estimator,
+                                               cv=cv)
+
+        # Train the Stacking Classifier
+        print(f" ⤵ fit & predict...")
+        stacking_model.fit(x_train, y_train)
+        y_pred_final = stacking_model.predict(x_true)
+        print(f" ⤵ collecting results...")
+        # pred_proba
+        if is_binary:
+            if hasattr(stacking_model, "predict_proba"):
+                y_pred_proba_final = stacking_model.predict_proba(x_true)
+                print("Shape of predicted probabilities:", y_pred_proba_final.shape)
+                if y_pred_proba_final.shape[1] == 1:
+                    y_pred_proba_final = np.hstack(
+                        [1 - y_pred_proba_final, y_pred_proba_final]
+                    )  # Add missing class probabilities
+                y_pred_proba_final = y_pred_proba_final[:, 1]
+            elif hasattr(stacking_model, "decision_function"):
+                # If predict_proba is not available, use decision_function (e.g., for SVM)
+                y_pred_proba_final = stacking_model.decision_function(x_true)
+                # Ensure y_pred_proba_final is within 0 and 1 bounds
+                y_pred_proba_final = (y_pred_proba_final - y_pred_proba_final.min()) / (
+                    y_pred_proba_final.max() - y_pred_proba_final.min()
+                )
+            else:
+                y_pred_proba_final = None  # No probability output for certain models
+        if not is_binary:
+            # Handle prediction probabilities for multiclass
+            if hasattr(stacking_model, "predict_proba"):
+                y_pred_proba_final = stacking_model.predict_proba(x_true)
+            elif hasattr(stacking_model, "decision_function"):
+                y_pred_proba_final = stacking_model.decision_function(x_true)
+
+                # Normalize for multiclass if necessary
+                if y_pred_proba_final.ndim == 2:
+                    y_pred_proba_final = (
+                        y_pred_proba_final - y_pred_proba_final.min(axis=1, keepdims=True)
+                    ) / (
+                        y_pred_proba_final.max(axis=1, keepdims=True)
+                        - y_pred_proba_final.min(axis=1, keepdims=True)
+                    )
+            else:
+                y_pred_proba_final = None  # No probability output for certain models
+        #! dict_pred_stack
+        dict_pred_stack={}
+        validation_scores_final = {}
+        if y_true is not None and y_pred_proba_final is not None:
+            validation_scores_final = cal_metrics(
+                y_true,
+                y_pred_final,
+                y_pred_proba=y_pred_proba_final,
+                is_binary=is_binary,
+                purpose=purpose,
+                average="weighted",
+            )
+            if is_binary:
+                # Calculate ROC curve
+                # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
+                if y_pred_proba_final is not None:
+                    fpr, tpr, _ = roc_curve(y_true, y_pred_proba_final)
+                    lower_ci, upper_ci = cal_auc_ci(
+                        y_true, y_pred_proba_final, verbose=False, is_binary=is_binary
+                    )
+                    roc_auc = auc(fpr, tpr)
+                    roc_info = {
+                        "fpr": fpr.tolist(),
+                        "tpr": tpr.tolist(),
+                        "auc": roc_auc,
+                        "ci95": (lower_ci, upper_ci),
+                    }
+                    # precision-recall curve
+                    precision_, recall_, _ = cal_precision_recall(y_true, y_pred_proba_final)
+                    avg_precision_ = average_precision_score(y_true, y_pred_proba_final)
+                    pr_info = {
+                        "precision": precision_,
+                        "recall": recall_,
+                        "avg_precision": avg_precision_,
+                    }
+                else:
+                    roc_info, pr_info = None, None
+                if purpose == "classification":
+                    dict_pred_stack = {
+                        "best_clf": stacking_model,
+                        "best_params": None,
+                        "auc_indiv": None,
+                        "scores": validation_scores_final,
+                        "roc_curve": roc_info,
+                        "pr_curve": pr_info,
+                        "confusion_matrix": confusion_matrix(y_true, y_pred_final),
+                        "predictions": y_pred_final.tolist(),
+                        "predictions_proba": (
+                            y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
+                        ),
+                    }
+                else:  # "regression"
+                    dict_pred_stack = {
+                        "best_clf": stacking_model,
+                        "best_params": None,
+                        "scores": validation_scores_final,  # e.g., neg_MSE, R², etc.
+                        "predictions": y_pred_final.tolist(),
+                        "predictions_proba": (
+                            y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
+                        ),
+                    }
+            else:  # multi-classes
+                if y_pred_proba_final is not None:
+                    # fpr, tpr, roc_auc = dict(), dict(), dict()
+                    # fpr, tpr, _ = roc_curve(y_true, y_pred_proba_final)
+                    confidence_intervals = cal_auc_ci(
+                        y_true, y_pred_proba_final, verbose=False, is_binary=is_binary
+                    )
+                    roc_info = {
+                        "fpr": validation_scores_final["fpr"],
+                        "tpr": validation_scores_final["tpr"],
+                        "auc": validation_scores_final["roc_auc_by_class"],
+                        "ci95": confidence_intervals,
+                    }
+                    # precision-recall curve
+                    precision_, recall_, avg_precision_ = cal_precision_recall(
+                        y_true, y_pred_proba_final, is_binary=is_binary
+                    )
+                    pr_info = {
+                        "precision": precision_,
+                        "recall": recall_,
+                        "avg_precision": avg_precision_,
+                    }
+                else:
+                    roc_info, pr_info = None, None
+
+                if purpose == "classification":
+                    dict_pred_stack = {
+                        "best_clf": stacking_model,
+                        "best_params": None,
+                        "auc_indiv": None,
+                        "scores": validation_scores_final,
+                        "roc_curve": roc_info,
+                        "pr_curve": pr_info,
+                        "confusion_matrix": confusion_matrix(y_true, y_pred_final),
+                        "predictions": y_pred_final.tolist(),
+                        "predictions_proba": (
+                            y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
+                        ),
+                    }
+                else:  # "regression"
+                    dict_pred_stack = {
+                        "best_clf": stacking_model,
+                        "best_params": None,
+                        "scores": validation_scores_final,  # e.g., neg_MSE, R², etc.
+                        "predictions": y_pred_final.tolist(),
+                        "predictions_proba": (
+                            y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
+                        ),
+                    }
+
+        else:
+            if y_true is None:
+                validation_scores_final = []
+            else:
+                validation_scores_final = cal_metrics(
+                    y_true,
+                    y_pred,
+                    y_pred_proba=y_pred_proba_final,
+                    is_binary=is_binary,
+                    purpose=purpose,
+                    average="weighted",
+                )
+            dict_pred_stack = {
+                "best_clf": stacking_model,
+                "best_params": None,
+                "scores": validation_scores_final,
+                "predictions": y_pred_final.tolist(),
+                "predictions_proba": (
+                    y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
+                ),
+                "y_train": y_train if y_train is not None else [],
+                "y_true": y_true if y_true is not None else [],
+            }
+        # merge together
+        df_pred = pd.DataFrame(
+            [None] * len(df_results.columns), index=df_results.columns, columns=["stack"]).T
+        for k, v in dict_pred_stack.items():
+            if k in df_pred.columns:
+                df_pred[k] = [v]
+
+        # # plot the stacking
+        # if all([plot_, y_true is not None, purpose == "classification"]):
+        #     plot_validate_features_single(df_pred, is_binary=is_binary)
+        #     if dir_save:
+        #         ips.figsave(dir_save + f"validate_features_stacking_{now_}.pdf")
+    if vote:
+        print(f" ⤵ voting...")
+        from sklearn.ensemble import VotingClassifier, VotingRegressor
+        #! Votting
+        n_top_models = min(n_top_models, df_results.shape[0])
+        base_estimators=[]
+        for name, cls in zip(list(df_results.iloc[:n_top_models, :].index),df_results.iloc[:n_top_models, :]["best_clf"].tolist()):
+            base_estimators.append((name,cls))
+        # Apply Voting Classifier/Regressor
+        if purpose == "classification":
+            print(f" ⤵ VotingClassifier...via{votting}")
+            if voting=='hard':
+                # Hard voting does not support `predict_proba`
+                voting_model = VotingClassifier(estimators=base_estimators)
+            else:
+                # Soft voting supports `predict_proba`
+                voting_model = VotingClassifier(estimators=base_estimators, voting="soft")
+        else:
+            print(f" ⤵ VotingRegressor...")
+            voting_model = VotingRegressor(estimators=base_estimators)
+
+        # Train the Voting Classifier/Regressor
+        try:
+            voting_model.fit(x_train, y_train)
+            y_pred_vote = voting_model.predict(x_true)
+        except Exception as e:
+            if purpose == "classification" and not voting=='hard':
+                voting_model = VotingClassifier(estimators=base_estimators)
+                voting_model.fit(x_train, y_train)
+                y_pred_vote = voting_model.predict(x_true)
+
+        # Calculate predicted probabilities if applicable
+        if purpose == "classification":
+            if hasattr(voting_model, "predict_proba"):
+                y_pred_proba_vote = voting_model.predict_proba(x_true)
+                print("Shape of predicted probabilities:", y_pred_proba_vote.shape)
+                if y_pred_proba_vote.shape[1] == 1:
+                    y_pred_proba_vote = np.hstack(
+                        [1 - y_pred_proba_vote, y_pred_proba_vote]
+                    )  # Add missing class probabilities
+                y_pred_proba_vote = y_pred_proba_vote[:, 1]
+            else:
+                y_pred_proba_vote = None
+        else:  # Regression
+            y_pred_proba_vote = None
+
+        print(f" ⤵ collecting voting results...")
+        #! dict_pred_vote
+        dict_pred_vote = {}
+        validation_scores_vote = {}
+        if y_true is not None and y_pred_proba_vote is not None:
+            validation_scores_vote = cal_metrics(
+                y_true,
+                y_pred_vote,
+                y_pred_proba=y_pred_proba_vote,
+                is_binary=is_binary,
+                purpose=purpose,
+                average="weighted",
+            )
+
+            if is_binary:
+                if y_pred_proba_vote is not None:
+                    fpr, tpr, _ = roc_curve(y_true, y_pred_proba_vote)
+                    lower_ci, upper_ci = cal_auc_ci(
+                        y_true, y_pred_proba_vote, verbose=False, is_binary=is_binary
+                    )
+                    roc_auc = auc(fpr, tpr)
+                    roc_info = {
+                        "fpr": fpr.tolist(),
+                        "tpr": tpr.tolist(),
+                        "auc": roc_auc,
+                        "ci95": (lower_ci, upper_ci),
+                    }
+                    precision_, recall_, _ = cal_precision_recall(y_true, y_pred_proba_vote)
+                    avg_precision_ = average_precision_score(y_true, y_pred_proba_vote)
+                    pr_info = {
+                        "precision": precision_,
+                        "recall": recall_,
+                        "avg_precision": avg_precision_,
+                    }
+                else:
+                    roc_info, pr_info = None, None
+
+                dict_pred_vote = {
+                    "best_clf": voting_model,
+                    "best_params": None,
+                    "auc_indiv": None,
+                    "scores": validation_scores_vote,
+                    "roc_curve": roc_info,
+                    "pr_curve": pr_info,
+                    "confusion_matrix": confusion_matrix(y_true, y_pred_vote),
+                    "predictions": y_pred_vote.tolist(),
+                    "predictions_proba": (
+                        y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
+                    ),
+                }
+            else:  # Multi-class
+                if y_pred_proba_vote is not None:
+                    confidence_intervals = cal_auc_ci(
+                        y_true, y_pred_proba_vote, verbose=False, is_binary=is_binary
+                    )
+                    roc_info = {
+                        "fpr": validation_scores_vote["fpr"],
+                        "tpr": validation_scores_vote["tpr"],
+                        "auc": validation_scores_vote["roc_auc_by_class"],
+                        "ci95": confidence_intervals,
+                    }
+                    precision_, recall_, avg_precision_ = cal_precision_recall(
+                        y_true, y_pred_proba_vote, is_binary=is_binary
+                    )
+                    pr_info = {
+                        "precision": precision_,
+                        "recall": recall_,
+                        "avg_precision": avg_precision_,
+                    }
+                else:
+                    roc_info, pr_info = None, None
+
+                dict_pred_vote = {
+                    "best_clf": voting_model,
+                    "best_params": None,
+                    "scores": validation_scores_vote,
+                    "roc_curve": roc_info,
+                    "pr_curve": pr_info,
+                    "confusion_matrix": confusion_matrix(y_true, y_pred_vote),
+                    "predictions": y_pred_vote.tolist(),
+                    "predictions_proba": (
+                        y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
+                    ),
+                }
+        else:
+            if y_true is None:
+                validation_scores_vote = []
+            else:
+                validation_scores_vote = cal_metrics(
+                    y_true,
+                    y_pred,
+                    y_pred_proba=y_pred_proba_vote,
+                    is_binary=is_binary,
+                    purpose=purpose,
+                    average="weighted",
+                )
+            dict_pred_vote = {
+                "best_clf": voting_model,
+                "best_params": None,
+                "scores": validation_scores_vote,
+                "predictions": y_pred_vote.tolist(),
+                "predictions_proba": (
+                    y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
+                ),
+                "y_train": y_train if y_train is not None else [],
+                "y_true": y_true if y_true is not None else [],
+            }
+        df_vote = pd.DataFrame(
+            [None] * len(df_results.columns), index=df_results.columns, columns=["vote"]).T
+        for k, v in dict_pred_vote.items():
+            if k in df_vote.columns:
+                df_vote[k] = [v]
+
+        # if all([plot_, y_true is not None, purpose == "classification"]):
+        #     try:
+        #         plot_validate_features_single(df_vote, is_binary=is_binary)
+        #         if dir_save:
+        #             ips.figsave(dir_save + f"validate_features_vote_{now_}.pdf")
+        #     except Exception as e:
+        #         print(e)
+    print("Done")
+    if vote and stack:
+        df_res=pd.concat([df_pred,df_vote, df_results],ignore_index=False,axis=0)
+    elif vote:
+        df_res=pd.concat([df_vote, df_results],ignore_index=False,axis=0)
+    elif stack:
+        df_res=pd.concat([df_pred,df_results],ignore_index=False,axis=0)
+
     if all([plot_, y_true is not None, purpose == "classification"]):
+        from datetime import datetime
+
+        now_ = datetime.now().strftime("%y%m%d_%H%M%S")
         # try:
-        if
-        plot_validate_features(
+        if df_res.shape[0] > 3:
+            plot_validate_features(df_res, is_binary=is_binary)
         else:
-        plot_validate_features_single(
+            plot_validate_features_single(df_res, is_binary=is_binary)
         if dir_save:
             ips.figsave(dir_save + f"validate_features{now_}.pdf")
-
-
-        return
-
+        # except Exception as e:
+        #     print(f"Error: 在画图的过程中出现了问题:{e}")
+    return df_res
 
 def cal_metrics(
     y_true,
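The large block added above follows the standard scikit-learn stacking and voting pattern. A self-contained minimal sketch of that pattern, with invented base estimators (illustrative only, not the package's exact configuration):

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

X, y = make_classification(n_samples=300, random_state=1)
base = [("rf", RandomForestClassifier(random_state=1)),
        ("svm", SVC(probability=True, random_state=1))]

# Stacking: base models feed a final estimator fitted on out-of-fold predictions
stack = StackingClassifier(estimators=base, final_estimator=LogisticRegression(max_iter=1000), cv=5)
# Voting: base models are combined directly, here by averaging predicted probabilities
vote = VotingClassifier(estimators=base, voting="soft")
for model in (stack, vote):
    model.fit(X, y)
    print(type(model).__name__, model.predict(X[:5]))
```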
@@ -3367,7 +4056,7 @@ def cal_metrics(
 
 
 def plot_trees(
-    X, y, cls, max_trees=500, test_size=0.2, random_state=42, early_stopping_rounds=None
+    X, y, cls:str='random', max_trees=500, test_size=0.2, random_state=42, early_stopping_rounds=None
 ):
     """
     # # Example usage:
@@ -3413,10 +4102,14 @@ def plot_trees(
     train_error_rate = []
     test_error_rate = []
     validation_error = None
-
+    if isinstance(cls, str):
+        cls=ips.strcmp(cls, ["RandomForestClassifier","ExtraTreesClassifier","AdaBoostClassifier","GradientBoostingClassifier"])
     # Configure classifier based on type
     oob_enabled = False  # Default to no OOB error unless explicitly set
-
+    clf_support = {"RandomForestClassifier":RandomForestClassifier(),
+                   "ExtraTreesClassifier":ExtraTreesClassifier(),
+                   "AdaBoostClassifier":AdaBoostClassifier(),
+                   "GradientBoostingClassifier":GradientBoostingClassifier()}
     if isinstance(cls, (RandomForestClassifier, ExtraTreesClassifier)):
         # Enable OOB if cls supports it and is using bootstrapping
         cls.set_params(warm_start=True, n_estimators=1)
@@ -3678,7 +4371,7 @@ def img_datasets_preprocessing(
 
 
 def backward_regression(
-    X: pd.DataFrame, y: pd.Series, initial_list=[],
+    X: pd.DataFrame, y: pd.Series, initial_list=[], thr=0.05, verbose=True
 ):
     """
     # awesome bit of code from https://www.kaggle.com/code/adibouayjan/house-price-step-by-step-modeling
@@ -3690,31 +4383,46 @@ def backward_regression(
     X -- features values
     y -- target variable
     initial_list -- features header
-
+    thr -- pvalue threshold of features to drop
     verbose -- true to produce lots of logging output
 
     Returns:
     list of selected features for modeling
     """
     import statsmodels.api as sm
-
-
-
-
-
+    if isinstance(y, str):
+        if y in X.columns:
+            y_col_name = y
+            y = X[y]
+            X = X.drop(y_col_name, axis=1)
+        else:
+            raise ValueError(f"找不到{y},y设置有误")
+    X = X.select_dtypes(include=[np.number])
+
     included = list(X.columns)
+    try:
+        X=X.astype(float)
+        y=y.astype(float)
+    except Exception as e:
+        raise ValueError(f"无法把数据类型转换成float类型,因而无法进一步进行统计分析: {e}")
+
+
     while True:
         changed = False
+        if not included:
+            print("No features remain in the model.")
+            break
+
         model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
         # exclude the intercept for p-value checking
         pvalues = model.pvalues.iloc[1:]
         worst_pval = pvalues.max()
-        if worst_pval >
+        if worst_pval > thr:
             changed = True
             worst_feature = pvalues.idxmax()
             included.remove(worst_feature)
             if verbose:
-                print(f"Removing
+                print(f"Removing '{worst_feature}' with p-value={round(worst_pval,2)}")
         if not changed:
             break
     print(f"\nSelected Features:\n{included}")