py2ls 0.2.4.22__py3-none-any.whl → 0.2.4.24__py3-none-any.whl
- py2ls/.DS_Store +0 -0
- py2ls/.git/.DS_Store +0 -0
- py2ls/.git/index +0 -0
- py2ls/.git/objects/.DS_Store +0 -0
- py2ls/.git/refs/.DS_Store +0 -0
- py2ls/data/.DS_Store +0 -0
- py2ls/data/styles/.DS_Store +0 -0
- py2ls/ips.py +213 -195
- py2ls/ml2ls.py +774 -66
- {py2ls-0.2.4.22.dist-info → py2ls-0.2.4.24.dist-info}/METADATA +1 -1
- {py2ls-0.2.4.22.dist-info → py2ls-0.2.4.24.dist-info}/RECORD +12 -9
- {py2ls-0.2.4.22.dist-info → py2ls-0.2.4.24.dist-info}/WHEEL +0 -0
py2ls/ml2ls.py
CHANGED
@@ -5,7 +5,6 @@ from sklearn.ensemble import (
     BaggingClassifier,
 )
 from sklearn.svm import SVC, SVR
-from sklearn.calibration import CalibratedClassifierCV
 from sklearn.model_selection import GridSearchCV, StratifiedKFold
 from sklearn.linear_model import (
     LassoCV,
@@ -16,12 +15,7 @@ from sklearn.linear_model import (
     RidgeClassifierCV,
     ElasticNet,
 )
-
-from sklearn.naive_bayes import GaussianNB
-from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
-import xgboost as xgb  # Make sure you have xgboost installed
-
-from sklearn.model_selection import train_test_split, cross_val_score
+
 from sklearn.metrics import (
     accuracy_score,
     precision_score,
@@ -36,18 +30,12 @@ from sklearn.metrics import (
     precision_recall_curve,
     average_precision_score,
 )
-from imblearn.over_sampling import SMOTE
-from sklearn.pipeline import Pipeline
-from collections import defaultdict
-from sklearn.preprocessing import StandardScaler, OneHotEncoder
 from typing import Dict, Any, Optional, List, Union
 import numpy as np
 import pandas as pd
 from . import ips
 from . import plot
 import matplotlib.pyplot as plt
-import seaborn as sns
-
 plt.style.use(str(ips.get_cwd()) + "/data/styles/stylelib/paper.mplstyle")
 import logging
 import warnings
@@ -314,6 +302,8 @@ def features_svm(
     - Use case: It’s not as widely used as the RBF or linear kernel but can be explored when there is some evidence of non-linear
       S-shaped relationships.
     """
+    from sklearn.feature_selection import RFE
+    from sklearn.svm import SVC
     # SVM (Support Vector Machines)
     svc = SVC(kernel=rfe_params["kernel"])  # ["linear", "rbf", "poly", "sigmoid"]
     # RFE(Recursive Feature Elimination)
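
Note: RFE ranks features using the fitted estimator's coef_ or feature_importances_, which SVC only exposes for the linear kernel, so the non-linear kernels listed in the comment would need a different selector. A minimal sketch of the combination on synthetic data (all values below are illustrative, not from the package):

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.feature_selection import RFE
    from sklearn.svm import SVC

    X, y = make_classification(n_samples=200, n_features=10, n_informative=4, random_state=0)
    svc = SVC(kernel="linear")  # RFE needs coef_, hence a linear kernel
    rfe = RFE(estimator=svc, n_features_to_select=4)
    rfe.fit(X, y)
    print(rfe.support_)  # boolean mask over the 10 input features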
@@ -450,6 +440,7 @@ def validate_classifier(
     Returns:
     - results: Dictionary containing average cv_train_scores and cv_test_scores.
     """
+    from sklearn.model_selection import cross_val_score
     cv_train_scores = {metric: [] for metric in metrics}
     skf = StratifiedKFold(n_splits=cv_folds)
     # Perform cross-validation
@@ -982,6 +973,8 @@ def validate_features(
 
     """
     from tqdm import tqdm
+    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
+    from sklearn.calibration import CalibratedClassifierCV
 
     # Ensure common features are selected
     common_features = ips.shared(
@@ -1001,6 +994,7 @@ def validate_features(
 
     # Handle class imbalance using SMOTE
     if smote:
+        from imblearn.over_sampling import SMOTE
         if (
             y_train.value_counts(normalize=True).max() < 0.8
         ):  # Threshold to decide if data is imbalanced
@@ -2096,7 +2090,116 @@ def rank_models(
     # )
 
     # figsave("classifier_performance.pdf")
+def rank_models_reg(df, ascending=False):
+    """
+    Sorts models based on MSE, RMSE, MAE, and R² with custom priority logic.
 
+    Parameters:
+        df (pd.DataFrame): DataFrame containing the regression metrics.
+        ascending (bool): Whether to sort in ascending order of ranking score.
+
+    Returns:
+        pd.DataFrame: Sorted DataFrame with an added "Ranking_Score" column.
+    """
+    # Define weights for the 4 metrics
+    weights = {
+        "mse": -1,  # Lower is better
+        "rmse": -1,  # Lower is better
+        "mae": -1,  # Lower is better
+        "r2": 1,  # Higher is better
+    }
+
+    # Normalize the selected metrics
+    df = df.copy()  # Work on a copy of the DataFrame
+    for metric, weight in weights.items():
+        if metric in df.columns:
+            if weight > 0:  # Higher is better; normalize 0-1
+                df[metric + "_normalized"] = (df[metric] - df[metric].min()) / (
+                    df[metric].max() - df[metric].min()
+                )
+            else:  # Lower is better; reverse normalize 0-1
+                df[metric + "_normalized"] = (df[metric].max() - df[metric]) / (
+                    df[metric].max() - df[metric].min()
+                )
+
+    # Calculate ranking score as a weighted sum
+    df["Ranking_Score"] = sum(
+        df[metric + "_normalized"] * abs(weights[metric])
+        for metric in weights.keys()
+        if metric + "_normalized" in df.columns
+    )
+
+    # Sort models based on the ranking score
+    sorted_df = df.sort_values(by="Ranking_Score", ascending=ascending)
+    return sorted_df
+
+models_support = {
+    "classification": {
+        "Random Forest": "Tree-Based",
+        "SVM": "Kernel-Based",
+        "Logistic Regression": "Linear",
+        "Lasso Logistic Regression": "Linear",
+        "Gradient Boosting": "Tree-Based",
+        "XGBoost": "Tree-Based",
+        "KNN": "Instance-Based",
+        "Naive Bayes": "Probabilistic",
+        "Linear Discriminant Analysis": "Linear",
+        "AdaBoost": "Tree-Based",
+        "CatBoost": "Tree-Based",
+        "Extra Trees": "Tree-Based",
+        "Bagging": "Tree-Based",
+        "Neural Network": "Neural Network",
+        "DecisionTree": "Tree-Based",
+        "Quadratic Discriminant Analysis": "Probabilistic",
+        "Ridge": "Linear",
+        "Perceptron": "Linear",
+        "Bernoulli Naive Bayes": "Probabilistic",
+        "SGDClassifier": "Linear",
+    },
+    "regression": {
+        "Linear Regression": "Linear",
+        "Ridge": "Linear",
+        "RidgeCV": "Linear",
+        "TheilSenRegressor": "Linear",
+        "HuberRegressor": "Linear",
+        "PoissonRegressor": "Linear",
+        "LassoCV": "Linear",
+        "Bagging": "Tree-Based",
+        "ElasticNet": "Linear",
+        "Random Forest": "Tree-Based",
+        "Gradient Boosting": "Tree-Based",
+        "XGBoost": "Tree-Based",
+        "CatBoost": "Tree-Based",
+        "Extra Trees": "Tree-Based",
+        "SVM": "Kernel-Based",
+        "KNN": "Instance-Based",
+        "Neural Network": "Neural Network",
+        "AdaBoost": "Linear",
+    },
+}
+def select_top_models(models, categories, n_top_models, n_models_per_category=1):
+    """
+    models = list_sort
+    purpose = "regression"
+    categories = models_support[purpose]
+    n_top_models = 3
+    select_top_models(models, categories, n_top_models)
+    """
+    selected = {}
+    result = []
+    for model in models:
+        category = categories.get(model, "Unknown")
+        if category not in selected:
+            selected[category] = 0  # Initialize counter for the category
+
+        if selected[category] < n_models_per_category:  # Allow additional models up to the limit
+            selected[category] += 1
+            result.append(model)
+
+        if len(result) == n_top_models:  # Stop when the desired number of models is reached
+            break
+
+    return result
 
 def predict(
     x_train: pd.DataFrame,
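
A quick sanity check of the two helpers introduced above, using hand-made metric values (toy numbers, purely illustrative):

    import pandas as pd

    df = pd.DataFrame(
        {"mse": [4.0, 1.0], "rmse": [2.0, 1.0], "mae": [1.5, 0.8], "r2": [0.6, 0.9]},
        index=["Linear Regression", "Random Forest"],
    )
    ranked = rank_models_reg(df, ascending=False)
    print(ranked["Ranking_Score"].tolist())  # [4.0, 0.0]: Random Forest wins every metric

    top = select_top_models(
        models=list(ranked.index),
        categories=models_support["regression"],
        n_top_models=2,
        n_models_per_category=1,
    )
    print(top)  # ['Random Forest', 'Linear Regression']: one Tree-Based, one Linear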
@@ -2104,11 +2207,17 @@ def predict(
     x_true: pd.DataFrame = None,
     y_true: Optional[pd.Series] = None,
     backward: bool = False,  # backward_regression
+    backward_thr: float = 0.05,  # p-value threshold; only works when backward is True
     common_features: set = None,
     purpose: str = "classification",  # 'classification' or 'regression'
     cls: Optional[Dict[str, Any]] = None,
     metrics: Optional[List[str]] = None,
-
+    stack: bool = True,  # run stacking
+    stacking_cv: bool = False,  # stacking cross_validate; default (False) keeps it simple
+    vote: bool = True,  # run voting
+    voting: str = "hard",  # only for the classification purpose of voting
+    n_top_models: int = 5,  # for stacking models
+    n_models_per_category: int = 1,  # for stacking models; allows multiple models from the same category
     smote: bool = False,
     n_jobs: int = -1,
     plot_: bool = True,
@@ -2117,6 +2226,7 @@ def predict(
     cv_folds: int = 5,  # more cv_folds is more stable, but AUC may be lower
     cv_level: str = "l",  # "s":'low',"m":'medium',"l":"high"
     class_weight: str = "balanced",
+    random_state: int = 1,
     verbose: bool = False,
 ) -> pd.DataFrame:
     """
@@ -2184,10 +2294,17 @@ def predict(
         RidgeClassifierCV,
         Perceptron,
         SGDClassifier,
+        RidgeCV,
+        Ridge,
+        TheilSenRegressor,
+        HuberRegressor,
+        PoissonRegressor,
+
     )
+    from sklearn.compose import TransformedTargetRegressor
     from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
     from sklearn.naive_bayes import GaussianNB, BernoulliNB
-    from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
+    from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor, StackingClassifier, StackingRegressor
     import xgboost as xgb
     import lightgbm as lgb
     import catboost as cb
@@ -2198,6 +2315,7 @@ def predict(
         QuadraticDiscriminantAnalysis,
     )
     from sklearn.preprocessing import PolynomialFeatures
+    from sklearn.model_selection import train_test_split
 
     # spell check
     purpose = ips.strcmp(purpose, ["classification", "regression"])[0]
@@ -2261,7 +2379,6 @@ def predict(
             "Gradient Boosting": GradientBoostingRegressor(random_state=random_state),
             "XGBoost": xgb.XGBRegressor(eval_metric="rmse", random_state=random_state),
             "Linear Regression": LinearRegression(),
-            "Lasso": Lasso(random_state=random_state),
             "AdaBoost": AdaBoostRegressor(random_state=random_state),
             # "LightGBM": lgb.LGBMRegressor(random_state=random_state),
             "CatBoost": cb.CatBoostRegressor(verbose=0, random_state=random_state),
@@ -2271,10 +2388,10 @@ def predict(
             "ElasticNet": ElasticNet(random_state=random_state),
             "Ridge": Ridge(),
             "KNN": KNeighborsRegressor(),
+            "TheilSen": TheilSenRegressor(),
+            "Huber": HuberRegressor(),
+            "Poisson": PoissonRegressor()
         }
-    # indicate cls:
-    if ips.run_once_within(30):  # 10 min
-        print(f"supported models: {list(model_.keys())}")
     if cls is None:
         models = model_
     else:
@@ -2290,6 +2407,10 @@ def predict(
         ips.df_special_characters_cleaner(x_true) if x_true is not None else None
     )
 
+    # indicate cls:
+    if ips.run_once_within(30):  # 10 min
+        print(f"processing: {list(models.keys())}")
+
     if isinstance(y_train, str) and y_train in x_train.columns:
         y_train_col_name = y_train
         y_train = x_train[y_train]
@@ -2311,7 +2432,7 @@ def predict(
 
     # Perform backward feature selection
     if backward:
-        selected_features = backward_regression(x_train, y_train,
+        selected_features = backward_regression(x_train, y_train, thr=backward_thr)
         x_train = x_train[selected_features]
 
     if x_true is None:
@@ -2391,10 +2512,22 @@ def predict(
     if isinstance(y_train, np.ndarray):
         y_train = ips.df_encoder(data=pd.DataFrame(y_train), method="label")
         y_train = np.asarray(y_train)
-    if
-
-
+    if y_true is not None:
+        if isinstance(y_train, np.ndarray):
+            y_true = ips.df_encoder(data=pd.DataFrame(y_true), method="label")
+            y_true = np.asarray(y_true)
     # Hyperparameter grids for tuning
+    param_grid_common_xgb = {
+        'learning_rate': [0.01, 0.1, 0.2, 0.3],
+        'max_depth': [3, 5, 7, 10],
+        'n_estimators': [50, 100, 200, 300],
+        'subsample': [0.6, 0.8, 1.0],
+        'colsample_bytree': [0.6, 0.8, 1.0],
+        'gamma': [0, 0.1, 0.2, 0.5],
+        'min_child_weight': [1, 5, 10],
+        'reg_alpha': [0, 0.1, 0.5, 1],  # L1 regularization term
+        'reg_lambda': [1, 1.5, 2],  # L2 regularization term
+    }
     if cv_level in ["low", "simple", "s", "l"]:
         param_grids = {
             "Random Forest": (
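
For scale, the param_grid_common_xgb defined above spans 4·4·4·3·3·4·3·4·3 = 82,944 parameter combinations (each fitted cv_folds times under GridSearchCV), which is why the cv_level presets below prune the XGBoost grid so aggressively. A one-line check (illustrative):

    from math import prod
    print(prod(len(v) for v in param_grid_common_xgb.values()))  # 82944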
@@ -2416,8 +2549,8 @@ def predict(
                 }
             ),
             "SVM": {
-                "C": [1],
-                "gamma": ["scale"],
+                "C": [0.1, 1, 10],
+                "gamma": ["scale", 0.1, 1],
                 "kernel": ["rbf"],
             },
             "Lasso": {
@@ -2439,12 +2572,17 @@ def predict(
                 "min_samples_split": [2],
                 "subsample": [0.8],
             },
-            "XGBoost":
-
-
-
-
+            "XGBoost": {
+                'learning_rate': [0.01],
+                'max_depth': [3],
+                'n_estimators': [50],
+                'subsample': [0.6],
+                'colsample_bytree': [0.6],
+                'gamma': [0, 0.1],
+                'min_child_weight': [1],
+                'reg_alpha': [0, 0.1],
+                'reg_lambda': [1],
+                'objective': ['binary:logistic'] if purpose == "classification" else ['reg:squarederror']
             },
             "KNN": (
                 {
@@ -2551,6 +2689,14 @@ def predict(
                 "random_state": [random_state],
                 "learning_rate": ["constant"],
             },
+            "TheilSen": {'max_iter': [100],
+                         'tol': [1e-4],
+                         'n_subsamples': [100 + x_train.shape[1]]},
+            "Huber": {'epsilon': [1.35],
+                      'alpha': [0.1],
+                      'max_iter': [100],},
+            "Poisson": {'alpha': [0.1],
+                        'max_iter': [100],}
         }
     elif cv_level in ["high", "advanced", "h"]:
         param_grids = {
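
The 100 + x_train.shape[1] value in the TheilSen grids is presumably chosen because TheilSenRegressor constrains n_subsamples to lie between roughly n_features + 1 (with an intercept) and n_samples, so offsetting by the feature count keeps the setting valid for any input width. A sanity-check sketch (an assumption, not from the package):

    n_features = x_train.shape[1]
    n_subsamples = 100 + n_features
    # must hold for TheilSenRegressor to accept the value
    assert n_features + 1 <= n_subsamples <= x_train.shape[0]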
@@ -2612,12 +2758,30 @@ def predict(
                 "subsample": [0.8, 1.0],
             },
             "XGBoost": {
-
-
-
-
-
-
+                'learning_rate': [0.01, 0.1, 0.2, 0.3],
+                'max_depth': [3, 5, 7, 10],
+                'n_estimators': [50, 100, 200, 300],
+                'subsample': [0.6, 0.8, 1.0],
+                'gamma': [0, 0.1, 0.2, 0.5],
+                'min_child_weight': [1, 5, 10],
+                'reg_alpha': [0, 0.1, 0.5, 1],
+                'reg_lambda': [1, 1.5, 2],
+                **{
+                    'objective': ['binary:logistic', 'multi:softmax', 'multi:softprob'],
+                }} if purpose == "classification"
+            else {
+                'learning_rate': [0.01, 0.1, 0.2, 0.3],
+                'max_depth': [3, 5, 7, 10],
+                'n_estimators': [50, 100, 200, 300],
+                'subsample': [0.6, 0.8, 1.0],
+                'colsample_bytree': [0.6, 0.8, 1.0],
+                'gamma': [0, 0.1, 0.2, 0.5],
+                'min_child_weight': [1, 5, 10],
+                'reg_alpha': [0, 0.1, 0.5, 1],
+                'reg_lambda': [1, 1.5, 2],
+                **{
+                    'objective': ['reg:squarederror', 'reg:squaredlogerror', 'reg:gamma'],
+                }},
             "KNN": (
                 {
                     "n_neighbors": [1, 3, 5, 10, 15, 20],
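
Note the shape of the rewritten XGBoost entry: its value is a single conditional expression, {...} if purpose == "classification" else {...}, so only one of the two grids is ever constructed when param_grids is built. The pattern reduced to a skeleton (illustrative keys only):

    purpose = "classification"
    param_grids = {
        "XGBoost": {"objective": ["binary:logistic"]}
        if purpose == "classification"
        else {"objective": ["reg:squarederror"]},
    }
    print(param_grids["XGBoost"])  # {'objective': ['binary:logistic']}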
@@ -2730,6 +2894,14 @@ def predict(
                 ],  # If True, the regressors X will be normalized
                 }
             ),
+            "TheilSen": {'max_iter': [100, 200, 300],
+                         'tol': [1e-4, 1e-3, 1e-2],
+                         'n_subsamples': [100 + x_train.shape[1], 200 + x_train.shape[1], 300 + x_train.shape[1]]},
+            "Huber": {'epsilon': [1.35, 1.5, 2.0],
+                      'alpha': [0.1, 1.0, 10.0],
+                      'max_iter': [100, 200, 300],},
+            "Poisson": {'alpha': [0.1, 1.0, 10.0],
+                        'max_iter': [100, 200, 300],}
         }
     else:  # median level
         param_grids = {
@@ -2789,12 +2961,30 @@ def predict(
                 "subsample": [0.8, 1.0],
             },
             "XGBoost": {
-
-
-
-
-
-
+                'learning_rate': [0.01, 0.1],
+                'max_depth': [3, 5],
+                'n_estimators': [50, 100],
+                'subsample': [0.6, 0.8],
+                'gamma': [0, 0.1],
+                'min_child_weight': [1, 5],
+                'reg_alpha': [0, 0.1],
+                'reg_lambda': [1,],
+                **{
+                    'objective': ['binary:logistic', 'multi:softmax'],
+                }} if purpose == "classification"
+            else {
+                'learning_rate': [0.01, 0.1],
+                'max_depth': [3, 5,],
+                'n_estimators': [50, 100],
+                'subsample': [0.6, 0.8],
+                'colsample_bytree': [0.6, 0.8],
+                'gamma': [0, 0.1],
+                'min_child_weight': [1, 5],
+                'reg_alpha': [0, 0.1],
+                'reg_lambda': [1, 1.5],
+                **{
+                    'objective': ['reg:squarederror', 'reg:squaredlogerror'],
+                }},
             "KNN": (
                 {
                     "n_neighbors": [3, 5, 7, 10],
@@ -2951,6 +3141,14 @@ def predict(
                 ],  # Solver for optimization
                 }
             ),
+            "TheilSen": {'max_iter': [100, 200],
+                         'tol': [1e-4, 1e-3],
+                         'n_subsamples': [100 + x_train.shape[1], 200 + x_train.shape[1]]},
+            "Huber": {'epsilon': [1.35, 1.5],
+                      'alpha': [0.1, 1.0],
+                      'max_iter': [100, 200],},
+            "Poisson": {'alpha': [0.1, 1.0],
+                        'max_iter': [100, 200],}
         }
 
     results = {}
@@ -3191,12 +3389,18 @@ def predict(
     # Convert results to DataFrame
     df_results = pd.DataFrame.from_dict(results, orient="index")
     # sort
-    if y_true is not None
-
-
-
+    if y_true is not None:
+        if purpose == "classification":
+            df_scores = pd.DataFrame(
+                df_results["scores"].tolist(), index=df_results["scores"].index
+            ).sort_values(by="roc_auc", ascending=False)
+        elif purpose == 'regression':
+            df_scores = rank_models_reg(
+                pd.DataFrame(df_results["scores"].tolist(), index=df_results["scores"].index),
+                ascending=False)
         df_results = df_results.loc[df_scores.index]
 
+    if y_true is not None and purpose == "classification":
         if plot_:
             from datetime import datetime
 
@@ -3214,18 +3418,503 @@ def predict(
             plot.figsets(xangle=30)
             if dir_save:
                 ips.figsave(dir_save + f"scores_clus{now_}.pdf")
+    # if all([plot_, y_true is not None, purpose == "classification"]):
+    #     # try:
+    #     if len(models) > 3:
+    #         plot_validate_features(df_results, is_binary=is_binary)
+    #     else:
+    #         plot_validate_features_single(df_results, is_binary=is_binary)
+    #     if dir_save:
+    #         ips.figsave(dir_save + f"validate_features{now_}.pdf")
+    #     # except Exception as e:
+    #     #     print(f"Error: a problem occurred while plotting: {e}")
+    if stack:
+        #! stacking classifier/regressor
+        from sklearn.metrics import make_scorer, accuracy_score
+        from sklearn.model_selection import cross_val_score
+
+        #* keep n_top_models within the number of available models
+        n_top_models = min(n_top_models, df_results.shape[0])
+
+        #* pick the top-n ranked estimators
+        models_selecte = select_top_models(models=list(df_results.index),
+                                           categories=models_support[purpose],
+                                           n_top_models=n_top_models,
+                                           n_models_per_category=n_models_per_category)
+        top_models = df_results.loc[models_selecte]["best_clf"]
+        base_estimators = []
+        for i, j in top_models.to_dict().items():
+            base_estimators.append((i, j))
+        if stacking_cv:
+            print(f" ⤵ stacking_cv is processing...")
+            #* define a few representative final_estimators
+            # candidate options
+            if purpose == "classification":
+                kadt_estimators = ["XGBoost", "SVM", "Logistic Regression", "Neural Network"]
+            else:
+                kadt_estimators = ["XGBoost", "LassoCV"]
+            final_estimators = {}
+            for name in kadt_estimators:
+                param_grid = param_grids.get(name, {})
+                print(param_grid)
+                if is_binary:
+                    gs = GridSearchCV(
+                        model_[name],
+                        param_grid=param_grid,
+                        scoring=(
+                            "roc_auc"
+                            if purpose == "classification"
+                            else "neg_mean_squared_error"
+                        ),
+                        cv=cv,
+                        n_jobs=n_jobs,
+                        verbose=verbose,
+                    )
+                else:
+                    gs = GridSearchCV(
+                        model_[name],
+                        param_grid=param_grid,
+                        scoring=(
+                            "roc_auc_ovr"
+                            if purpose == "classification"
+                            else "neg_mean_squared_error"
+                        ),
+                        cv=cv,
+                        n_jobs=n_jobs,
+                        verbose=verbose,
+                    )
+                # Fit GridSearchCV
+                gs.fit(x_train, y_train)
+                final_estimators[name] = gs.best_estimator_
+
+            #* Set up cross-validation and performance evaluation
+            scorer = make_scorer(accuracy_score)
+            cv_results = []
+
+            #* Cross-validate stacking models with different final estimators
+            for final_name, final_estimator in final_estimators.items():
+                print(f"Evaluating Stacking Classifier with {final_name} as final estimator...")
+                if purpose == "classification":
+                    stacking_model = StackingClassifier(estimators=base_estimators, final_estimator=final_estimator, cv=cv)
+                else:
+                    stacking_model = StackingRegressor(estimators=base_estimators, final_estimator=final_estimator, cv=cv)
+
+                scores = cross_val_score(stacking_model, x_train, y_train, cv=StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_state), scoring=scorer)
+
+                # Store the result
+                cv_results.append({
+                    'final_estimator': final_estimator,
+                    'Final Estimator': final_name,
+                    'Mean Accuracy': np.mean(scores),
+                    'Standard Deviation': np.std(scores)
+                })
+
+            #* Convert the results into a DataFrame for easy comparison
+            cv_results_df = pd.DataFrame(cv_results)
+
+            #* Sort and display the best model
+            cv_results_df = cv_results_df.sort_values(by='Mean Accuracy', ascending=False)
+
+
+            # Optionally: Select the final estimator that gives the best performance
+            best_final_estimator = cv_results_df.iloc[0]['final_estimator']
+            print(f"Best final estimator based on cross-validation: {best_final_estimator}")
+        else:
+            print(f" ⤵ trying to find the best_final_estimator for stacking...")
+            if purpose == "classification":
+                best_final_estimator = LogisticRegression(class_weight=class_weight,
+                                                          random_state=random_state,
+                                                          max_iter=1000)
+            else:
+                best_final_estimator = RidgeCV(cv=5)
+        print(f"⤵ the best best_final_estimator: {best_final_estimator}")
+        #! apply stacking
+        if purpose == "classification":
+            print(f" ⤵ StackingClassifier...")
+            stacking_model = StackingClassifier(estimators=base_estimators,
+                                                final_estimator=best_final_estimator,
+                                                cv=cv)
+        else:
+            print(f" ⤵ StackingRegressor...")
+            stacking_model = StackingRegressor(estimators=base_estimators,
+                                               final_estimator=best_final_estimator,
+                                               cv=cv)
+
+        # Train the Stacking Classifier
+        print(f" ⤵ fit & predict...")
+        stacking_model.fit(x_train, y_train)
+        y_pred_final = stacking_model.predict(x_true)
+        print(f" ⤵ collecting results...")
+        # pred_proba
+        if is_binary:
+            if hasattr(stacking_model, "predict_proba"):
+                y_pred_proba_final = stacking_model.predict_proba(x_true)
+                print("Shape of predicted probabilities:", y_pred_proba_final.shape)
+                if y_pred_proba_final.shape[1] == 1:
+                    y_pred_proba_final = np.hstack(
+                        [1 - y_pred_proba_final, y_pred_proba_final]
+                    )  # Add missing class probabilities
+                y_pred_proba_final = y_pred_proba_final[:, 1]
+            elif hasattr(stacking_model, "decision_function"):
+                # If predict_proba is not available, use decision_function (e.g., for SVM)
+                y_pred_proba_final = stacking_model.decision_function(x_true)
+                # Ensure y_pred_proba_final is within 0 and 1 bounds
+                y_pred_proba_final = (y_pred_proba_final - y_pred_proba_final.min()) / (
+                    y_pred_proba_final.max() - y_pred_proba_final.min()
+                )
+            else:
+                y_pred_proba_final = None  # No probability output for certain models
+        if not is_binary:
+            # Handle prediction probabilities for multiclass
+            if hasattr(stacking_model, "predict_proba"):
+                y_pred_proba_final = stacking_model.predict_proba(x_true)
+            elif hasattr(stacking_model, "decision_function"):
+                y_pred_proba_final = stacking_model.decision_function(x_true)
+
+                # Normalize for multiclass if necessary
+                if y_pred_proba_final.ndim == 2:
+                    y_pred_proba_final = (
+                        y_pred_proba_final - y_pred_proba_final.min(axis=1, keepdims=True)
+                    ) / (
+                        y_pred_proba_final.max(axis=1, keepdims=True)
+                        - y_pred_proba_final.min(axis=1, keepdims=True)
+                    )
+            else:
+                y_pred_proba_final = None  # No probability output for certain models
+        #! dict_pred_stack
+        dict_pred_stack = {}
+        validation_scores_final = {}
+        if y_true is not None and y_pred_proba_final is not None:
+            validation_scores_final = cal_metrics(
+                y_true,
+                y_pred_final,
+                y_pred_proba=y_pred_proba_final,
+                is_binary=is_binary,
+                purpose=purpose,
+                average="weighted",
+            )
+            if is_binary:
+                # Calculate ROC curve
+                # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
+                if y_pred_proba_final is not None:
+                    fpr, tpr, _ = roc_curve(y_true, y_pred_proba_final)
+                    lower_ci, upper_ci = cal_auc_ci(
+                        y_true, y_pred_proba_final, verbose=False, is_binary=is_binary
+                    )
+                    roc_auc = auc(fpr, tpr)
+                    roc_info = {
+                        "fpr": fpr.tolist(),
+                        "tpr": tpr.tolist(),
+                        "auc": roc_auc,
+                        "ci95": (lower_ci, upper_ci),
+                    }
+                    # precision-recall curve
+                    precision_, recall_, _ = cal_precision_recall(y_true, y_pred_proba_final)
+                    avg_precision_ = average_precision_score(y_true, y_pred_proba_final)
+                    pr_info = {
+                        "precision": precision_,
+                        "recall": recall_,
+                        "avg_precision": avg_precision_,
+                    }
+                else:
+                    roc_info, pr_info = None, None
+                if purpose == "classification":
+                    dict_pred_stack = {
+                        "best_clf": stacking_model,
+                        "best_params": None,
+                        "auc_indiv": None,
+                        "scores": validation_scores_final,
+                        "roc_curve": roc_info,
+                        "pr_curve": pr_info,
+                        "confusion_matrix": confusion_matrix(y_true, y_pred_final),
+                        "predictions": y_pred_final.tolist(),
+                        "predictions_proba": (
+                            y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
+                        ),
+                    }
+                else:  # "regression"
+                    dict_pred_stack = {
+                        "best_clf": stacking_model,
+                        "best_params": None,
+                        "scores": validation_scores_final,  # e.g., neg_MSE, R², etc.
+                        "predictions": y_pred_final.tolist(),
+                        "predictions_proba": (
+                            y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
+                        ),
+                    }
+            else:  # multi-classes
+                if y_pred_proba_final is not None:
+                    # fpr, tpr, roc_auc = dict(), dict(), dict()
+                    # fpr, tpr, _ = roc_curve(y_true, y_pred_proba_final)
+                    confidence_intervals = cal_auc_ci(
+                        y_true, y_pred_proba_final, verbose=False, is_binary=is_binary
+                    )
+                    roc_info = {
+                        "fpr": validation_scores_final["fpr"],
+                        "tpr": validation_scores_final["tpr"],
+                        "auc": validation_scores_final["roc_auc_by_class"],
+                        "ci95": confidence_intervals,
+                    }
+                    # precision-recall curve
+                    precision_, recall_, avg_precision_ = cal_precision_recall(
+                        y_true, y_pred_proba_final, is_binary=is_binary
+                    )
+                    pr_info = {
+                        "precision": precision_,
+                        "recall": recall_,
+                        "avg_precision": avg_precision_,
+                    }
+                else:
+                    roc_info, pr_info = None, None
+
+                if purpose == "classification":
+                    dict_pred_stack = {
+                        "best_clf": stacking_model,
+                        "best_params": None,
+                        "auc_indiv": None,
+                        "scores": validation_scores_final,
+                        "roc_curve": roc_info,
+                        "pr_curve": pr_info,
+                        "confusion_matrix": confusion_matrix(y_true, y_pred_final),
+                        "predictions": y_pred_final.tolist(),
+                        "predictions_proba": (
+                            y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
+                        ),
+                    }
+                else:  # "regression"
+                    dict_pred_stack = {
+                        "best_clf": stacking_model,
+                        "best_params": None,
+                        "scores": validation_scores_final,  # e.g., neg_MSE, R², etc.
+                        "predictions": y_pred_final.tolist(),
+                        "predictions_proba": (
+                            y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
+                        ),
+                    }
+
+        else:
+            if y_true is None:
+                validation_scores_final = []
+            else:
+                validation_scores_final = cal_metrics(
+                    y_true,
+                    y_pred,
+                    y_pred_proba=y_pred_proba_final,
+                    is_binary=is_binary,
+                    purpose=purpose,
+                    average="weighted",
+                )
+            dict_pred_stack = {
+                "best_clf": stacking_model,
+                "best_params": None,
+                "scores": validation_scores_final,
+                "predictions": y_pred_final.tolist(),
+                "predictions_proba": (
+                    y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
+                ),
+                "y_train": y_train if y_train is not None else [],
+                "y_true": y_true if y_true is not None else [],
+            }
+        # merge together
+        df_pred = pd.DataFrame(
+            [None] * len(df_results.columns), index=df_results.columns, columns=["stack"]).T
+        for k, v in dict_pred_stack.items():
+            if k in df_pred.columns:
+                df_pred[k] = [v]
+
+        # # plot the stacking
+        # if all([plot_, y_true is not None, purpose == "classification"]):
+        #     plot_validate_features_single(df_pred, is_binary=is_binary)
+        #     if dir_save:
+        #         ips.figsave(dir_save + f"validate_features_stacking_{now_}.pdf")
+    if vote:
+        print(f" ⤵ voting...")
+        from sklearn.ensemble import VotingClassifier, VotingRegressor
+        #! Voting
+        n_top_models = min(n_top_models, df_results.shape[0])
+        base_estimators = []
+        for name, cls in zip(list(df_results.iloc[:n_top_models, :].index), df_results.iloc[:n_top_models, :]["best_clf"].tolist()):
+            base_estimators.append((name, cls))
+        # Apply Voting Classifier/Regressor
+        if purpose == "classification":
+            print(f" ⤵ VotingClassifier... via {voting}")
+            if voting == 'hard':
+                # Hard voting does not support `predict_proba`
+                voting_model = VotingClassifier(estimators=base_estimators)
+            else:
+                # Soft voting supports `predict_proba`
+                voting_model = VotingClassifier(estimators=base_estimators, voting="soft")
+        else:
+            print(f" ⤵ VotingRegressor...")
+            voting_model = VotingRegressor(estimators=base_estimators)
+
+        # Train the Voting Classifier/Regressor
+        try:
+            voting_model.fit(x_train, y_train)
+            y_pred_vote = voting_model.predict(x_true)
+        except Exception as e:
+            if purpose == "classification" and not voting == 'hard':
+                voting_model = VotingClassifier(estimators=base_estimators)
+                voting_model.fit(x_train, y_train)
+                y_pred_vote = voting_model.predict(x_true)
+
+        # Calculate predicted probabilities if applicable
+        if purpose == "classification":
+            if hasattr(voting_model, "predict_proba"):
+                y_pred_proba_vote = voting_model.predict_proba(x_true)
+                print("Shape of predicted probabilities:", y_pred_proba_vote.shape)
+                if y_pred_proba_vote.shape[1] == 1:
+                    y_pred_proba_vote = np.hstack(
+                        [1 - y_pred_proba_vote, y_pred_proba_vote]
+                    )  # Add missing class probabilities
+                y_pred_proba_vote = y_pred_proba_vote[:, 1]
+            else:
+                y_pred_proba_vote = None
+        else:  # Regression
+            y_pred_proba_vote = None
+
+        print(f" ⤵ collecting voting results...")
+        #! dict_pred_vote
+        dict_pred_vote = {}
+        validation_scores_vote = {}
+        if y_true is not None and y_pred_proba_vote is not None:
+            validation_scores_vote = cal_metrics(
+                y_true,
+                y_pred_vote,
+                y_pred_proba=y_pred_proba_vote,
+                is_binary=is_binary,
+                purpose=purpose,
+                average="weighted",
+            )
+
+            if is_binary:
+                if y_pred_proba_vote is not None:
+                    fpr, tpr, _ = roc_curve(y_true, y_pred_proba_vote)
+                    lower_ci, upper_ci = cal_auc_ci(
+                        y_true, y_pred_proba_vote, verbose=False, is_binary=is_binary
+                    )
+                    roc_auc = auc(fpr, tpr)
+                    roc_info = {
+                        "fpr": fpr.tolist(),
+                        "tpr": tpr.tolist(),
+                        "auc": roc_auc,
+                        "ci95": (lower_ci, upper_ci),
+                    }
+                    precision_, recall_, _ = cal_precision_recall(y_true, y_pred_proba_vote)
+                    avg_precision_ = average_precision_score(y_true, y_pred_proba_vote)
+                    pr_info = {
+                        "precision": precision_,
+                        "recall": recall_,
+                        "avg_precision": avg_precision_,
+                    }
+                else:
+                    roc_info, pr_info = None, None
+
+                dict_pred_vote = {
+                    "best_clf": voting_model,
+                    "best_params": None,
+                    "auc_indiv": None,
+                    "scores": validation_scores_vote,
+                    "roc_curve": roc_info,
+                    "pr_curve": pr_info,
+                    "confusion_matrix": confusion_matrix(y_true, y_pred_vote),
+                    "predictions": y_pred_vote.tolist(),
+                    "predictions_proba": (
+                        y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
+                    ),
+                }
+            else:  # Multi-class
+                if y_pred_proba_vote is not None:
+                    confidence_intervals = cal_auc_ci(
+                        y_true, y_pred_proba_vote, verbose=False, is_binary=is_binary
+                    )
+                    roc_info = {
+                        "fpr": validation_scores_vote["fpr"],
+                        "tpr": validation_scores_vote["tpr"],
+                        "auc": validation_scores_vote["roc_auc_by_class"],
+                        "ci95": confidence_intervals,
+                    }
+                    precision_, recall_, avg_precision_ = cal_precision_recall(
+                        y_true, y_pred_proba_vote, is_binary=is_binary
+                    )
+                    pr_info = {
+                        "precision": precision_,
+                        "recall": recall_,
+                        "avg_precision": avg_precision_,
+                    }
+                else:
+                    roc_info, pr_info = None, None
+
+                dict_pred_vote = {
+                    "best_clf": voting_model,
+                    "best_params": None,
+                    "scores": validation_scores_vote,
+                    "roc_curve": roc_info,
+                    "pr_curve": pr_info,
+                    "confusion_matrix": confusion_matrix(y_true, y_pred_vote),
+                    "predictions": y_pred_vote.tolist(),
+                    "predictions_proba": (
+                        y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
+                    ),
+                }
+        else:
+            if y_true is None:
+                validation_scores_vote = []
+            else:
+                validation_scores_vote = cal_metrics(
+                    y_true,
+                    y_pred,
+                    y_pred_proba=y_pred_proba_vote,
+                    is_binary=is_binary,
+                    purpose=purpose,
+                    average="weighted",
+                )
+            dict_pred_vote = {
+                "best_clf": voting_model,
+                "best_params": None,
+                "scores": validation_scores_vote,
+                "predictions": y_pred_vote.tolist(),
+                "predictions_proba": (
+                    y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
+                ),
+                "y_train": y_train if y_train is not None else [],
+                "y_true": y_true if y_true is not None else [],
+            }
+        df_vote = pd.DataFrame(
+            [None] * len(df_results.columns), index=df_results.columns, columns=["vote"]).T
+        for k, v in dict_pred_vote.items():
+            if k in df_vote.columns:
+                df_vote[k] = [v]
+
+        # if all([plot_, y_true is not None, purpose == "classification"]):
+        #     try:
+        #         plot_validate_features_single(df_vote, is_binary=is_binary)
+        #         if dir_save:
+        #             ips.figsave(dir_save + f"validate_features_vote_{now_}.pdf")
+        #     except Exception as e:
+        #         print(e)
+        print("Done")
+    if vote and stack:
+        df_res = pd.concat([df_pred, df_vote, df_results], ignore_index=False, axis=0)
+    elif vote:
+        df_res = pd.concat([df_vote, df_results], ignore_index=False, axis=0)
+    elif stack:
+        df_res = pd.concat([df_pred, df_results], ignore_index=False, axis=0)
+
     if all([plot_, y_true is not None, purpose == "classification"]):
+        from datetime import datetime
+
+        now_ = datetime.now().strftime("%y%m%d_%H%M%S")
         # try:
-        if
-        plot_validate_features(
+        if df_res.shape[0] > 3:
+            plot_validate_features(df_res, is_binary=is_binary)
         else:
-        plot_validate_features_single(
+            plot_validate_features_single(df_res, is_binary=is_binary)
         if dir_save:
             ips.figsave(dir_save + f"validate_features{now_}.pdf")
-
-
-    return
-
+        # except Exception as e:
+        #     print(f"Error: a problem occurred while plotting: {e}")
+    return df_res
 
 def cal_metrics(
     y_true,
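
Taken together, the new stack/vote path fits a StackingClassifier/Regressor and a VotingClassifier/Regressor over the top-ranked base models and prepends their rows to the returned frame. A minimal call sketch (df_train, df_test and "target" are placeholders, not names from the package):

    from py2ls import ml2ls

    df_res = ml2ls.predict(
        x_train=df_train,          # features plus the label column
        y_train="target",          # label passed as a column name
        x_true=df_test,
        y_true=df_test["target"],
        purpose="classification",
        stack=True,                # adds a "stack" row (StackingClassifier)
        vote=True, voting="hard",  # adds a "vote" row (VotingClassifier)
        n_top_models=5,
        n_models_per_category=1,
    )
    print(df_res.index[:2].tolist())  # ['stack', 'vote'] ahead of the ranked models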
@@ -3367,7 +4056,7 @@ def cal_metrics(
 
 
 def plot_trees(
-    X, y, cls, max_trees=500, test_size=0.2, random_state=42, early_stopping_rounds=None
+    X, y, cls: str = 'random', max_trees=500, test_size=0.2, random_state=42, early_stopping_rounds=None
 ):
     """
     # # Example usage:
@@ -3413,10 +4102,14 @@ def plot_trees(
     train_error_rate = []
     test_error_rate = []
     validation_error = None
-
+    if isinstance(cls, str):
+        cls = ips.strcmp(cls, ["RandomForestClassifier", "ExtraTreesClassifier", "AdaBoostClassifier", "GradientBoostingClassifier"])
     # Configure classifier based on type
     oob_enabled = False  # Default to no OOB error unless explicitly set
-
+    clf_support = {"RandomForestClassifier": RandomForestClassifier(),
+                   "ExtraTreesClassifier": ExtraTreesClassifier(),
+                   "AdaBoostClassifier": AdaBoostClassifier(),
+                   "GradientBoostingClassifier": GradientBoostingClassifier()}
     if isinstance(cls, (RandomForestClassifier, ExtraTreesClassifier)):
         # Enable OOB if cls supports it and is using bootstrapping
         cls.set_params(warm_start=True, n_estimators=1)
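
With this change, cls may be passed as a plain string that is fuzzy-matched (via ips.strcmp) against the four supported tree ensembles. Usage sketch (assuming the match resolves as the candidate list suggests):

    plot_trees(X, y, cls="random", max_trees=200)  # resolves toward RandomForestClassifier
    plot_trees(X, y, cls="extra")                  # resolves toward ExtraTreesClassifier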
@@ -3678,7 +4371,7 @@ def img_datasets_preprocessing(
 
 
 def backward_regression(
-    X: pd.DataFrame, y: pd.Series, initial_list=[],
+    X: pd.DataFrame, y: pd.Series, initial_list=[], thr=0.05, verbose=True
 ):
     """
     # awesome bit of code from https://www.kaggle.com/code/adibouayjan/house-price-step-by-step-modeling
@@ -3690,31 +4383,46 @@ def backward_regression(
     X -- features values
     y -- target variable
     initial_list -- features header
-
+    thr -- pvalue threshold of features to drop
     verbose -- true to produce lots of logging output
 
     Returns:
     list of selected features for modeling
     """
     import statsmodels.api as sm
-
-
-
-
-
+    if isinstance(y, str):
+        if y in X.columns:
+            y_col_name = y
+            y = X[y]
+            X = X.drop(y_col_name, axis=1)
+        else:
+            raise ValueError(f"Cannot find {y}; the y argument is invalid")
+    X = X.select_dtypes(include=[np.number])
+
     included = list(X.columns)
+    try:
+        X = X.astype(float)
+        y = y.astype(float)
+    except Exception as e:
+        raise ValueError(f"Cannot convert the data to float, so the statistical analysis cannot proceed: {e}")
+
+
     while True:
         changed = False
+        if not included:
+            print("No features remain in the model.")
+            break
+
         model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
         # exclude the intercept for p-value checking
         pvalues = model.pvalues.iloc[1:]
         worst_pval = pvalues.max()
-        if worst_pval >
+        if worst_pval > thr:
             changed = True
             worst_feature = pvalues.idxmax()
             included.remove(worst_feature)
             if verbose:
-                print(f"Removing
+                print(f"Removing '{worst_feature}' with p-value={round(worst_pval,2)}")
         if not changed:
             break
     print(f"\nSelected Features:\n{included}")