py2ls 0.2.4.23__py3-none-any.whl → 0.2.4.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py2ls/.DS_Store +0 -0
- py2ls/.git/.DS_Store +0 -0
- py2ls/.git/index +0 -0
- py2ls/.git/objects/.DS_Store +0 -0
- py2ls/.git/refs/.DS_Store +0 -0
- py2ls/data/.DS_Store +0 -0
- py2ls/data/styles/.DS_Store +0 -0
- py2ls/ips.py +213 -195
- py2ls/ml2ls.py +768 -61
- {py2ls-0.2.4.23.dist-info → py2ls-0.2.4.24.dist-info}/METADATA +1 -1
- {py2ls-0.2.4.23.dist-info → py2ls-0.2.4.24.dist-info}/RECORD +12 -9
- {py2ls-0.2.4.23.dist-info → py2ls-0.2.4.24.dist-info}/WHEEL +0 -0
py2ls/ml2ls.py
CHANGED
@@ -5,7 +5,6 @@ from sklearn.ensemble import (
     BaggingClassifier,
 )
 from sklearn.svm import SVC, SVR
-from sklearn.calibration import CalibratedClassifierCV
 from sklearn.model_selection import GridSearchCV, StratifiedKFold
 from sklearn.linear_model import (
     LassoCV,
@@ -16,12 +15,7 @@ from sklearn.linear_model import (
     RidgeClassifierCV,
     ElasticNet,
 )
-
-from sklearn.naive_bayes import GaussianNB
-from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
-import xgboost as xgb  # Make sure you have xgboost installed
-
-from sklearn.model_selection import train_test_split, cross_val_score
+
 from sklearn.metrics import (
     accuracy_score,
     precision_score,
@@ -36,18 +30,12 @@ from sklearn.metrics import (
     precision_recall_curve,
     average_precision_score,
 )
-from imblearn.over_sampling import SMOTE
-from sklearn.pipeline import Pipeline
-from collections import defaultdict
-from sklearn.preprocessing import StandardScaler, OneHotEncoder
 from typing import Dict, Any, Optional, List, Union
 import numpy as np
 import pandas as pd
 from . import ips
 from . import plot
 import matplotlib.pyplot as plt
-import seaborn as sns
-
 plt.style.use(str(ips.get_cwd()) + "/data/styles/stylelib/paper.mplstyle")
 import logging
 import warnings
@@ -314,6 +302,8 @@ def features_svm(
     - Use case: It’s not as widely used as the RBF or linear kernel but can be explored when there is some evidence of non-linear
       S-shaped relationships.
     """
+    from sklearn.feature_selection import RFE
+    from sklearn.svm import SVC
     # SVM (Support Vector Machines)
     svc = SVC(kernel=rfe_params["kernel"])  # ["linear", "rbf", "poly", "sigmoid"]
     # RFE(Recursive Feature Elimination)
@@ -450,6 +440,7 @@ def validate_classifier(
     Returns:
     - results: Dictionary containing average cv_train_scores and cv_test_scores.
     """
+    from sklearn.model_selection import cross_val_score
     cv_train_scores = {metric: [] for metric in metrics}
     skf = StratifiedKFold(n_splits=cv_folds)
     # Perform cross-validation
@@ -982,6 +973,8 @@ def validate_features(
 
     """
     from tqdm import tqdm
+    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
+    from sklearn.calibration import CalibratedClassifierCV
 
     # Ensure common features are selected
     common_features = ips.shared(
@@ -1001,6 +994,7 @@ def validate_features(
 
     # Handle class imbalance using SMOTE
     if smote:
+        from imblearn.over_sampling import SMOTE
         if (
             y_train.value_counts(normalize=True).max() < 0.8
         ):  # Threshold to decide if data is imbalanced
@@ -2096,7 +2090,116 @@ def rank_models(
 # )
 
 # figsave("classifier_performance.pdf")
+def rank_models_reg(df, ascending=False):
+    """
+    Sorts models based on MSE, RMSE, MAE, and R² with custom priority logic.
+
+    Parameters:
+        df (pd.DataFrame): DataFrame containing the regression metrics.
+        ascending (bool): Whether to sort in ascending order of ranking score.
+
+    Returns:
+        pd.DataFrame: Sorted DataFrame with an added "Ranking_Score" column.
+    """
+    # Define weights for the 4 metrics
+    weights = {
+        "mse": -1,  # Lower is better
+        "rmse": -1,  # Lower is better
+        "mae": -1,  # Lower is better
+        "r2": 1,  # Higher is better
+    }
+
+    # Normalize the selected metrics
+    df = df.copy()  # Work on a copy of the DataFrame
+    for metric, weight in weights.items():
+        if metric in df.columns:
+            if weight > 0:  # Higher is better; normalize 0-1
+                df[metric + "_normalized"] = (df[metric] - df[metric].min()) / (
+                    df[metric].max() - df[metric].min()
+                )
+            else:  # Lower is better; reverse normalize 0-1
+                df[metric + "_normalized"] = (df[metric].max() - df[metric]) / (
+                    df[metric].max() - df[metric].min()
+                )
 
+    # Calculate ranking score as a weighted sum
+    df["Ranking_Score"] = sum(
+        df[metric + "_normalized"] * abs(weights[metric])
+        for metric in weights.keys()
+        if metric + "_normalized" in df.columns
+    )
+
+    # Sort models based on the ranking score
+    sorted_df = df.sort_values(by="Ranking_Score", ascending=ascending)
+    return sorted_df
+
+models_support = {
+    "classification": {
+        "Random Forest": "Tree-Based",
+        "SVM": "Kernel-Based",
+        "Logistic Regression": "Linear",
+        "Lasso Logistic Regression": "Linear",
+        "Gradient Boosting": "Tree-Based",
+        "XGBoost": "Tree-Based",
+        "KNN": "Instance-Based",
+        "Naive Bayes": "Probabilistic",
+        "Linear Discriminant Analysis": "Linear",
+        "AdaBoost": "Tree-Based",
+        "CatBoost": "Tree-Based",
+        "Extra Trees": "Tree-Based",
+        "Bagging": "Tree-Based",
+        "Neural Network": "Neural Network",
+        "DecisionTree": "Tree-Based",
+        "Quadratic Discriminant Analysis": "Probabilistic",
+        "Ridge": "Linear",
+        "Perceptron": "Linear",
+        "Bernoulli Naive Bayes": "Probabilistic",
+        "SGDClassifier": "Linear",
+    },
+    "regression": {
+        "Linear Regression": "Linear",
+        "Ridge": "Linear",
+        "RidgeCV": "Linear",
+        "TheilSenRegressor": "Linear",
+        "HuberRegressor": "Linear",
+        "PoissonRegressor": "Linear",
+        "LassoCV": "Linear",
+        "Bagging": "Tree-Based",
+        "ElasticNet": "Linear",
+        "Random Forest": "Tree-Based",
+        "Gradient Boosting": "Tree-Based",
+        "XGBoost": "Tree-Based",
+        "CatBoost": "Tree-Based",
+        "Extra Trees": "Tree-Based",
+        "SVM": "Kernel-Based",
+        "KNN": "Instance-Based",
+        "Neural Network": "Neural Network",
+        "AdaBoost": "Linear",
+    },
+}
+def select_top_models(models, categories, n_top_models, n_models_per_category=1):
+    """
+    models = list_sort
+    purpose = "regression"
+    categories = models_support[purpose]
+    n_top_models = 3
+    select_top_models(models, categories, n_top_models)
+    """
+    selected = {}
+    result = []
+    for model in models:
+        category = categories.get(model, "Unknown")
+        if category not in selected:
+            selected[category] = 0  # Initialize counter for the category
+
+        if selected[category] < n_models_per_category:  # Allow additional models up to the limit
+            selected[category] += 1
+            result.append(model)
+
+        if len(result) == n_top_models:  # Stop when the desired number of models is reached
+            break
+
+    return result
 
 def predict(
     x_train: pd.DataFrame,
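Note: the two helpers added above can be exercised on their own. A minimal sketch, assuming a made-up scores table (the metric values are illustrative, not package output):

import pandas as pd

# hypothetical regression scores for three fitted models
scores = pd.DataFrame(
    {"mse": [4.0, 2.5, 3.1], "rmse": [2.0, 1.6, 1.8],
     "mae": [1.5, 1.2, 1.4], "r2": [0.70, 0.82, 0.75]},
    index=["Linear Regression", "Random Forest", "KNN"],
)
ranked = rank_models_reg(scores)  # best Ranking_Score first
top = select_top_models(
    models=list(ranked.index),
    categories=models_support["regression"],
    n_top_models=2,               # keep two models overall
    n_models_per_category=1,      # at most one model per category
)
print(top)  # ["Random Forest", "KNN"]: one Tree-Based, one Instance-Based

Because each category is capped, the selection favors diversity among the stacked base estimators rather than simply taking the two best tree models.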
@@ -2104,11 +2207,17 @@ def predict(
     x_true: pd.DataFrame = None,
     y_true: Optional[pd.Series] = None,
     backward: bool = False,  # backward_regression
+    backward_thr: float = 0.05,  # p-value threshold; only used when backward is True
     common_features: set = None,
     purpose: str = "classification",  # 'classification' or 'regression'
     cls: Optional[Dict[str, Any]] = None,
     metrics: Optional[List[str]] = None,
-
+    stack: bool = True,  # run stacking
+    stacking_cv: bool = False,  # stacking cross-validation; default (False) keeps it simple
+    vote: bool = True,  # run voting
+    voting: str = "hard",  # only used for the classification voting purpose
+    n_top_models: int = 5,  # for stacking models
+    n_models_per_category: int = 1,  # for stacking models; allows up to this many models per category
     smote: bool = False,
     n_jobs: int = -1,
     plot_: bool = True,
@@ -2117,6 +2226,7 @@ def predict(
     cv_folds: int = 5,  # more cv_folds is more stable, but AUC may be lower
     cv_level: str = "l",  # "s": 'low', "m": 'medium', "l": "high"
     class_weight: str = "balanced",
+    random_state: int = 1,
     verbose: bool = False,
 ) -> pd.DataFrame:
     """
@@ -2184,10 +2294,17 @@ def predict(
         RidgeClassifierCV,
         Perceptron,
         SGDClassifier,
+        RidgeCV,
+        Ridge,
+        TheilSenRegressor,
+        HuberRegressor,
+        PoissonRegressor,
+
     )
+    from sklearn.compose import TransformedTargetRegressor
     from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
     from sklearn.naive_bayes import GaussianNB, BernoulliNB
-    from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
+    from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor, StackingClassifier, StackingRegressor
     import xgboost as xgb
     import lightgbm as lgb
     import catboost as cb
@@ -2198,6 +2315,7 @@ def predict(
         QuadraticDiscriminantAnalysis,
     )
     from sklearn.preprocessing import PolynomialFeatures
+    from sklearn.model_selection import train_test_split
 
     # spell check
     purpose = ips.strcmp(purpose, ["classification", "regression"])[0]
@@ -2261,7 +2379,6 @@ def predict(
         "Gradient Boosting": GradientBoostingRegressor(random_state=random_state),
         "XGBoost": xgb.XGBRegressor(eval_metric="rmse", random_state=random_state),
         "Linear Regression": LinearRegression(),
-        "Lasso": Lasso(random_state=random_state),
         "AdaBoost": AdaBoostRegressor(random_state=random_state),
         # "LightGBM": lgb.LGBMRegressor(random_state=random_state),
         "CatBoost": cb.CatBoostRegressor(verbose=0, random_state=random_state),
@@ -2271,10 +2388,10 @@ def predict(
         "ElasticNet": ElasticNet(random_state=random_state),
         "Ridge": Ridge(),
         "KNN": KNeighborsRegressor(),
+        "TheilSen": TheilSenRegressor(),
+        "Huber": HuberRegressor(),
+        "Poisson": PoissonRegressor()
     }
-    # indicate cls:
-    if ips.run_once_within(30):  # 10 min
-        print(f"supported models: {list(model_.keys())}")
     if cls is None:
         models = model_
     else:
@@ -2290,6 +2407,10 @@ def predict(
         ips.df_special_characters_cleaner(x_true) if x_true is not None else None
     )
 
+    # indicate cls:
+    if ips.run_once_within(30):  # 10 min
+        print(f"processing: {list(models.keys())}")
+
     if isinstance(y_train, str) and y_train in x_train.columns:
         y_train_col_name = y_train
         y_train = x_train[y_train]
@@ -2311,7 +2432,7 @@ def predict(
 
     # Perform backward feature selection
     if backward:
-        selected_features = backward_regression(x_train, y_train,
+        selected_features = backward_regression(x_train, y_train, thr=backward_thr)
         x_train = x_train[selected_features]
 
     if x_true is None:
@@ -2396,6 +2517,17 @@ def predict(
         y_true = ips.df_encoder(data=pd.DataFrame(y_true), method="label")
         y_true = np.asarray(y_true)
     # Hyperparameter grids for tuning
+    param_grid_common_xgb = {
+        'learning_rate': [0.01, 0.1, 0.2, 0.3],
+        'max_depth': [3, 5, 7, 10],
+        'n_estimators': [50, 100, 200, 300],
+        'subsample': [0.6, 0.8, 1.0],
+        'colsample_bytree': [0.6, 0.8, 1.0],
+        'gamma': [0, 0.1, 0.2, 0.5],
+        'min_child_weight': [1, 5, 10],
+        'reg_alpha': [0, 0.1, 0.5, 1],  # L1 regularization term
+        'reg_lambda': [1, 1.5, 2],  # L2 regularization term
+    }
     if cv_level in ["low", "simple", "s", "l"]:
         param_grids = {
             "Random Forest": (
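Note: param_grid_common_xgb is a plain dict of value lists, the format scikit-learn's GridSearchCV expects. A minimal, self-contained sketch of how such a grid is consumed (synthetic data and a deliberately tiny slice of the grid so the search stays fast; not the package's exact code path):

import numpy as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))
y = (X[:, 0] + rng.normal(scale=0.5, size=200) > 0).astype(int)

grid = {"learning_rate": [0.01, 0.1], "max_depth": [3, 5], "n_estimators": [50]}
gs = GridSearchCV(xgb.XGBClassifier(eval_metric="logloss"),
                  param_grid=grid, scoring="roc_auc", cv=3)
gs.fit(X, y)
print(gs.best_params_)  # the best of the 4 grid combinations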
@@ -2440,12 +2572,17 @@ def predict(
                     "min_samples_split": [2],
                     "subsample": [0.8],
                 },
-            "XGBoost":
-
-
-
-
-
+            "XGBoost": {
+                'learning_rate': [0.01],
+                'max_depth': [3],
+                'n_estimators': [50],
+                'subsample': [0.6],
+                'colsample_bytree': [0.6],
+                'gamma': [0, 0.1],
+                'min_child_weight': [1],
+                'reg_alpha': [0, 0.1],
+                'reg_lambda': [1],
+                'objective': ['binary:logistic'] if purpose == "classification" else ['reg:squarederror'],
             },
             "KNN": (
                 {
@@ -2552,6 +2689,14 @@ def predict(
                 "random_state": [random_state],
                 "learning_rate": ["constant"],
             },
+            "TheilSen": {'max_iter': [100],
+                         'tol': [1e-4],
+                         'n_subsamples': [100 + x_train.shape[1]]},
+            "Huber": {'epsilon': [1.35],
+                      'alpha': [0.1],
+                      'max_iter': [100],},
+            "Poisson": {'alpha': [0.1],
+                        'max_iter': [100],}
         }
     elif cv_level in ["high", "advanced", "h"]:
         param_grids = {
@@ -2613,12 +2758,30 @@ def predict(
                 "subsample": [0.8, 1.0],
             },
             "XGBoost": {
-
-
-
-
-
-
+                'learning_rate': [0.01, 0.1, 0.2, 0.3],
+                'max_depth': [3, 5, 7, 10],
+                'n_estimators': [50, 100, 200, 300],
+                'subsample': [0.6, 0.8, 1.0],
+                'gamma': [0, 0.1, 0.2, 0.5],
+                'min_child_weight': [1, 5, 10],
+                'reg_alpha': [0, 0.1, 0.5, 1],
+                'reg_lambda': [1, 1.5, 2],
+                **{
+                    'objective': ['binary:logistic', 'multi:softmax', 'multi:softprob'],
+                }} if purpose == "classification"
+            else {
+                'learning_rate': [0.01, 0.1, 0.2, 0.3],
+                'max_depth': [3, 5, 7, 10],
+                'n_estimators': [50, 100, 200, 300],
+                'subsample': [0.6, 0.8, 1.0],
+                'colsample_bytree': [0.6, 0.8, 1.0],
+                'gamma': [0, 0.1, 0.2, 0.5],
+                'min_child_weight': [1, 5, 10],
+                'reg_alpha': [0, 0.1, 0.5, 1],
+                'reg_lambda': [1, 1.5, 2],
+                **{
+                    'objective': ['reg:squarederror', 'reg:squaredlogerror', 'reg:gamma'],
+                }},
             "KNN": (
                 {
                     "n_neighbors": [1, 3, 5, 10, 15, 20],
@@ -2731,6 +2894,14 @@ def predict(
                     ],  # If True, the regressors X will be normalized
                 }
             ),
+            "TheilSen": {'max_iter': [100, 200, 300],
+                         'tol': [1e-4, 1e-3, 1e-2],
+                         'n_subsamples': [100 + x_train.shape[1], 200 + x_train.shape[1], 300 + x_train.shape[1]]},
+            "Huber": {'epsilon': [1.35, 1.5, 2.0],
+                      'alpha': [0.1, 1.0, 10.0],
+                      'max_iter': [100, 200, 300],},
+            "Poisson": {'alpha': [0.1, 1.0, 10.0],
+                        'max_iter': [100, 200, 300],}
         }
     else:  # median level
         param_grids = {
|
@@ -2790,12 +2961,30 @@ def predict(
|
|
2790
2961
|
"subsample": [0.8, 1.0],
|
2791
2962
|
},
|
2792
2963
|
"XGBoost": {
|
2793
|
-
|
2794
|
-
|
2795
|
-
|
2796
|
-
|
2797
|
-
|
2798
|
-
|
2964
|
+
'learning_rate': [0.01, 0.1],
|
2965
|
+
'max_depth': [3, 5],
|
2966
|
+
'n_estimators': [50, 100],
|
2967
|
+
'subsample': [0.6, 0.8],
|
2968
|
+
'gamma': [0, 0.1],
|
2969
|
+
'min_child_weight': [1, 5],
|
2970
|
+
'reg_alpha': [0, 0.1],
|
2971
|
+
'reg_lambda': [1,],
|
2972
|
+
**{
|
2973
|
+
'objective': ['binary:logistic', 'multi:softmax'],
|
2974
|
+
}} if purpose== "classification"
|
2975
|
+
else{
|
2976
|
+
'learning_rate': [0.01, 0.1],
|
2977
|
+
'max_depth': [3, 5,],
|
2978
|
+
'n_estimators': [50, 100],
|
2979
|
+
'subsample': [0.6, 0.8],
|
2980
|
+
'colsample_bytree': [0.6, 0.8],
|
2981
|
+
'gamma': [0, 0.1],
|
2982
|
+
'min_child_weight': [1, 5],
|
2983
|
+
'reg_alpha': [0, 0.1],
|
2984
|
+
'reg_lambda': [1, 1.5],
|
2985
|
+
**{
|
2986
|
+
'objective': ['reg:squarederror', 'reg:squaredlogerror'],
|
2987
|
+
}},
|
2799
2988
|
"KNN": (
|
2800
2989
|
{
|
2801
2990
|
"n_neighbors": [3, 5, 7, 10],
|
@@ -2952,6 +3141,14 @@ def predict(
                     ],  # Solver for optimization
                 }
             ),
+            "TheilSen": {'max_iter': [100, 200],
+                         'tol': [1e-4, 1e-3],
+                         'n_subsamples': [100 + x_train.shape[1], 200 + x_train.shape[1]]},
+            "Huber": {'epsilon': [1.35, 1.5],
+                      'alpha': [0.1, 1.0],
+                      'max_iter': [100, 200],},
+            "Poisson": {'alpha': [0.1, 1.0],
+                        'max_iter': [100, 200],}
         }
 
     results = {}
@@ -3192,12 +3389,18 @@ def predict(
     # Convert results to DataFrame
     df_results = pd.DataFrame.from_dict(results, orient="index")
     # sort
-    if y_true is not None
-
-
-
+    if y_true is not None:
+        if purpose == "classification":
+            df_scores = pd.DataFrame(
+                df_results["scores"].tolist(), index=df_results["scores"].index
+            ).sort_values(by="roc_auc", ascending=False)
+        elif purpose == 'regression':
+            df_scores = rank_models_reg(
+                pd.DataFrame(df_results["scores"].tolist(), index=df_results["scores"].index),
+                ascending=False)
         df_results = df_results.loc[df_scores.index]
 
+    if y_true is not None and purpose == "classification":
         if plot_:
             from datetime import datetime
 
@@ -3215,18 +3418,503 @@ def predict(
             plot.figsets(xangle=30)
             if dir_save:
                 ips.figsave(dir_save + f"scores_clus{now_}.pdf")
+    # if all([plot_, y_true is not None, purpose == "classification"]):
+    #     # try:
+    #     if len(models) > 3:
+    #         plot_validate_features(df_results, is_binary=is_binary)
+    #     else:
+    #         plot_validate_features_single(df_results, is_binary=is_binary)
+    #     if dir_save:
+    #         ips.figsave(dir_save + f"validate_features{now_}.pdf")
+    #     # except Exception as e:
+    #     #     print(f"Error: a problem occurred while plotting: {e}")
+    if stack:
+        #! stacking classifier/regressor
+        from sklearn.metrics import make_scorer, accuracy_score
+        from sklearn.model_selection import cross_val_score
+
+        #* keep n_top_models within the number of available models
+        n_top_models = min(n_top_models, df_results.shape[0])
+
+        #* select the top-ranked n estimators
+        models_selecte = select_top_models(models=list(df_results.index),
+                                           categories=models_support[purpose],
+                                           n_top_models=n_top_models,
+                                           n_models_per_category=n_models_per_category)
+        top_models = df_results.loc[models_selecte]["best_clf"]
+        base_estimators = []
+        for i, j in top_models.to_dict().items():
+            base_estimators.append((i, j))
+        if stacking_cv:
+            print(f" ⤵ stacking_cv is processing...")
+            #* define a few candidate final_estimators
+            # the shortlisted options
+            if purpose == "classification":
+                kadt_estimators = ["XGBoost", "SVM", "Logistic Regression", "Neural Network"]
+            else:
+                kadt_estimators = ["XGBoost", "LassoCV"]
+            final_estimators = {}
+            for name in kadt_estimators:
+                param_grid = param_grids.get(name, {})
+                print(param_grid)
+                if is_binary:
+                    gs = GridSearchCV(
+                        model_[name],
+                        param_grid=param_grid,
+                        scoring=(
+                            "roc_auc"
+                            if purpose == "classification"
+                            else "neg_mean_squared_error"
+                        ),
+                        cv=cv,
+                        n_jobs=n_jobs,
+                        verbose=verbose,
+                    )
+                else:
+                    gs = GridSearchCV(
+                        model_[name],
+                        param_grid=param_grid,
+                        scoring=(
+                            "roc_auc_ovr"
+                            if purpose == "classification"
+                            else "neg_mean_squared_error"
+                        ),
+                        cv=cv,
+                        n_jobs=n_jobs,
+                        verbose=verbose,
+                    )
+                # Fit GridSearchCV
+                gs.fit(x_train, y_train)
+                final_estimators[name] = gs.best_estimator_
+
+            #* Set up cross-validation and performance evaluation
+            scorer = make_scorer(accuracy_score)
+            cv_results = []
+
+            #* Cross-validate stacking models with different final estimators
+            for final_name, final_estimator in final_estimators.items():
+                print(f"Evaluating Stacking Classifier with {final_name} as final estimator...")
+                if purpose == "classification":
+                    stacking_model = StackingClassifier(estimators=base_estimators, final_estimator=final_estimator, cv=cv)
+                else:
+                    stacking_model = StackingRegressor(estimators=base_estimators, final_estimator=final_estimator, cv=cv)
+
+                scores = cross_val_score(stacking_model, x_train, y_train, cv=StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_state), scoring=scorer)
+
+                # Store the result
+                cv_results.append({
+                    'final_estimator': final_estimator,
+                    'Final Estimator': final_name,
+                    'Mean Accuracy': np.mean(scores),
+                    'Standard Deviation': np.std(scores)
+                })
+
+            #* Convert the results into a DataFrame for easy comparison
+            cv_results_df = pd.DataFrame(cv_results)
+
+            #* Sort and display the best model
+            cv_results_df = cv_results_df.sort_values(by='Mean Accuracy', ascending=False)
+
+
+            # Optionally: Select the final estimator that gives the best performance
+            best_final_estimator = cv_results_df.iloc[0]['final_estimator']
+            print(f"Best final estimator based on cross-validation: {best_final_estimator}")
+        else:
+            print(f" ⤵ trying to find the best_final_estimator for stacking...")
+            if purpose == "classification":
+                best_final_estimator = LogisticRegression(class_weight=class_weight,
+                                                          random_state=random_state,
+                                                          max_iter=1000)
+            else:
+                best_final_estimator = RidgeCV(cv=5)
+        print(f"⤵ the best best_final_estimator: {best_final_estimator}")
+        #! apply stacking
+        if purpose == "classification":
+            print(f" ⤵ StackingClassifier...")
+            stacking_model = StackingClassifier(estimators=base_estimators,
+                                                final_estimator=best_final_estimator,
+                                                cv=cv)
+        else:
+            print(f" ⤵ StackingRegressor...")
+            stacking_model = StackingRegressor(estimators=base_estimators,
+                                               final_estimator=best_final_estimator,
+                                               cv=cv)
+
+        # Train the Stacking Classifier
+        print(f" ⤵ fit & predict...")
+        stacking_model.fit(x_train, y_train)
+        y_pred_final = stacking_model.predict(x_true)
+        print(f" ⤵ collecting results...")
+        # pred_proba
+        if is_binary:
+            if hasattr(stacking_model, "predict_proba"):
+                y_pred_proba_final = stacking_model.predict_proba(x_true)
+                print("Shape of predicted probabilities:", y_pred_proba_final.shape)
+                if y_pred_proba_final.shape[1] == 1:
+                    y_pred_proba_final = np.hstack(
+                        [1 - y_pred_proba_final, y_pred_proba_final]
+                    )  # Add missing class probabilities
+                y_pred_proba_final = y_pred_proba_final[:, 1]
+            elif hasattr(stacking_model, "decision_function"):
+                # If predict_proba is not available, use decision_function (e.g., for SVM)
+                y_pred_proba_final = stacking_model.decision_function(x_true)
+                # Ensure y_pred_proba_final is within 0 and 1 bounds
+                y_pred_proba_final = (y_pred_proba_final - y_pred_proba_final.min()) / (
+                    y_pred_proba_final.max() - y_pred_proba_final.min()
+                )
+            else:
+                y_pred_proba_final = None  # No probability output for certain models
+        if not is_binary:
+            # Handle prediction probabilities for multiclass
+            if hasattr(stacking_model, "predict_proba"):
+                y_pred_proba_final = stacking_model.predict_proba(x_true)
+            elif hasattr(stacking_model, "decision_function"):
+                y_pred_proba_final = stacking_model.decision_function(x_true)
+
+                # Normalize for multiclass if necessary
+                if y_pred_proba_final.ndim == 2:
+                    y_pred_proba_final = (
+                        y_pred_proba_final - y_pred_proba_final.min(axis=1, keepdims=True)
+                    ) / (
+                        y_pred_proba_final.max(axis=1, keepdims=True)
+                        - y_pred_proba_final.min(axis=1, keepdims=True)
+                    )
+            else:
+                y_pred_proba_final = None  # No probability output for certain models
+        #! dict_pred_stack
+        dict_pred_stack = {}
+        validation_scores_final = {}
+        if y_true is not None and y_pred_proba_final is not None:
+            validation_scores_final = cal_metrics(
+                y_true,
+                y_pred_final,
+                y_pred_proba=y_pred_proba_final,
+                is_binary=is_binary,
+                purpose=purpose,
+                average="weighted",
+            )
+            if is_binary:
+                # Calculate ROC curve
+                # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
+                if y_pred_proba_final is not None:
+                    fpr, tpr, _ = roc_curve(y_true, y_pred_proba_final)
+                    lower_ci, upper_ci = cal_auc_ci(
+                        y_true, y_pred_proba_final, verbose=False, is_binary=is_binary
+                    )
+                    roc_auc = auc(fpr, tpr)
+                    roc_info = {
+                        "fpr": fpr.tolist(),
+                        "tpr": tpr.tolist(),
+                        "auc": roc_auc,
+                        "ci95": (lower_ci, upper_ci),
+                    }
+                    # precision-recall curve
+                    precision_, recall_, _ = cal_precision_recall(y_true, y_pred_proba_final)
+                    avg_precision_ = average_precision_score(y_true, y_pred_proba_final)
+                    pr_info = {
+                        "precision": precision_,
+                        "recall": recall_,
+                        "avg_precision": avg_precision_,
+                    }
+                else:
+                    roc_info, pr_info = None, None
+                if purpose == "classification":
+                    dict_pred_stack = {
+                        "best_clf": stacking_model,
+                        "best_params": None,
+                        "auc_indiv": None,
+                        "scores": validation_scores_final,
+                        "roc_curve": roc_info,
+                        "pr_curve": pr_info,
+                        "confusion_matrix": confusion_matrix(y_true, y_pred_final),
+                        "predictions": y_pred_final.tolist(),
+                        "predictions_proba": (
+                            y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
+                        ),
+                    }
+                else:  # "regression"
+                    dict_pred_stack = {
+                        "best_clf": stacking_model,
+                        "best_params": None,
+                        "scores": validation_scores_final,  # e.g., neg_MSE, R², etc.
+                        "predictions": y_pred_final.tolist(),
+                        "predictions_proba": (
+                            y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
+                        ),
+                    }
+            else:  # multi-classes
+                if y_pred_proba_final is not None:
+                    # fpr, tpr, roc_auc = dict(), dict(), dict()
+                    # fpr, tpr, _ = roc_curve(y_true, y_pred_proba_final)
+                    confidence_intervals = cal_auc_ci(
+                        y_true, y_pred_proba_final, verbose=False, is_binary=is_binary
+                    )
+                    roc_info = {
+                        "fpr": validation_scores_final["fpr"],
+                        "tpr": validation_scores_final["tpr"],
+                        "auc": validation_scores_final["roc_auc_by_class"],
+                        "ci95": confidence_intervals,
+                    }
+                    # precision-recall curve
+                    precision_, recall_, avg_precision_ = cal_precision_recall(
+                        y_true, y_pred_proba_final, is_binary=is_binary
+                    )
+                    pr_info = {
+                        "precision": precision_,
+                        "recall": recall_,
+                        "avg_precision": avg_precision_,
+                    }
+                else:
+                    roc_info, pr_info = None, None
+
+                if purpose == "classification":
+                    dict_pred_stack = {
+                        "best_clf": stacking_model,
+                        "best_params": None,
+                        "auc_indiv": None,
+                        "scores": validation_scores_final,
+                        "roc_curve": roc_info,
+                        "pr_curve": pr_info,
+                        "confusion_matrix": confusion_matrix(y_true, y_pred_final),
+                        "predictions": y_pred_final.tolist(),
+                        "predictions_proba": (
+                            y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
+                        ),
+                    }
+                else:  # "regression"
+                    dict_pred_stack = {
+                        "best_clf": stacking_model,
+                        "best_params": None,
+                        "scores": validation_scores_final,  # e.g., neg_MSE, R², etc.
+                        "predictions": y_pred_final.tolist(),
+                        "predictions_proba": (
+                            y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
+                        ),
+                    }
+
+        else:
+            if y_true is None:
+                validation_scores_final = []
+            else:
+                validation_scores_final = cal_metrics(
+                    y_true,
+                    y_pred,
+                    y_pred_proba=y_pred_proba_final,
+                    is_binary=is_binary,
+                    purpose=purpose,
+                    average="weighted",
+                )
+            dict_pred_stack = {
+                "best_clf": stacking_model,
+                "best_params": None,
+                "scores": validation_scores_final,
+                "predictions": y_pred_final.tolist(),
+                "predictions_proba": (
+                    y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
+                ),
+                "y_train": y_train if y_train is not None else [],
+                "y_true": y_true if y_true is not None else [],
+            }
+        # merge together
+        df_pred = pd.DataFrame(
+            [None] * len(df_results.columns), index=df_results.columns, columns=["stack"]).T
+        for k, v in dict_pred_stack.items():
+            if k in df_pred.columns:
+                df_pred[k] = [v]
+
+        # # plot the stacking
+        # if all([plot_, y_true is not None, purpose == "classification"]):
+        #     plot_validate_features_single(df_pred, is_binary=is_binary)
+        #     if dir_save:
+        #         ips.figsave(dir_save + f"validate_features_stacking_{now_}.pdf")
+    if vote:
+        print(f" ⤵ voting...")
+        from sklearn.ensemble import VotingClassifier, VotingRegressor
+        #! Voting
+        n_top_models = min(n_top_models, df_results.shape[0])
+        base_estimators = []
+        for name, cls in zip(list(df_results.iloc[:n_top_models, :].index), df_results.iloc[:n_top_models, :]["best_clf"].tolist()):
+            base_estimators.append((name, cls))
+        # Apply Voting Classifier/Regressor
+        if purpose == "classification":
+            print(f" ⤵ VotingClassifier...via {voting}")
+            if voting == 'hard':
+                # Hard voting does not support `predict_proba`
+                voting_model = VotingClassifier(estimators=base_estimators)
+            else:
+                # Soft voting supports `predict_proba`
+                voting_model = VotingClassifier(estimators=base_estimators, voting="soft")
+        else:
+            print(f" ⤵ VotingRegressor...")
+            voting_model = VotingRegressor(estimators=base_estimators)
+
+        # Train the Voting Classifier/Regressor
+        try:
+            voting_model.fit(x_train, y_train)
+            y_pred_vote = voting_model.predict(x_true)
+        except Exception as e:
+            if purpose == "classification" and not voting == 'hard':
+                voting_model = VotingClassifier(estimators=base_estimators)
+                voting_model.fit(x_train, y_train)
+                y_pred_vote = voting_model.predict(x_true)
+
+        # Calculate predicted probabilities if applicable
+        if purpose == "classification":
+            if hasattr(voting_model, "predict_proba"):
+                y_pred_proba_vote = voting_model.predict_proba(x_true)
+                print("Shape of predicted probabilities:", y_pred_proba_vote.shape)
+                if y_pred_proba_vote.shape[1] == 1:
+                    y_pred_proba_vote = np.hstack(
+                        [1 - y_pred_proba_vote, y_pred_proba_vote]
+                    )  # Add missing class probabilities
+                y_pred_proba_vote = y_pred_proba_vote[:, 1]
+            else:
+                y_pred_proba_vote = None
+        else:  # Regression
+            y_pred_proba_vote = None
+
+        print(f" ⤵ collecting voting results...")
+        #! dict_pred_vote
+        dict_pred_vote = {}
+        validation_scores_vote = {}
+        if y_true is not None and y_pred_proba_vote is not None:
+            validation_scores_vote = cal_metrics(
+                y_true,
+                y_pred_vote,
+                y_pred_proba=y_pred_proba_vote,
+                is_binary=is_binary,
+                purpose=purpose,
+                average="weighted",
+            )
+
+            if is_binary:
+                if y_pred_proba_vote is not None:
+                    fpr, tpr, _ = roc_curve(y_true, y_pred_proba_vote)
+                    lower_ci, upper_ci = cal_auc_ci(
+                        y_true, y_pred_proba_vote, verbose=False, is_binary=is_binary
+                    )
+                    roc_auc = auc(fpr, tpr)
+                    roc_info = {
+                        "fpr": fpr.tolist(),
+                        "tpr": tpr.tolist(),
+                        "auc": roc_auc,
+                        "ci95": (lower_ci, upper_ci),
+                    }
+                    precision_, recall_, _ = cal_precision_recall(y_true, y_pred_proba_vote)
+                    avg_precision_ = average_precision_score(y_true, y_pred_proba_vote)
+                    pr_info = {
+                        "precision": precision_,
+                        "recall": recall_,
+                        "avg_precision": avg_precision_,
+                    }
+                else:
+                    roc_info, pr_info = None, None
+
+                dict_pred_vote = {
+                    "best_clf": voting_model,
+                    "best_params": None,
+                    "auc_indiv": None,
+                    "scores": validation_scores_vote,
+                    "roc_curve": roc_info,
+                    "pr_curve": pr_info,
+                    "confusion_matrix": confusion_matrix(y_true, y_pred_vote),
+                    "predictions": y_pred_vote.tolist(),
+                    "predictions_proba": (
+                        y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
+                    ),
+                }
+            else:  # Multi-class
+                if y_pred_proba_vote is not None:
+                    confidence_intervals = cal_auc_ci(
+                        y_true, y_pred_proba_vote, verbose=False, is_binary=is_binary
+                    )
+                    roc_info = {
+                        "fpr": validation_scores_vote["fpr"],
+                        "tpr": validation_scores_vote["tpr"],
+                        "auc": validation_scores_vote["roc_auc_by_class"],
+                        "ci95": confidence_intervals,
+                    }
+                    precision_, recall_, avg_precision_ = cal_precision_recall(
+                        y_true, y_pred_proba_vote, is_binary=is_binary
+                    )
+                    pr_info = {
+                        "precision": precision_,
+                        "recall": recall_,
+                        "avg_precision": avg_precision_,
+                    }
+                else:
+                    roc_info, pr_info = None, None
+
+                dict_pred_vote = {
+                    "best_clf": voting_model,
+                    "best_params": None,
+                    "scores": validation_scores_vote,
+                    "roc_curve": roc_info,
+                    "pr_curve": pr_info,
+                    "confusion_matrix": confusion_matrix(y_true, y_pred_vote),
+                    "predictions": y_pred_vote.tolist(),
+                    "predictions_proba": (
+                        y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
+                    ),
+                }
+        else:
+            if y_true is None:
+                validation_scores_vote = []
+            else:
+                validation_scores_vote = cal_metrics(
+                    y_true,
+                    y_pred,
+                    y_pred_proba=y_pred_proba_vote,
+                    is_binary=is_binary,
+                    purpose=purpose,
+                    average="weighted",
+                )
+            dict_pred_vote = {
+                "best_clf": voting_model,
+                "best_params": None,
+                "scores": validation_scores_vote,
+                "predictions": y_pred_vote.tolist(),
+                "predictions_proba": (
+                    y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
+                ),
+                "y_train": y_train if y_train is not None else [],
+                "y_true": y_true if y_true is not None else [],
+            }
+        df_vote = pd.DataFrame(
+            [None] * len(df_results.columns), index=df_results.columns, columns=["vote"]).T
+        for k, v in dict_pred_vote.items():
+            if k in df_vote.columns:
+                df_vote[k] = [v]
+
+        # if all([plot_, y_true is not None, purpose == "classification"]):
+        #     try:
+        #         plot_validate_features_single(df_vote, is_binary=is_binary)
+        #         if dir_save:
+        #             ips.figsave(dir_save + f"validate_features_vote_{now_}.pdf")
+        #     except Exception as e:
+        #         print(e)
+    print("Done")
+    if vote and stack:
+        df_res = pd.concat([df_pred, df_vote, df_results], ignore_index=False, axis=0)
+    elif vote:
+        df_res = pd.concat([df_vote, df_results], ignore_index=False, axis=0)
+    elif stack:
+        df_res = pd.concat([df_pred, df_results], ignore_index=False, axis=0)
+
     if all([plot_, y_true is not None, purpose == "classification"]):
+        from datetime import datetime
+
+        now_ = datetime.now().strftime("%y%m%d_%H%M%S")
         # try:
-        if
-        plot_validate_features(
+        if df_res.shape[0] > 3:
+            plot_validate_features(df_res, is_binary=is_binary)
         else:
-        plot_validate_features_single(
+            plot_validate_features_single(df_res, is_binary=is_binary)
         if dir_save:
             ips.figsave(dir_save + f"validate_features{now_}.pdf")
-
-
-        return
-
+        # except Exception as e:
+        #     print(f"Error: a problem occurred while plotting: {e}")
+    return df_res
 
 def cal_metrics(
     y_true,
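Note: the stacking and voting paths added above follow the standard scikit-learn ensemble API: StackingClassifier feeds the base estimators' predictions to a final estimator, while VotingClassifier aggregates them directly ("hard" majority vote, or "soft" probability averaging). A compressed, self-contained sketch of the same idea on synthetic data (not the package's exact code path):

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

X, y = make_classification(n_samples=300, n_features=8, random_state=1)
base_estimators = [
    ("rf", RandomForestClassifier(random_state=1)),
    ("svm", SVC(probability=True, random_state=1)),  # probability=True enables soft voting
]

# stacking: base predictions become features for a final estimator
stack = StackingClassifier(estimators=base_estimators,
                           final_estimator=LogisticRegression(max_iter=1000), cv=5)
# voting: averaged class probabilities ("soft"), no final estimator
vote = VotingClassifier(estimators=base_estimators, voting="soft")

for name, model in [("stack", stack), ("vote", vote)]:
    print(name, cross_val_score(model, X, y, cv=5).mean().round(3))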
@@ -3368,7 +4056,7 @@ def cal_metrics(
 
 
 def plot_trees(
-    X, y, cls, max_trees=500, test_size=0.2, random_state=42, early_stopping_rounds=None
+    X, y, cls: str = 'random', max_trees=500, test_size=0.2, random_state=42, early_stopping_rounds=None
 ):
     """
     # # Example usage:
@@ -3414,10 +4102,14 @@ def plot_trees(
     train_error_rate = []
     test_error_rate = []
     validation_error = None
-
+    if isinstance(cls, str):
+        cls = ips.strcmp(cls, ["RandomForestClassifier", "ExtraTreesClassifier", "AdaBoostClassifier", "GradientBoostingClassifier"])
     # Configure classifier based on type
     oob_enabled = False  # Default to no OOB error unless explicitly set
-
+    clf_support = {"RandomForestClassifier": RandomForestClassifier(),
+                   "ExtraTreesClassifier": ExtraTreesClassifier(),
+                   "AdaBoostClassifier": AdaBoostClassifier(),
+                   "GradientBoostingClassifier": GradientBoostingClassifier()}
     if isinstance(cls, (RandomForestClassifier, ExtraTreesClassifier)):
         # Enable OOB if cls supports it and is using bootstrapping
         cls.set_params(warm_start=True, n_estimators=1)
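Note: plot_trees relies on the warm_start pattern visible in the context lines: with warm_start=True, raising n_estimators and refitting adds trees instead of retraining from scratch, which is what makes an error-versus-number-of-trees curve cheap to compute. A minimal sketch of that pattern (synthetic data; not the package's code):

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=300, n_features=6, random_state=42)
clf = RandomForestClassifier(warm_start=True, oob_score=True,
                             n_estimators=15, random_state=42)
oob_errors = []
for n in range(15, 51):
    clf.set_params(n_estimators=n)  # existing trees are kept; new ones are added
    clf.fit(X, y)
    oob_errors.append(1 - clf.oob_score_)
print(f"OOB error after 50 trees: {oob_errors[-1]:.3f}")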
@@ -3679,7 +4371,7 @@ def img_datasets_preprocessing(
 
 
 def backward_regression(
-    X: pd.DataFrame, y: pd.Series, initial_list=[],
+    X: pd.DataFrame, y: pd.Series, initial_list=[], thr=0.05, verbose=True
 ):
     """
     # awesome bit of code from https://www.kaggle.com/code/adibouayjan/house-price-step-by-step-modeling
@@ -3691,31 +4383,46 @@ def backward_regression(
     X -- features values
     y -- target variable
     initial_list -- features header
-
+    thr -- p-value threshold above which a feature is dropped
     verbose -- true to produce lots of logging output
 
     Returns:
     list of selected features for modeling
     """
     import statsmodels.api as sm
-
-
-
-
-
+    if isinstance(y, str):
+        if y in X.columns:
+            y_col_name = y
+            y = X[y]
+            X = X.drop(y_col_name, axis=1)
+        else:
+            raise ValueError(f"Cannot find {y}: the y argument is set incorrectly")
+    X = X.select_dtypes(include=[np.number])
+
     included = list(X.columns)
+    try:
+        X = X.astype(float)
+        y = y.astype(float)
+    except Exception as e:
+        raise ValueError(f"Unable to convert the data to float, so the statistical analysis cannot proceed: {e}")
+
+
     while True:
         changed = False
+        if not included:
+            print("No features remain in the model.")
+            break
+
         model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
         # exclude the intercept for p-value checking
         pvalues = model.pvalues.iloc[1:]
         worst_pval = pvalues.max()
-        if worst_pval >
+        if worst_pval > thr:
             changed = True
             worst_feature = pvalues.idxmax()
             included.remove(worst_feature)
             if verbose:
-                print(f"Removing
+                print(f"Removing '{worst_feature}' with p-value={round(worst_pval,2)}")
         if not changed:
             break
     print(f"\nSelected Features:\n{included}")