py2ls 0.2.4.23__py3-none-any.whl → 0.2.4.25__py3-none-any.whl
- py2ls/.DS_Store +0 -0
- py2ls/.git/.DS_Store +0 -0
- py2ls/.git/index +0 -0
- py2ls/.git/objects/.DS_Store +0 -0
- py2ls/.git/refs/.DS_Store +0 -0
- py2ls/data/.DS_Store +0 -0
- py2ls/data/styles/.DS_Store +0 -0
- py2ls/ec2ls.py +61 -0
- py2ls/ips.py +297 -229
- py2ls/ml2ls.py +996 -155
- py2ls/nl2ls.py +283 -0
- py2ls/plot.py +351 -40
- {py2ls-0.2.4.23.dist-info → py2ls-0.2.4.25.dist-info}/METADATA +1 -1
- {py2ls-0.2.4.23.dist-info → py2ls-0.2.4.25.dist-info}/RECORD +15 -11
- py2ls/ml2ls copy.py +0 -2906
- {py2ls-0.2.4.23.dist-info → py2ls-0.2.4.25.dist-info}/WHEEL +0 -0
py2ls/ml2ls.py
CHANGED
@@ -5,7 +5,6 @@ from sklearn.ensemble import (
     BaggingClassifier,
 )
 from sklearn.svm import SVC, SVR
-from sklearn.calibration import CalibratedClassifierCV
 from sklearn.model_selection import GridSearchCV, StratifiedKFold
 from sklearn.linear_model import (
     LassoCV,
@@ -16,12 +15,7 @@ from sklearn.linear_model import (
     RidgeClassifierCV,
     ElasticNet,
 )
-
-from sklearn.naive_bayes import GaussianNB
-from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
-import xgboost as xgb  # Make sure you have xgboost installed
-
-from sklearn.model_selection import train_test_split, cross_val_score
+
 from sklearn.metrics import (
     accuracy_score,
     precision_score,
@@ -36,18 +30,12 @@ from sklearn.metrics import (
     precision_recall_curve,
     average_precision_score,
 )
-from imblearn.over_sampling import SMOTE
-from sklearn.pipeline import Pipeline
-from collections import defaultdict
-from sklearn.preprocessing import StandardScaler, OneHotEncoder
 from typing import Dict, Any, Optional, List, Union
 import numpy as np
 import pandas as pd
 from . import ips
 from . import plot
 import matplotlib.pyplot as plt
-import seaborn as sns
-
 plt.style.use(str(ips.get_cwd()) + "/data/styles/stylelib/paper.mplstyle")
 import logging
 import warnings
@@ -314,6 +302,8 @@ def features_svm(
         - Use case: It’s not as widely used as the RBF or linear kernel but can be explored when there is some evidence of non-linear
           S-shaped relationships.
     """
+    from sklearn.feature_selection import RFE
+    from sklearn.svm import SVC
     # SVM (Support Vector Machines)
     svc = SVC(kernel=rfe_params["kernel"])  # ["linear", "rbf", "poly", "sigmoid"]
     # RFE (Recursive Feature Elimination)
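The two imports added above pair RFE with an SVC. A minimal, self-contained sketch of that pattern on synthetic data (note that RFE needs an estimator exposing coef_ or feature_importances_, so a linear kernel is used here):

from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.svm import SVC

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
svc = SVC(kernel="linear")  # RFE requires coef_ or feature_importances_
rfe = RFE(estimator=svc, n_features_to_select=5).fit(X, y)
print(rfe.support_)  # boolean mask of the retained features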
@@ -450,6 +440,7 @@ def validate_classifier(
     Returns:
     - results: Dictionary containing average cv_train_scores and cv_test_scores.
     """
+    from sklearn.model_selection import cross_val_score
     cv_train_scores = {metric: [] for metric in metrics}
     skf = StratifiedKFold(n_splits=cv_folds)
     # Perform cross-validation
@@ -982,6 +973,8 @@ def validate_features(

     """
     from tqdm import tqdm
+    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
+    from sklearn.calibration import CalibratedClassifierCV

     # Ensure common features are selected
     common_features = ips.shared(
@@ -1001,6 +994,7 @@ def validate_features(

     # Handle class imbalance using SMOTE
     if smote:
+        from imblearn.over_sampling import SMOTE
         if (
             y_train.value_counts(normalize=True).max() < 0.8
         ):  # Threshold to decide if data is imbalanced
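For reference, this is the imbalanced-learn call that the lazy import above enables; the data here is synthetic and the threshold logic is omitted:

from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=500, weights=[0.9, 0.1], random_state=0)
X_res, y_res = SMOTE(random_state=0).fit_resample(X, y)  # minority class is oversampled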
@@ -2096,19 +2090,136 @@ def rank_models(
     # )

     # figsave("classifier_performance.pdf")
+def rank_models_reg(df, ascending=False):
+    """
+    Sorts models based on MSE, RMSE, MAE, and R² with custom priority logic.

+    Parameters:
+        df (pd.DataFrame): DataFrame containing the regression metrics.
+        ascending (bool): Whether to sort in ascending order of ranking score.
+
+    Returns:
+        pd.DataFrame: Sorted DataFrame with an added "Ranking_Score" column.
+    """
+    # Define weights for the 4 metrics
+    weights = {
+        "mse": -1,  # Lower is better
+        "rmse": -1,  # Lower is better
+        "mae": -1,  # Lower is better
+        "r2": 1,  # Higher is better
+    }
+
+    # Normalize the selected metrics
+    df = df.copy()  # Work on a copy of the DataFrame
+    for metric, weight in weights.items():
+        if metric in df.columns:
+            if weight > 0:  # Higher is better; normalize 0-1
+                df[metric + "_normalized"] = (df[metric] - df[metric].min()) / (
+                    df[metric].max() - df[metric].min()
+                )
+            else:  # Lower is better; reverse normalize 0-1
+                df[metric + "_normalized"] = (df[metric].max() - df[metric]) / (
+                    df[metric].max() - df[metric].min()
+                )
+
+    # Calculate ranking score as a weighted sum
+    df["Ranking_Score"] = sum(
+        df[metric + "_normalized"] * abs(weights[metric])
+        for metric in weights.keys()
+        if metric + "_normalized" in df.columns
+    )
+
+    # Sort models based on the ranking score
+    sorted_df = df.sort_values(by="Ranking_Score", ascending=ascending)
+    return sorted_df
+
+models_support = {
+    "classification": {
+        "Random Forest": "Tree-Based",
+        "SVM": "Kernel-Based",
+        "Logistic Regression": "Linear",
+        "Lasso Logistic Regression": "Linear",
+        "Gradient Boosting": "Tree-Based",
+        "XGBoost": "Tree-Based",
+        "KNN": "Instance-Based",
+        "Naive Bayes": "Probabilistic",
+        "Linear Discriminant Analysis": "Linear",
+        "AdaBoost": "Tree-Based",
+        "CatBoost": "Tree-Based",
+        "Extra Trees": "Tree-Based",
+        "Bagging": "Tree-Based",
+        "Neural Network": "Neural Network",
+        "DecisionTree": "Tree-Based",
+        "Quadratic Discriminant Analysis": "Probabilistic",
+        "Ridge": "Linear",
+        "Perceptron": "Linear",
+        "Bernoulli Naive Bayes": "Probabilistic",
+        "SGDClassifier": "Linear",
+    },
+    "regression": {
+        "Linear Regression": "Linear",
+        "Ridge": "Linear",
+        "RidgeCV": "Linear",
+        "TheilSenRegressor": "Linear",
+        "HuberRegressor": "Linear",
+        "PoissonRegressor": "Linear",
+        "LassoCV": "Linear",
+        "Bagging": "Tree-Based",
+        "ElasticNet": "Linear",
+        "Random Forest": "Tree-Based",
+        "Gradient Boosting": "Tree-Based",
+        "XGBoost": "Tree-Based",
+        "CatBoost": "Tree-Based",
+        "Extra Trees": "Tree-Based",
+        "SVM": "Kernel-Based",
+        "KNN": "Instance-Based",
+        "Neural Network": "Neural Network",
+        "AdaBoost": "Linear",
+    },
+}
+def select_top_models(models, categories, n_top_models, n_models_per_category=1):
+    """
+    models = list_sort
+    purpose = "regression"
+    categories = models_support[purpose]
+    n_top_models = 3
+    select_top_models(models, categories, n_top_models)
+    """
+    selected = {}
+    result = []
+    for model in models:
+        category = categories.get(model, "Unknown")
+        if category not in selected:
+            selected[category] = 0  # Initialize counter for the category
+
+        if selected[category] < n_models_per_category:  # Allow additional models up to the limit
+            selected[category] += 1
+            result.append(model)
+
+        if len(result) == n_top_models:  # Stop when the desired number of models is reached
+            break
+
+    return result

 def predict(
     x_train: pd.DataFrame,
     y_train: pd.Series,
     x_true: pd.DataFrame = None,
     y_true: Optional[pd.Series] = None,
+    fill_missing: bool = True,
+    scaler: str = 'standard',  # ["standard", "minmax", "robust", "maxabs"]
     backward: bool = False,  # backward_regression
+    backward_thr: float = 0.05,  # p-value threshold; only used when backward is True
     common_features: set = None,
     purpose: str = "classification",  # 'classification' or 'regression'
     cls: Optional[Dict[str, Any]] = None,
     metrics: Optional[List[str]] = None,
-
+    stack: bool = True,  # run stacking
+    stacking_cv: bool = False,  # stacking cross_validate; default (False) keeps it simple
+    vote: bool = True,  # run voting
+    voting: str = "hard",  # only for the classification purpose of voting
+    n_top_models: int = 5,  # for stacking models
+    n_models_per_category: int = 1,  # for stacking models; allows up to that many models per category
     smote: bool = False,
     n_jobs: int = -1,
     plot_: bool = True,
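A hypothetical usage of the new helpers, with invented metric values, showing how the ranking and the per-category cap fit together:

import pandas as pd

scores = pd.DataFrame(
    {"mse": [4.2, 3.1, 5.0], "rmse": [2.05, 1.76, 2.24],
     "mae": [1.6, 1.3, 1.9], "r2": [0.71, 0.80, 0.65]},
    index=["Ridge", "Random Forest", "KNN"],
)
ranked = rank_models_reg(scores)  # best Ranking_Score first
top = select_top_models(list(ranked.index),
                        categories=models_support["regression"],
                        n_top_models=2)  # default: at most one model per category
print(top)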
@@ -2117,6 +2228,7 @@ def predict(
     cv_folds: int = 5,  # more cv_folds gives more stable results, but the AUC may be lower
     cv_level: str = "l",  # "s":'low', "m":'medium', "l":"high"
     class_weight: str = "balanced",
+    random_state: int = 1,
     verbose: bool = False,
 ) -> pd.DataFrame:
     """
@@ -2184,10 +2296,17 @@ def predict(
         RidgeClassifierCV,
         Perceptron,
         SGDClassifier,
+        RidgeCV,
+        Ridge,
+        TheilSenRegressor,
+        HuberRegressor,
+        PoissonRegressor,
+
     )
+    from sklearn.compose import TransformedTargetRegressor
     from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
     from sklearn.naive_bayes import GaussianNB, BernoulliNB
-    from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
+    from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor, StackingClassifier, StackingRegressor
     import xgboost as xgb
     import lightgbm as lgb
     import catboost as cb
@@ -2198,6 +2317,7 @@ def predict(
         QuadraticDiscriminantAnalysis,
     )
     from sklearn.preprocessing import PolynomialFeatures
+    from sklearn.model_selection import train_test_split

     # spell check
     purpose = ips.strcmp(purpose, ["classification", "regression"])[0]
@@ -2206,7 +2326,7 @@ def predict(
     if purpose == "classification":
         model_ = {
             "Random Forest": RandomForestClassifier(
-                random_state=random_state, class_weight=class_weight
+                random_state=random_state, class_weight=class_weight, n_jobs=n_jobs
             ),
             # SVC (Support Vector Classification)
             "SVM": SVC(
@@ -2217,7 +2337,7 @@ def predict(
             ),
             # fit the best model without enforcing sparsity, which means it does not directly perform feature selection.
             "Logistic Regression": LogisticRegression(
-                class_weight=class_weight, random_state=random_state
+                class_weight=class_weight, random_state=random_state, n_jobs=n_jobs
             ),
             # Logistic Regression with L1 Regularization (Lasso)
             "Lasso Logistic Regression": LogisticRegression(
@@ -2228,53 +2348,54 @@ def predict(
                 eval_metric="logloss",
                 random_state=random_state,
             ),
-            "KNN": KNeighborsClassifier(n_neighbors=5),
+            "KNN": KNeighborsClassifier(n_neighbors=5, n_jobs=n_jobs),
             "Naive Bayes": GaussianNB(),
             "Linear Discriminant Analysis": LinearDiscriminantAnalysis(),
             "AdaBoost": AdaBoostClassifier(
                 algorithm="SAMME", random_state=random_state
             ),
-
+            "LightGBM": lgb.LGBMClassifier(random_state=random_state, class_weight=class_weight, n_jobs=n_jobs),
             "CatBoost": cb.CatBoostClassifier(verbose=0, random_state=random_state),
             "Extra Trees": ExtraTreesClassifier(
-                random_state=random_state, class_weight=class_weight
+                random_state=random_state, class_weight=class_weight, n_jobs=n_jobs
             ),
-            "Bagging": BaggingClassifier(random_state=random_state),
+            "Bagging": BaggingClassifier(random_state=random_state, n_jobs=n_jobs),
             "Neural Network": MLPClassifier(max_iter=500, random_state=random_state),
             "DecisionTree": DecisionTreeClassifier(),
             "Quadratic Discriminant Analysis": QuadraticDiscriminantAnalysis(),
             "Ridge": RidgeClassifierCV(
                 class_weight=class_weight, store_cv_results=True
             ),
-            "Perceptron": Perceptron(random_state=random_state),
+            "Perceptron": Perceptron(random_state=random_state, n_jobs=n_jobs),
             "Bernoulli Naive Bayes": BernoulliNB(),
-            "SGDClassifier": SGDClassifier(random_state=random_state),
+            "SGDClassifier": SGDClassifier(random_state=random_state, n_jobs=n_jobs),
         }
     elif purpose == "regression":
         model_ = {
-            "Random Forest": RandomForestRegressor(random_state=random_state),
+            "Random Forest": RandomForestRegressor(random_state=random_state, n_jobs=n_jobs),
             "SVM": SVR(),  # SVR (Support Vector Regression)
             # "Lasso": Lasso(random_state=random_state),  # same as LassoCV (an alpha parameter must be provided)
             "LassoCV": LassoCV(
-                cv=cv_folds, random_state=random_state
+                cv=cv_folds, random_state=random_state, n_jobs=n_jobs
             ),  # LassoCV finds the best alpha automatically and is preferred over Lasso
             "Gradient Boosting": GradientBoostingRegressor(random_state=random_state),
-            "XGBoost": xgb.XGBRegressor(eval_metric="rmse", random_state=random_state),
-            "Linear Regression": LinearRegression(),
-            "Lasso": Lasso(random_state=random_state),
+            "XGBoost": xgb.XGBRegressor(eval_metric="rmse", random_state=random_state, n_jobs=n_jobs),
+            "Linear Regression": LinearRegression(n_jobs=n_jobs),
             "AdaBoost": AdaBoostRegressor(random_state=random_state),
-
+            "LightGBM": lgb.LGBMRegressor(random_state=random_state, n_jobs=n_jobs,
+                force_row_wise=True  # Or use force_col_wise=True if memory is a concern
+            ),
             "CatBoost": cb.CatBoostRegressor(verbose=0, random_state=random_state),
-            "Extra Trees": ExtraTreesRegressor(random_state=random_state),
-            "Bagging": BaggingRegressor(random_state=random_state),
+            "Extra Trees": ExtraTreesRegressor(random_state=random_state, n_jobs=n_jobs),
+            "Bagging": BaggingRegressor(random_state=random_state, n_jobs=n_jobs),
             "Neural Network": MLPRegressor(max_iter=500, random_state=random_state),
             "ElasticNet": ElasticNet(random_state=random_state),
             "Ridge": Ridge(),
-            "KNN": KNeighborsRegressor(),
+            "KNN": KNeighborsRegressor(n_jobs=n_jobs),
+            "TheilSen": TheilSenRegressor(n_jobs=n_jobs),
+            "Huber": HuberRegressor(),
+            "Poisson": PoissonRegressor()
         }
-    # indicate cls:
-    if ips.run_once_within(30):  # 10 min
-        print(f"supported models: {list(model_.keys())}")
     if cls is None:
         models = model_
     else:
@@ -2290,6 +2411,10 @@ def predict(
         ips.df_special_characters_cleaner(x_true) if x_true is not None else None
     )

+    # indicate cls:
+    if ips.run_once_within(30):  # 10 min
+        print(f"processing: {list(models.keys())}")
+    print(isinstance(y_train, str) and y_train in x_train.columns)
     if isinstance(y_train, str) and y_train in x_train.columns:
         y_train_col_name = y_train
         y_train = x_train[y_train]
@@ -2297,6 +2422,7 @@ def predict(
         x_train = x_train.drop(y_train_col_name, axis=1)
     # else:
     #     y_train = ips.df_encoder(pd.DataFrame(y_train), method="dummy").values.ravel()
+
     y_train = pd.DataFrame(y_train)
     if y_train.select_dtypes(include=np.number).empty:
         y_train_ = ips.df_encoder(y_train, method="dummy", drop=None)
@@ -2309,9 +2435,12 @@ def predict(
         y_train = ips.df_encoder(pd.DataFrame(y_train), method="label")
     print("is_binary:", is_binary)

+    if fill_missing:
+        ips.df_fillna(data=x_train, method="knn", inplace=True, axis=0)
+        ips.df_fillna(data=y_train, method="knn", inplace=True, axis=0)
     # Perform backward feature selection
     if backward:
-        selected_features = backward_regression(x_train, y_train,
+        selected_features = backward_regression(x_train, y_train, thr=backward_thr)
         x_train = x_train[selected_features]

     if x_true is None:
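ips.df_fillna(method="knn") is py2ls's own helper; assuming it wraps a KNN imputer, a standalone scikit-learn equivalent of the step above might look like this (illustrative only):

import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer

df = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [4.0, 5.0, np.nan]})
filled = pd.DataFrame(KNNImputer(n_neighbors=2).fit_transform(df), columns=df.columns)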
@@ -2337,6 +2466,8 @@ def predict(
             pd.DataFrame(y_train), method="label"
         ).values.ravel()

+    if fill_missing:
+        ips.df_fillna(data=x_true, method="knn", inplace=True, axis=0)
     if y_true is not None:
         if isinstance(y_true, str) and y_true in x_true.columns:
             y_true_col_name = y_true
@@ -2369,11 +2500,16 @@ def predict(
     # Ensure common features are selected
     if common_features is not None:
         x_train, x_true = x_train[common_features], x_true[common_features]
+        share_col_names = common_features
     else:
         share_col_names = ips.shared(x_train.columns, x_true.columns, verbose=verbose)
         x_train, x_true = x_train[share_col_names], x_true[share_col_names]

-
+    #! scaler
+    # fit the scaler on x_train and export it to transform x_true
+    x_train, scaler_ = ips.df_scaler(x_train, method=scaler, return_scaler=True)
+    #
+    x_true = ips.df_scaler(x_true, scaler=scaler_)  # make sure the same scaler is used
     x_train, x_true = ips.df_encoder(x_train, method="dummy"), ips.df_encoder(
         x_true, method="dummy"
     )
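The point of return_scaler=True is the usual fit-on-train / reuse-on-test discipline. The df_scaler internals belong to py2ls; a plain scikit-learn sketch of the same pattern (an assumed equivalent, not the library's code):

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

x_tr = pd.DataFrame(np.random.default_rng(0).normal(size=(8, 2)), columns=["f1", "f2"])
x_te = pd.DataFrame(np.random.default_rng(1).normal(size=(4, 2)), columns=["f1", "f2"])
scaler_ = StandardScaler().fit(x_tr)  # fit on training data only
x_tr_s = pd.DataFrame(scaler_.transform(x_tr), columns=x_tr.columns)
x_te_s = pd.DataFrame(scaler_.transform(x_te), columns=x_te.columns)  # same scaler reused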
@@ -2395,7 +2531,19 @@ def predict(
         if isinstance(y_train, np.ndarray):
             y_true = ips.df_encoder(data=pd.DataFrame(y_true), method="label")
             y_true = np.asarray(y_true)
+
     # Hyperparameter grids for tuning
+    param_grid_common_xgb = {
+        'learning_rate': [0.01, 0.1, 0.2, 0.3],
+        'max_depth': [3, 5, 7, 10],
+        'n_estimators': [50, 100, 200, 300],
+        'subsample': [0.6, 0.8, 1.0],
+        'colsample_bytree': [0.6, 0.8, 1.0],
+        'gamma': [0, 0.1, 0.2, 0.5],
+        'min_child_weight': [1, 5, 10],
+        'reg_alpha': [0, 0.1, 0.5, 1],  # L1 regularization term
+        'reg_lambda': [1, 1.5, 2],  # L2 regularization term
+    }
     if cv_level in ["low", "simple", "s", "l"]:
         param_grids = {
             "Random Forest": (
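Each of these grids is consumed by GridSearchCV further down. A tiny sub-grid keeps this illustrative run fast (synthetic data, not the library's defaults):

import xgboost as xgb
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV

X, y = make_classification(n_samples=200, random_state=0)
grid = {"learning_rate": [0.01, 0.1], "max_depth": [3], "n_estimators": [50]}
gs = GridSearchCV(xgb.XGBClassifier(eval_metric="logloss"),
                  param_grid=grid, scoring="roc_auc", cv=3, n_jobs=-1)
gs.fit(X, y)
print(gs.best_params_)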
@@ -2440,12 +2588,17 @@ def predict(
                 "min_samples_split": [2],
                 "subsample": [0.8],
             },
-            "XGBoost":
-
-
-
-
-
+            "XGBoost": {
+                'learning_rate': [0.01],
+                'max_depth': [3],
+                'n_estimators': [50],
+                'subsample': [0.6],
+                'colsample_bytree': [0.6],
+                'gamma': [0, 0.1],
+                'min_child_weight': [1],
+                'reg_alpha': [0, 0.1],
+                'reg_lambda': [1],
+                'objective': ['binary:logistic'] if purpose == "classification" else ['reg:squarederror']
             },
             "KNN": (
                 {
@@ -2552,6 +2705,14 @@ def predict(
                 "random_state": [random_state],
                 "learning_rate": ["constant"],
             },
+            "TheilSen": {'max_iter': [100],
+                         'tol': [1e-4],
+                         'n_subsamples': [100 + x_train.shape[1]]},
+            "Huber": {'epsilon': [1.35],
+                      'alpha': [0.1],
+                      'max_iter': [100],},
+            "Poisson": {'alpha': [0.1],
+                        'max_iter': [100],}
         }
     elif cv_level in ["high", "advanced", "h"]:
         param_grids = {
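The new TheilSen/Huber/Poisson grids bring robust and GLM regressors into the search. A short sketch on synthetic data of why HuberRegressor is worth tuning: it resists outliers that pull ordinary least squares off target:

import numpy as np
from sklearn.linear_model import HuberRegressor, LinearRegression

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 1))
y = 3 * X.ravel() + rng.normal(scale=0.5, size=100)
y[:5] += 30  # inject a few gross outliers
print(LinearRegression().fit(X, y).coef_)            # dragged away from 3
print(HuberRegressor(epsilon=1.35).fit(X, y).coef_)  # stays near 3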
@@ -2613,12 +2774,30 @@ def predict(
                 "subsample": [0.8, 1.0],
             },
             "XGBoost": {
-
-
-
-
-
-
+                'learning_rate': [0.01, 0.1, 0.2, 0.3],
+                'max_depth': [3, 5, 7, 10],
+                'n_estimators': [50, 100, 200, 300],
+                'subsample': [0.6, 0.8, 1.0],
+                'gamma': [0, 0.1, 0.2, 0.5],
+                'min_child_weight': [1, 5, 10],
+                'reg_alpha': [0, 0.1, 0.5, 1],
+                'reg_lambda': [1, 1.5, 2],
+                **{
+                    'objective': ['binary:logistic', 'multi:softmax', 'multi:softprob'],
+                }} if purpose == "classification"
+            else {
+                'learning_rate': [0.01, 0.1, 0.2, 0.3],
+                'max_depth': [3, 5, 7, 10],
+                'n_estimators': [50, 100, 200, 300],
+                'subsample': [0.6, 0.8, 1.0],
+                'colsample_bytree': [0.6, 0.8, 1.0],
+                'gamma': [0, 0.1, 0.2, 0.5],
+                'min_child_weight': [1, 5, 10],
+                'reg_alpha': [0, 0.1, 0.5, 1],
+                'reg_lambda': [1, 1.5, 2],
+                **{
+                    'objective': ['reg:squarederror', 'reg:squaredlogerror', 'reg:gamma'],
+                }},
             "KNN": (
                 {
                     "n_neighbors": [1, 3, 5, 10, 15, 20],
@@ -2731,6 +2910,14 @@ def predict(
                 ],  # If True, the regressors X will be normalized
             }
             ),
+            "TheilSen": {'max_iter': [100, 200, 300],
+                         'tol': [1e-4, 1e-3, 1e-2],
+                         'n_subsamples': [100 + x_train.shape[1], 200 + x_train.shape[1], 300 + x_train.shape[1]]},
+            "Huber": {'epsilon': [1.35, 1.5, 2.0],
+                      'alpha': [0.1, 1.0, 10.0],
+                      'max_iter': [100, 200, 300],},
+            "Poisson": {'alpha': [0.1, 1.0, 10.0],
+                        'max_iter': [100, 200, 300],}
         }
     else:  # medium level
         param_grids = {
@@ -2790,12 +2977,30 @@ def predict(
                 "subsample": [0.8, 1.0],
             },
             "XGBoost": {
-
-
-
-
-
-
+                'learning_rate': [0.01, 0.1],
+                'max_depth': [3, 5],
+                'n_estimators': [50, 100],
+                'subsample': [0.6, 0.8],
+                'gamma': [0, 0.1],
+                'min_child_weight': [1, 5],
+                'reg_alpha': [0, 0.1],
+                'reg_lambda': [1,],
+                **{
+                    'objective': ['binary:logistic', 'multi:softmax'],
+                }} if purpose == "classification"
+            else {
+                'learning_rate': [0.01, 0.1],
+                'max_depth': [3, 5,],
+                'n_estimators': [50, 100],
+                'subsample': [0.6, 0.8],
+                'colsample_bytree': [0.6, 0.8],
+                'gamma': [0, 0.1],
+                'min_child_weight': [1, 5],
+                'reg_alpha': [0, 0.1],
+                'reg_lambda': [1, 1.5],
+                **{
+                    'objective': ['reg:squarederror', 'reg:squaredlogerror'],
+                }},
             "KNN": (
                 {
                     "n_neighbors": [3, 5, 7, 10],
@@ -2952,6 +3157,14 @@ def predict(
                 ],  # Solver for optimization
             }
             ),
+            "TheilSen": {'max_iter': [100, 200],
+                         'tol': [1e-4, 1e-3],
+                         'n_subsamples': [100 + x_train.shape[1], 200 + x_train.shape[1]]},
+            "Huber": {'epsilon': [1.35, 1.5],
+                      'alpha': [0.1, 1.0],
+                      'max_iter': [100, 200],},
+            "Poisson": {'alpha': [0.1, 1.0],
+                        'max_iter': [100, 200],}
         }

     results = {}
@@ -2971,83 +3184,124 @@ def predict(
     ):
         if verbose:
             print(f"\nTraining and validating {name}:")
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            )
-
-        gs.fit(x_train, y_train)
-        best_clf = gs.best_estimator_
-        # make sure x_train and x_test have the same columns
-        x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
-        y_pred = best_clf.predict(x_true)
-        if hasattr(best_clf, "predict_proba"):
-            y_pred_proba = best_clf.predict_proba(x_true)
-            print("Shape of predicted probabilities:", y_pred_proba.shape)
-            if y_pred_proba.shape[1] == 1:
-                y_pred_proba = np.hstack(
-                    [1 - y_pred_proba, y_pred_proba]
-                )  # Add missing class probabilities
-            y_pred_proba = y_pred_proba[:, 1]
-        elif hasattr(best_clf, "decision_function"):
-            # If predict_proba is not available, use decision_function (e.g., for SVM)
-            y_pred_proba = best_clf.decision_function(x_true)
-            # Ensure y_pred_proba is within 0 and 1 bounds
-            y_pred_proba = (y_pred_proba - y_pred_proba.min()) / (
-                y_pred_proba.max() - y_pred_proba.min()
+        try:
+            # Grid search with KFold or StratifiedKFold
+            if is_binary:
+                gs = GridSearchCV(
+                    clf,
+                    param_grid=param_grids.get(name, {}),
+                    scoring=(
+                        "roc_auc"
+                        if purpose == "classification"
+                        else "neg_mean_squared_error"
+                    ),
+                    cv=cv,
+                    n_jobs=n_jobs,
+                    verbose=verbose,
                 )

-        else:
-            y_pred_proba = None  # No probability output for certain models
-        else:
-            gs = GridSearchCV(
-                clf,
-                param_grid=param_grids.get(name, {}),
-                scoring=(
-                    "roc_auc_ovr"
-                    if purpose == "classification"
-                    else "neg_mean_squared_error"
-                ),
-                cv=cv,
-                n_jobs=n_jobs,
-                verbose=verbose,
-            )
-
-        # Fit GridSearchCV
-        gs.fit(x_train, y_train)
-        best_clf = gs.best_estimator_
-
-        # Ensure x_true aligns with x_train columns
-        x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
-        y_pred = best_clf.predict(x_true)
-
-        # Handle prediction probabilities for multiclass
-        if hasattr(best_clf, "predict_proba"):
-            y_pred_proba = best_clf.predict_proba(x_true)
-        elif hasattr(best_clf, "decision_function"):
-            y_pred_proba = best_clf.decision_function(x_true)
-
-
-
-
-
-
-
+            gs.fit(x_train, y_train)
+            best_clf = gs.best_estimator_
+            # make sure x_train and x_test have the same columns
+            x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
+            y_pred = best_clf.predict(x_true)
+            if hasattr(best_clf, "predict_proba"):
+                y_pred_proba = best_clf.predict_proba(x_true)
+                print("Shape of predicted probabilities:", y_pred_proba.shape)
+                if y_pred_proba.shape[1] == 1:
+                    y_pred_proba = np.hstack(
+                        [1 - y_pred_proba, y_pred_proba]
+                    )  # Add missing class probabilities
+                y_pred_proba = y_pred_proba[:, 1]
+            elif hasattr(best_clf, "decision_function"):
+                # If predict_proba is not available, use decision_function (e.g., for SVM)
+                y_pred_proba = best_clf.decision_function(x_true)
+                # Ensure y_pred_proba is within 0 and 1 bounds
+                y_pred_proba = (y_pred_proba - y_pred_proba.min()) / (
+                    y_pred_proba.max() - y_pred_proba.min()
                 )
+            else:
+                y_pred_proba = None  # No probability output for certain models
+            # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+            if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                if hasattr(best_clf, "alphas_"):
+                    alphas_ = best_clf.alphas_
+                elif hasattr(best_clf, "alpha_"):
+                    alphas_ = best_clf.alpha_
+                elif hasattr(best_clf, "Cs_"):
+                    alphas_ = best_clf.Cs_
+            else:
+                alphas_ = None
+            coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
         else:
-
+            gs = GridSearchCV(
+                clf,
+                param_grid=param_grids.get(name, {}),
+                scoring=(
+                    "roc_auc_ovr"
+                    if purpose == "classification"
+                    else "neg_mean_squared_error"
+                ),
+                cv=cv,
+                n_jobs=n_jobs,
+                verbose=verbose,
+            )

+            # Fit GridSearchCV
+            gs.fit(x_train, y_train)
+            best_clf = gs.best_estimator_
+
+            # Ensure x_true aligns with x_train columns
+            x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
+
+            # do i need to fit the x_train, y_train again?
+            best_clf = best_clf.fit(x_train, y_train)
+            y_pred = best_clf.predict(x_true)
+
+            # Handle prediction probabilities for multiclass
+            if hasattr(best_clf, "predict_proba"):
+                y_pred_proba = best_clf.predict_proba(x_true)
+            elif hasattr(best_clf, "decision_function"):
+                y_pred_proba = best_clf.decision_function(x_true)
+
+                # Normalize for multiclass if necessary
+                if y_pred_proba.ndim == 2:
+                    y_pred_proba = (
+                        y_pred_proba - y_pred_proba.min(axis=1, keepdims=True)
+                    ) / (
+                        y_pred_proba.max(axis=1, keepdims=True)
+                        - y_pred_proba.min(axis=1, keepdims=True)
+                    )
+            else:
+                y_pred_proba = None  # No probability output for certain models
+            # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+            if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                if hasattr(best_clf, "alphas_"):
+                    alphas_ = best_clf.alphas_
+                elif hasattr(best_clf, "alpha_"):
+                    alphas_ = best_clf.alpha_
+                elif hasattr(best_clf, "Cs_"):
+                    alphas_ = best_clf.Cs_
+            else:
+                alphas_ = None
+            coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
+        except Exception as e:
+            alphas_, coef_ = None, None
+            print(f"skipped {clf}: {e}")
+            continue
+        # try to make the prediction format consistent
+        try:
+            y_pred = [i[0] for i in y_pred]
+        except:
+            pass
+        try:
+            y_true = [i[0] for i in y_true]
+        except:
+            pass
+        try:
+            y_train = [i[0] for i in y_train]
+        except:
+            pass
         validation_scores = {}

         if y_true is not None and y_pred_proba is not None:
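The probability handling above follows a fallback chain: predict_proba if available, otherwise a min-max-scaled decision_function. A self-contained sketch of that chain (the helper name is hypothetical):

import numpy as np

def predict_scores(clf, X):
    """Return scores in [0, 1]: predict_proba if present, else scaled decision_function."""
    if hasattr(clf, "predict_proba"):
        proba = clf.predict_proba(X)
        return proba[:, 1] if proba.shape[1] == 2 else proba
    if hasattr(clf, "decision_function"):
        d = clf.decision_function(X)
        return (d - d.min()) / (d.max() - d.min())
    return None  # e.g., hard-voting ensembles expose neither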
@@ -3097,20 +3351,26 @@ def predict(
                     "roc_curve": roc_info,
                     "pr_curve": pr_info,
                     "confusion_matrix": confusion_matrix(y_true, y_pred),
-                    "predictions": y_pred
+                    "predictions": y_pred,  # .tolist(),
                     "predictions_proba": (
                         y_pred_proba.tolist() if y_pred_proba is not None else None
                     ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
                 }
             else:  # "regression"
                 results[name] = {
                     "best_clf": gs.best_estimator_,
                     "best_params": gs.best_params_,
                     "scores": validation_scores,  # e.g., neg_MSE, R², etc.
-                    "predictions": y_pred
+                    "predictions": y_pred,  # .tolist(),
                     "predictions_proba": (
                         y_pred_proba.tolist() if y_pred_proba is not None else None
                     ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
                 }
         else:  # multi-classes
             if y_pred_proba is not None:
@@ -3149,20 +3409,26 @@ def predict(
                     "roc_curve": roc_info,
                     "pr_curve": pr_info,
                     "confusion_matrix": confusion_matrix(y_true, y_pred),
-                    "predictions": y_pred
+                    "predictions": y_pred,  # .tolist(),
                     "predictions_proba": (
                         y_pred_proba.tolist() if y_pred_proba is not None else None
                     ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
                 }
             else:  # "regression"
                 results[name] = {
                     "best_clf": gs.best_estimator_,
                     "best_params": gs.best_params_,
                     "scores": validation_scores,  # e.g., neg_MSE, R², etc.
-                    "predictions": y_pred
+                    "predictions": y_pred,  # .tolist(),
                     "predictions_proba": (
                         y_pred_proba.tolist() if y_pred_proba is not None else None
                     ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
                 }

         else:
@@ -3181,23 +3447,32 @@ def predict(
                 "best_clf": gs.best_estimator_,
                 "best_params": gs.best_params_,
                 "scores": validation_scores,
-                "predictions": y_pred
+                "predictions": y_pred,  # .tolist(),
                 "predictions_proba": (
                     y_pred_proba.tolist() if y_pred_proba is not None else None
                 ),
+                "features": share_col_names,
                 "y_train": y_train if y_train is not None else [],
                 "y_true": y_true if y_true is not None else [],
+                "coef": coef_,
+                "alphas": alphas_
             }

     # Convert results to DataFrame
     df_results = pd.DataFrame.from_dict(results, orient="index")
     # sort
-    if y_true is not None
-
-
-
+    if y_true is not None:
+        if purpose == "classification":
+            df_scores = pd.DataFrame(
+                df_results["scores"].tolist(), index=df_results["scores"].index
+            ).sort_values(by="roc_auc", ascending=False)
+        elif purpose == 'regression':
+            df_scores = rank_models_reg(
+                pd.DataFrame(df_results["scores"].tolist(), index=df_results["scores"].index),
+                ascending=False)
         df_results = df_results.loc[df_scores.index]

+    if y_true is not None and purpose == "classification":
         if plot_:
             from datetime import datetime

@@ -3215,18 +3490,565 @@ def predict(
             plot.figsets(xangle=30)
         if dir_save:
             ips.figsave(dir_save + f"scores_clus{now_}.pdf")
+    # if all([plot_, y_true is not None, purpose == "classification"]):
+    #     # try:
+    #     if len(models) > 3:
+    #         plot_validate_features(df_results, is_binary=is_binary)
+    #     else:
+    #         plot_validate_features_single(df_results, is_binary=is_binary)
+    #     if dir_save:
+    #         ips.figsave(dir_save + f"validate_features{now_}.pdf")
+    #     # except Exception as e:
+    #     #     print(f"Error: a problem occurred while plotting: {e}")
+    if stack:
+        #! stacking classifier/regressor
+        from sklearn.metrics import make_scorer, accuracy_score
+        from sklearn.model_selection import cross_val_score
+
+        # * keep n_top_models from exceeding the index
+        n_top_models = min(n_top_models, df_results.shape[0])
+
+        # * pick the top-n ranked estimators
+        models_selecte = select_top_models(models=list(df_results.index),
+                                           categories=models_support[purpose],
+                                           n_top_models=n_top_models,
+                                           n_models_per_category=n_models_per_category)
+        top_models = df_results.loc[models_selecte]["best_clf"]
+        base_estimators = []
+        for i, j in top_models.to_dict().items():
+            base_estimators.append((i, j))
+        if stacking_cv:
+            print(f"⤵ stacking_cv is processing...")
+            # * define a few nominal final_estimators
+            # candidate options
+            if purpose == "classification":
+                kadt_estimators = ["XGBoost", "SVM", "Logistic Regression", "Neural Network"]
+            else:
+                kadt_estimators = ["XGBoost", "LassoCV"]
+            final_estimators = {}
+            for name in kadt_estimators:
+                param_grid = param_grids.get(name, {})
+                print(param_grid)
+                if is_binary:
+                    gs = GridSearchCV(
+                        model_[name],
+                        param_grid=param_grid,
+                        scoring=(
+                            "roc_auc"
+                            if purpose == "classification"
+                            else "neg_mean_squared_error"
+                        ),
+                        cv=cv,
+                        n_jobs=n_jobs,
+                        verbose=verbose,
+                    )
+                else:
+                    gs = GridSearchCV(
+                        model_[name],
+                        param_grid=param_grid,
+                        scoring=(
+                            "roc_auc_ovr"
+                            if purpose == "classification"
+                            else "neg_mean_squared_error"
+                        ),
+                        cv=cv,
+                        n_jobs=n_jobs,
+                        verbose=verbose,
+                    )
+                # Fit GridSearchCV
+                gs.fit(x_train, y_train)
+                final_estimators[name] = gs.best_estimator_
+
+            # * Set up cross-validation and performance evaluation
+            scorer = make_scorer(accuracy_score)
+            cv_results = []
+
+            # * Cross-validate stacking models with different final estimators
+            for final_name, final_estimator in final_estimators.items():
+                print(f"Evaluating Stacking Classifier with {final_name} as final estimator...")
+                if purpose == "classification":
+                    stacking_model = StackingClassifier(estimators=base_estimators, final_estimator=final_estimator, cv=cv)
+                else:
+                    stacking_model = StackingRegressor(estimators=base_estimators, final_estimator=final_estimator, cv=cv)
+
+                scores = cross_val_score(stacking_model, x_train, y_train, cv=StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_state), scoring=scorer)
+
+                # Store the result
+                cv_results.append({
+                    'final_estimator': final_estimator,
+                    'Final Estimator': final_name,
+                    'Mean Accuracy': np.mean(scores),
+                    'Standard Deviation': np.std(scores)
+                })
+
+            # * Convert the results into a DataFrame for easy comparison
+            cv_results_df = pd.DataFrame(cv_results)
+
+            # * Sort and display the best model
+            cv_results_df = cv_results_df.sort_values(by='Mean Accuracy', ascending=False)
+
+            # Optionally: Select the final estimator that gives the best performance
+            best_final_estimator = cv_results_df.iloc[0]['final_estimator']
+            print(f"Best final estimator based on cross-validation: {best_final_estimator}")
+        else:
+            print(f"⤵ trying to find the best_final_estimator for stacking...")
+            if purpose == "classification":
+                best_final_estimator = LogisticRegression(class_weight=class_weight,
+                                                          random_state=random_state,
+                                                          max_iter=1000)
+            else:
+                best_final_estimator = RidgeCV(cv=5)
+        print(f"⤵ the best best_final_estimator: {best_final_estimator}")
+        #! apply stacking
+        if purpose == "classification":
+            print(f"⤵ StackingClassifier...")
+            stacking_model = StackingClassifier(estimators=base_estimators,
+                                                final_estimator=best_final_estimator,
+                                                cv=cv)
+        else:
+            print(f"⤵ StackingRegressor...")
+            stacking_model = StackingRegressor(estimators=base_estimators,
+                                               final_estimator=best_final_estimator,
+                                               cv=cv)
+
+        # Train the Stacking Classifier
+        print(f"⤵ fit & predict...")
+        stacking_model.fit(x_train, y_train)
+        y_pred_final = stacking_model.predict(x_true)
+        print(f"⤵ collecting results...")
+        # pred_proba
+        if is_binary:
+            if hasattr(stacking_model, "predict_proba"):
+                y_pred_proba_final = stacking_model.predict_proba(x_true)
+                if y_pred_proba_final.shape[1] == 1:
+                    y_pred_proba_final = np.hstack(
+                        [1 - y_pred_proba_final, y_pred_proba_final]
+                    )  # Add missing class probabilities
+                y_pred_proba_final = y_pred_proba_final[:, 1]
+            elif hasattr(stacking_model, "decision_function"):
+                # If predict_proba is not available, use decision_function (e.g., for SVM)
+                y_pred_proba_final = stacking_model.decision_function(x_true)
+                # Ensure y_pred_proba_final is within 0 and 1 bounds
+                y_pred_proba_final = (y_pred_proba_final - y_pred_proba_final.min()) / (
+                    y_pred_proba_final.max() - y_pred_proba_final.min()
+                )
+            else:
+                y_pred_proba_final = None  # No probability output for certain models
+            # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+            if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                if hasattr(best_clf, "alphas_"):
+                    alphas_ = best_clf.alphas_
+                elif hasattr(best_clf, "alpha_"):
+                    alphas_ = best_clf.alpha_
+                elif hasattr(best_clf, "Cs_"):
+                    alphas_ = best_clf.Cs_
+            else:
+                alphas_ = None
+            coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
+        if not is_binary:
+            # Handle prediction probabilities for multiclass
+            if hasattr(stacking_model, "predict_proba"):
+                y_pred_proba_final = stacking_model.predict_proba(x_true)
+            elif hasattr(stacking_model, "decision_function"):
+                y_pred_proba_final = stacking_model.decision_function(x_true)
+
+                # Normalize for multiclass if necessary
+                if y_pred_proba_final.ndim == 2:
+                    y_pred_proba_final = (
+                        y_pred_proba_final - y_pred_proba_final.min(axis=1, keepdims=True)
+                    ) / (
+                        y_pred_proba_final.max(axis=1, keepdims=True)
+                        - y_pred_proba_final.min(axis=1, keepdims=True)
+                    )
+            else:
+                y_pred_proba_final = None  # No probability output for certain models
+            # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+            if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                if hasattr(best_clf, "alphas_"):
+                    alphas_ = best_clf.alphas_
+                elif hasattr(best_clf, "alpha_"):
+                    alphas_ = best_clf.alpha_
+                elif hasattr(best_clf, "Cs_"):
+                    alphas_ = best_clf.Cs_
+            else:
+                alphas_ = None
+            coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
+        #! dict_pred_stack
+        dict_pred_stack = {}
+        validation_scores_final = {}
+        if y_true is not None and y_pred_proba_final is not None:
+            validation_scores_final = cal_metrics(
+                y_true,
+                y_pred_final,
+                y_pred_proba=y_pred_proba_final,
+                is_binary=is_binary,
+                purpose=purpose,
+                average="weighted",
+            )
+            if is_binary:
+                # Calculate ROC curve
+                # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
+                if y_pred_proba_final is not None:
+                    fpr, tpr, _ = roc_curve(y_true, y_pred_proba_final)
+                    lower_ci, upper_ci = cal_auc_ci(
+                        y_true, y_pred_proba_final, verbose=False, is_binary=is_binary
+                    )
+                    roc_auc = auc(fpr, tpr)
+                    roc_info = {
+                        "fpr": fpr.tolist(),
+                        "tpr": tpr.tolist(),
+                        "auc": roc_auc,
+                        "ci95": (lower_ci, upper_ci),
+                    }
+                    # precision-recall curve
+                    precision_, recall_, _ = cal_precision_recall(y_true, y_pred_proba_final)
+                    avg_precision_ = average_precision_score(y_true, y_pred_proba_final)
+                    pr_info = {
+                        "precision": precision_,
+                        "recall": recall_,
+                        "avg_precision": avg_precision_,
+                    }
+                else:
+                    roc_info, pr_info = None, None
+                if purpose == "classification":
+                    dict_pred_stack = {
+                        "best_clf": stacking_model,
+                        "best_params": None,
+                        "auc_indiv": None,
+                        "scores": validation_scores_final,
+                        "roc_curve": roc_info,
+                        "pr_curve": pr_info,
+                        "confusion_matrix": confusion_matrix(y_true, y_pred_final),
+                        "predictions": y_pred_final.tolist(),
+                        "predictions_proba": (
+                            y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
+                        ),
+                        "features": share_col_names,
+                        "coef": coef_,
+                        "alphas": alphas_
+                    }
+                else:  # "regression"
+                    dict_pred_stack = {
+                        "best_clf": stacking_model,
+                        "best_params": None,
+                        "scores": validation_scores_final,  # e.g., neg_MSE, R², etc.
+                        "predictions": y_pred_final.tolist(),
+                        "predictions_proba": (
+                            y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
+                        ),
+                        "features": share_col_names,
+                        "coef": coef_,
+                        "alphas": alphas_
+                    }
+            else:  # multi-classes
+                if y_pred_proba_final is not None:
+                    # fpr, tpr, roc_auc = dict(), dict(), dict()
+                    # fpr, tpr, _ = roc_curve(y_true, y_pred_proba_final)
+                    confidence_intervals = cal_auc_ci(
+                        y_true, y_pred_proba_final, verbose=False, is_binary=is_binary
+                    )
+                    roc_info = {
+                        "fpr": validation_scores_final["fpr"],
+                        "tpr": validation_scores_final["tpr"],
+                        "auc": validation_scores_final["roc_auc_by_class"],
+                        "ci95": confidence_intervals,
+                    }
+                    # precision-recall curve
+                    precision_, recall_, avg_precision_ = cal_precision_recall(
+                        y_true, y_pred_proba_final, is_binary=is_binary
+                    )
+                    pr_info = {
+                        "precision": precision_,
+                        "recall": recall_,
+                        "avg_precision": avg_precision_,
+                    }
+                else:
+                    roc_info, pr_info = None, None
+
+                if purpose == "classification":
+                    dict_pred_stack = {
+                        "best_clf": stacking_model,
+                        "best_params": None,
+                        "auc_indiv": None,
+                        "scores": validation_scores_final,
+                        "roc_curve": roc_info,
+                        "pr_curve": pr_info,
+                        "confusion_matrix": confusion_matrix(y_true, y_pred_final),
+                        "predictions": y_pred_final.tolist(),
+                        "predictions_proba": (
+                            y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
+                        ),
+                        "features": share_col_names,
+                        "coef": coef_,
+                        "alphas": alphas_
+                    }
+                else:  # "regression"
+                    dict_pred_stack = {
+                        "best_clf": stacking_model,
+                        "best_params": None,
+                        "scores": validation_scores_final,  # e.g., neg_MSE, R², etc.
+                        "predictions": y_pred_final.tolist(),
+                        "predictions_proba": (
+                            y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
+                        ),
+                        "features": share_col_names,
+                        "coef": coef_,
+                        "alphas": alphas_
+                    }
+
+        else:
+            if y_true is None:
+                validation_scores_final = []
+            else:
+                validation_scores_final = cal_metrics(
+                    y_true,
+                    y_pred,
+                    y_pred_proba=y_pred_proba_final,
+                    is_binary=is_binary,
+                    purpose=purpose,
+                    average="weighted",
+                )
+            dict_pred_stack = {
+                "best_clf": stacking_model,
+                "best_params": None,
+                "scores": validation_scores_final,
+                "predictions": y_pred_final.tolist(),
+                "predictions_proba": (
+                    y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
+                ),
+                "features": share_col_names,
+                "y_train": y_train if y_train is not None else [],
+                "y_true": y_true if y_true is not None else [],
+                "coef": coef_,
+                "alphas": alphas_
+            }
+        # merge together
+        df_pred = pd.DataFrame(
+            [None] * len(df_results.columns), index=df_results.columns, columns=["stack"]).T
+        for k, v in dict_pred_stack.items():
+            if k in df_pred.columns:
+                df_pred[k] = [v]
+
+        # # plot the stacking
+        # if all([plot_, y_true is not None, purpose == "classification"]):
+        #     plot_validate_features_single(df_pred, is_binary=is_binary)
+        #     if dir_save:
+        #         ips.figsave(dir_save + f"validate_features_stacking_{now_}.pdf")
+    if vote:
+        print(f"⤵ voting...")
+        from sklearn.ensemble import VotingClassifier, VotingRegressor
+        #! voting
+        n_top_models = min(n_top_models, df_results.shape[0])
+        base_estimators = []
+        for name, cls in zip(list(df_results.iloc[:n_top_models, :].index), df_results.iloc[:n_top_models, :]["best_clf"].tolist()):
+            base_estimators.append((name, cls))
+        # Apply Voting Classifier/Regressor
+        if purpose == "classification":
+            print(f"⤵ VotingClassifier...via {voting}")
+            if voting == 'hard':
+                # Hard voting does not support `predict_proba`
+                voting_model = VotingClassifier(estimators=base_estimators)
+            else:
+                # Soft voting supports `predict_proba`
+                voting_model = VotingClassifier(estimators=base_estimators, voting="soft")
+        else:
+            print(f"⤵ VotingRegressor...")
+            voting_model = VotingRegressor(estimators=base_estimators)
+
+        # Train the Voting Classifier/Regressor
+        try:
+            voting_model.fit(x_train, y_train)
+            y_pred_vote = voting_model.predict(x_true)
+        except Exception as e:
+            if purpose == "classification" and not voting == 'hard':
+                voting_model = VotingClassifier(estimators=base_estimators)
+                voting_model.fit(x_train, y_train)
+                y_pred_vote = voting_model.predict(x_true)
+
+        # Calculate predicted probabilities if applicable
+        if purpose == "classification":
+            if hasattr(voting_model, "predict_proba"):
+                y_pred_proba_vote = voting_model.predict_proba(x_true)
+                print("Shape of predicted probabilities:", y_pred_proba_vote.shape)
+                if y_pred_proba_vote.shape[1] == 1:
+                    y_pred_proba_vote = np.hstack(
+                        [1 - y_pred_proba_vote, y_pred_proba_vote]
+                    )  # Add missing class probabilities
+                y_pred_proba_vote = y_pred_proba_vote[:, 1]
+            else:
+                y_pred_proba_vote = None
+
+            # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+            if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                if hasattr(best_clf, "alphas_"):
+                    alphas_ = best_clf.alphas_
+                elif hasattr(best_clf, "alpha_"):
+                    alphas_ = best_clf.alpha_
+                elif hasattr(best_clf, "Cs_"):
+                    alphas_ = best_clf.Cs_
+            else:
+                alphas_ = None
+            coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
+        else:  # Regression
+            y_pred_proba_vote = None
+            coef_, alphas_ = None, None
+
+        print(f"⤵ collecting voting results...")
+        #! dict_pred_vote
+        dict_pred_vote = {}
+        validation_scores_vote = {}
+        if y_true is not None and y_pred_proba_vote is not None:
+            validation_scores_vote = cal_metrics(
+                y_true,
+                y_pred_vote,
+                y_pred_proba=y_pred_proba_vote,
+                is_binary=is_binary,
+                purpose=purpose,
+                average="weighted",
+            )
+
+            if is_binary:
+                if y_pred_proba_vote is not None:
+                    fpr, tpr, _ = roc_curve(y_true, y_pred_proba_vote)
+                    lower_ci, upper_ci = cal_auc_ci(
+                        y_true, y_pred_proba_vote, verbose=False, is_binary=is_binary
+                    )
+                    roc_auc = auc(fpr, tpr)
+                    roc_info = {
+                        "fpr": fpr.tolist(),
+                        "tpr": tpr.tolist(),
+                        "auc": roc_auc,
+                        "ci95": (lower_ci, upper_ci),
+                    }
+                    precision_, recall_, _ = cal_precision_recall(y_true, y_pred_proba_vote)
+                    avg_precision_ = average_precision_score(y_true, y_pred_proba_vote)
+                    pr_info = {
+                        "precision": precision_,
+                        "recall": recall_,
+                        "avg_precision": avg_precision_,
+                    }
+                else:
+                    roc_info, pr_info = None, None
+
+                dict_pred_vote = {
+                    "best_clf": voting_model,
+                    "best_params": None,
+                    "auc_indiv": None,
+                    "scores": validation_scores_vote,
+                    "roc_curve": roc_info,
+                    "pr_curve": pr_info,
+                    "confusion_matrix": confusion_matrix(y_true, y_pred_vote),
+                    "predictions": y_pred_vote.tolist(),
+                    "predictions_proba": (
+                        y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
+                    ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
+                }
+            else:  # Multi-class
+                if y_pred_proba_vote is not None:
+                    confidence_intervals = cal_auc_ci(
+                        y_true, y_pred_proba_vote, verbose=False, is_binary=is_binary
+                    )
+                    roc_info = {
+                        "fpr": validation_scores_vote["fpr"],
+                        "tpr": validation_scores_vote["tpr"],
+                        "auc": validation_scores_vote["roc_auc_by_class"],
+                        "ci95": confidence_intervals,
+                    }
+                    precision_, recall_, avg_precision_ = cal_precision_recall(
+                        y_true, y_pred_proba_vote, is_binary=is_binary
+                    )
+                    pr_info = {
+                        "precision": precision_,
+                        "recall": recall_,
+                        "avg_precision": avg_precision_,
+                    }
+                else:
+                    roc_info, pr_info = None, None
+
+                dict_pred_vote = {
+                    "best_clf": voting_model,
+                    "best_params": None,
+                    "scores": validation_scores_vote,
+                    "roc_curve": roc_info,
+                    "pr_curve": pr_info,
+                    "confusion_matrix": confusion_matrix(y_true, y_pred_vote),
+                    "predictions": y_pred_vote.tolist(),
+                    "predictions_proba": (
+                        y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
+                    ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
+                }
+        else:
+            if y_true is None:
+                validation_scores_vote = []
+            else:
+                validation_scores_vote = cal_metrics(
+                    y_true,
+                    y_pred,
+                    y_pred_proba=y_pred_proba_vote,
+                    is_binary=is_binary,
+                    purpose=purpose,
+                    average="weighted",
+                )
+            dict_pred_vote = {
+                "best_clf": voting_model,
+                "best_params": None,
+                "scores": validation_scores_vote,
+                "predictions": y_pred_vote.tolist(),
+                "predictions_proba": (
+                    y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
+                ),
+                "features": share_col_names,
+                "y_train": y_train if y_train is not None else [],
+                "y_true": y_true if y_true is not None else [],
+            }
+        df_vote = pd.DataFrame(
+            [None] * len(df_results.columns), index=df_results.columns, columns=["vote"]).T
+        for k, v in dict_pred_vote.items():
+            if k in df_vote.columns:
+                df_vote[k] = [v]
+
+        # if all([plot_, y_true is not None, purpose == "classification"]):
+        #     try:
+        #         plot_validate_features_single(df_vote, is_binary=is_binary)
+        #         if dir_save:
+        #             ips.figsave(dir_save + f"validate_features_vote_{now_}.pdf")
+        #     except Exception as e:
+        #         print(e)
+    print("Done")
+    if vote and stack:
+        df_res = pd.concat([df_pred, df_vote, df_results], ignore_index=False, axis=0)
+    elif vote:
+        df_res = pd.concat([df_vote, df_results], ignore_index=False, axis=0)
+    elif stack:
+        df_res = pd.concat([df_pred, df_results], ignore_index=False, axis=0)
+
     if all([plot_, y_true is not None, purpose == "classification"]):
+        from datetime import datetime
+
+        now_ = datetime.now().strftime("%y%m%d_%H%M%S")
         # try:
-        if
-
+        if df_res.shape[0] > 3:
+            try:
+                plot_validate_features(df_res, is_binary=is_binary)
+            except Exception as e:
+                print(e)
         else:
-
+            try:
+                plot_validate_features_single(df_res, is_binary=is_binary)
+            except Exception as e:
+                print(e)
         if dir_save:
             ips.figsave(dir_save + f"validate_features{now_}.pdf")
-
-
-    return
-
+        # except Exception as e:
+        #     print(f"Error: a problem occurred while plotting: {e}")
+    return df_res

 def cal_metrics(
     y_true,
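A compact sketch of the stacking/voting flow added above, using two fixed base estimators instead of the ranked top-n (synthetic data, illustrative only):

from sklearn.datasets import make_classification
from sklearn.ensemble import (RandomForestClassifier, StackingClassifier,
                              VotingClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

X, y = make_classification(n_samples=300, random_state=0)
base = [("rf", RandomForestClassifier(random_state=0)),
        ("svm", SVC(probability=True, random_state=0))]
stack = StackingClassifier(estimators=base,
                           final_estimator=LogisticRegression(max_iter=1000), cv=5)
vote = VotingClassifier(estimators=base, voting="soft")
for model in (stack, vote):
    model.fit(X, y)
    print(type(model).__name__, model.score(X, y))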
@@ -3368,7 +4190,7 @@ def cal_metrics(


 def plot_trees(
-    X, y, cls, max_trees=500, test_size=0.2, random_state=42, early_stopping_rounds=None
+    X, y, cls: str = 'random', max_trees=500, test_size=0.2, random_state=42, early_stopping_rounds=None
 ):
     """
     # # Example usage:
@@ -3414,10 +4236,14 @@ def plot_trees(
     train_error_rate = []
     test_error_rate = []
     validation_error = None
-
+    if isinstance(cls, str):
+        cls = ips.strcmp(cls, ["RandomForestClassifier", "ExtraTreesClassifier", "AdaBoostClassifier", "GradientBoostingClassifier"])
     # Configure classifier based on type
     oob_enabled = False  # Default to no OOB error unless explicitly set
-
+    clf_support = {"RandomForestClassifier": RandomForestClassifier(),
+                   "ExtraTreesClassifier": ExtraTreesClassifier(),
+                   "AdaBoostClassifier": AdaBoostClassifier(),
+                   "GradientBoostingClassifier": GradientBoostingClassifier()}
     if isinstance(cls, (RandomForestClassifier, ExtraTreesClassifier)):
         # Enable OOB if cls supports it and is using bootstrapping
         cls.set_params(warm_start=True, n_estimators=1)
@@ -3679,7 +4505,7 @@ def img_datasets_preprocessing(


 def backward_regression(
-    X: pd.DataFrame, y: pd.Series, initial_list=[],
+    X: pd.DataFrame, y: pd.Series, initial_list=[], thr=0.05, verbose=True
 ):
     """
     # awesome bit of code from https://www.kaggle.com/code/adibouayjan/house-price-step-by-step-modeling
@@ -3691,31 +4517,46 @@ def backward_regression(
     X -- features values
     y -- target variable
     initial_list -- features header
-
+    thr -- p-value threshold of features to drop
     verbose -- true to produce lots of logging output

     Returns:
         list of selected features for modeling
     """
     import statsmodels.api as sm
-
-
-
-
-
+    if isinstance(y, str):
+        if y in X.columns:
+            y_col_name = y
+            y = X[y]
+            X = X.drop(y_col_name, axis=1)
+        else:
+            raise ValueError(f"Cannot find {y}; the y argument is set incorrectly")
+    X = X.select_dtypes(include=[np.number])
+
     included = list(X.columns)
+    try:
+        X = X.astype(float)
+        y = y.astype(float)
+    except Exception as e:
+        raise ValueError(f"Could not convert the data to float, so the statistical analysis cannot proceed: {e}")
+
+
     while True:
         changed = False
+        if not included:
+            print("No features remain in the model.")
+            break
+
         model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
         # exclude the intercept for p-value checking
         pvalues = model.pvalues.iloc[1:]
         worst_pval = pvalues.max()
-        if worst_pval >
+        if worst_pval > thr:
             changed = True
             worst_feature = pvalues.idxmax()
             included.remove(worst_feature)
             if verbose:
-                print(f"Removing
+                print(f"Removing '{worst_feature}' with p-value={round(worst_pval,2)}")
         if not changed:
             break
     print(f"\nSelected Features:\n{included}")