py2ls 0.2.4.23__py3-none-any.whl → 0.2.4.25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py2ls/.DS_Store +0 -0
- py2ls/.git/.DS_Store +0 -0
- py2ls/.git/index +0 -0
- py2ls/.git/objects/.DS_Store +0 -0
- py2ls/.git/refs/.DS_Store +0 -0
- py2ls/data/.DS_Store +0 -0
- py2ls/data/styles/.DS_Store +0 -0
- py2ls/ec2ls.py +61 -0
- py2ls/ips.py +297 -229
- py2ls/ml2ls.py +996 -155
- py2ls/nl2ls.py +283 -0
- py2ls/plot.py +351 -40
- {py2ls-0.2.4.23.dist-info → py2ls-0.2.4.25.dist-info}/METADATA +1 -1
- {py2ls-0.2.4.23.dist-info → py2ls-0.2.4.25.dist-info}/RECORD +15 -11
- py2ls/ml2ls copy.py +0 -2906
- {py2ls-0.2.4.23.dist-info → py2ls-0.2.4.25.dist-info}/WHEEL +0 -0
py2ls/ml2ls.py
CHANGED
@@ -5,7 +5,6 @@ from sklearn.ensemble import (
     BaggingClassifier,
 )
 from sklearn.svm import SVC, SVR
-from sklearn.calibration import CalibratedClassifierCV
 from sklearn.model_selection import GridSearchCV, StratifiedKFold
 from sklearn.linear_model import (
     LassoCV,
|
@@ -16,12 +15,7 @@ from sklearn.linear_model import (
     RidgeClassifierCV,
     ElasticNet,
 )
-
-from sklearn.naive_bayes import GaussianNB
-from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
-import xgboost as xgb  # Make sure you have xgboost installed
-
-from sklearn.model_selection import train_test_split, cross_val_score
+
 from sklearn.metrics import (
     accuracy_score,
     precision_score,
@@ -36,18 +30,12 @@ from sklearn.metrics import (
     precision_recall_curve,
     average_precision_score,
 )
-from imblearn.over_sampling import SMOTE
-from sklearn.pipeline import Pipeline
-from collections import defaultdict
-from sklearn.preprocessing import StandardScaler, OneHotEncoder
 from typing import Dict, Any, Optional, List, Union
 import numpy as np
 import pandas as pd
 from . import ips
 from . import plot
 import matplotlib.pyplot as plt
-import seaborn as sns
-
 plt.style.use(str(ips.get_cwd()) + "/data/styles/stylelib/paper.mplstyle")
 import logging
 import warnings
@@ -314,6 +302,8 @@ def features_svm(
     - Use case: It’s not as widely used as the RBF or linear kernel but can be explored when there is some evidence of non-linear
     S-shaped relationships.
     """
+    from sklearn.feature_selection import RFE
+    from sklearn.svm import SVC
     # SVM (Support Vector Machines)
     svc = SVC(kernel=rfe_params["kernel"])  # ["linear", "rbf", "poly", "sigmoid"]
     # RFE(Recursive Feature Elimination)
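For reference, a minimal standalone sketch of the RFE-with-SVC pattern this hunk feeds (synthetic data, not the py2ls pipeline; note RFE needs an estimator exposing coef_ or feature_importances_, so of the listed kernels only "linear" works here):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.svm import SVC

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
svc = SVC(kernel="linear")                        # linear kernel exposes coef_
rfe = RFE(estimator=svc, n_features_to_select=5)  # recursively drop weakest features
rfe.fit(X, y)
print(rfe.support_)   # boolean mask of kept features
print(rfe.ranking_)   # 1 = selected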
@@ -450,6 +440,7 @@ def validate_classifier(
     Returns:
     - results: Dictionary containing average cv_train_scores and cv_test_scores.
     """
+    from sklearn.model_selection import cross_val_score
     cv_train_scores = {metric: [] for metric in metrics}
     skf = StratifiedKFold(n_splits=cv_folds)
     # Perform cross-validation
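The cross_val_score + StratifiedKFold combination used by validate_classifier looks like this in isolation (a sketch on synthetic data):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score

X, y = make_classification(n_samples=300, random_state=0)
skf = StratifiedKFold(n_splits=5)  # preserves class proportions per fold
scores = cross_val_score(LogisticRegression(max_iter=1000), X, y, cv=skf, scoring="roc_auc")
print(scores.mean(), scores.std())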
@@ -982,6 +973,8 @@ def validate_features(

     """
     from tqdm import tqdm
+    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
+    from sklearn.calibration import CalibratedClassifierCV

     # Ensure common features are selected
     common_features = ips.shared(
@@ -1001,6 +994,7 @@ def validate_features(

     # Handle class imbalance using SMOTE
     if smote:
+        from imblearn.over_sampling import SMOTE
         if (
             y_train.value_counts(normalize=True).max() < 0.8
         ):  # Threshold to decide if data is imbalanced
@@ -2096,19 +2090,136 @@ def rank_models(
 # )

 # figsave("classifier_performance.pdf")
+def rank_models_reg(df, ascending=False):
+    """
+    Sorts models based on MSE, RMSE, MAE, and R² with custom priority logic.

+    Parameters:
+    df (pd.DataFrame): DataFrame containing the regression metrics.
+    ascending (bool): Whether to sort in ascending order of ranking score.
+
+    Returns:
+    pd.DataFrame: Sorted DataFrame with an added "Ranking_Score" column.
+    """
+    # Define weights for the 4 metrics
+    weights = {
+        "mse": -1,  # Lower is better
+        "rmse": -1,  # Lower is better
+        "mae": -1,  # Lower is better
+        "r2": 1,  # Higher is better
+    }
+
+    # Normalize the selected metrics
+    df = df.copy()  # Work on a copy of the DataFrame
+    for metric, weight in weights.items():
+        if metric in df.columns:
+            if weight > 0:  # Higher is better; normalize 0-1
+                df[metric + "_normalized"] = (df[metric] - df[metric].min()) / (
+                    df[metric].max() - df[metric].min()
+                )
+            else:  # Lower is better; reverse normalize 0-1
+                df[metric + "_normalized"] = (df[metric].max() - df[metric]) / (
+                    df[metric].max() - df[metric].min()
+                )
+
+    # Calculate ranking score as a weighted sum
+    df["Ranking_Score"] = sum(
+        df[metric + "_normalized"] * abs(weights[metric])
+        for metric in weights.keys()
+        if metric + "_normalized" in df.columns
+    )
+
+    # Sort models based on the ranking score
+    sorted_df = df.sort_values(by="Ranking_Score", ascending=ascending)
+    return sorted_df
+
+models_support = {
+    "classification": {
+        "Random Forest": "Tree-Based",
+        "SVM": "Kernel-Based",
+        "Logistic Regression": "Linear",
+        "Lasso Logistic Regression": "Linear",
+        "Gradient Boosting": "Tree-Based",
+        "XGBoost": "Tree-Based",
+        "KNN": "Instance-Based",
+        "Naive Bayes": "Probabilistic",
+        "Linear Discriminant Analysis": "Linear",
+        "AdaBoost": "Tree-Based",
+        "CatBoost": "Tree-Based",
+        "Extra Trees": "Tree-Based",
+        "Bagging": "Tree-Based",
+        "Neural Network": "Neural Network",
+        "DecisionTree": "Tree-Based",
+        "Quadratic Discriminant Analysis": "Probabilistic",
+        "Ridge": "Linear",
+        "Perceptron": "Linear",
+        "Bernoulli Naive Bayes": "Probabilistic",
+        "SGDClassifier": "Linear",
+    },
+    "regression": {
+        "Linear Regression": "Linear",
+        "Ridge": "Linear",
+        "RidgeCV": "Linear",
+        "TheilSenRegressor": "Linear",
+        "HuberRegressor": "Linear",
+        "PoissonRegressor": "Linear",
+        "LassoCV": "Linear",
+        "Bagging": "Tree-Based",
+        "ElasticNet": "Linear",
+        "Random Forest": "Tree-Based",
+        "Gradient Boosting": "Tree-Based",
+        "XGBoost": "Tree-Based",
+        "CatBoost": "Tree-Based",
+        "Extra Trees": "Tree-Based",
+        "SVM": "Kernel-Based",
+        "KNN": "Instance-Based",
+        "Neural Network": "Neural Network",
+        "AdaBoost": "Linear",
+    },
+}
+def select_top_models(models, categories, n_top_models, n_models_per_category=1):
+    """
+    models = list_sort
+    purpose = "regression"
+    categories = models_support[purpose]
+    n_top_models = 3
+    select_top_models(models, categories, n_top_models)
+    """
+    selected = {}
+    result = []
+    for model in models:
+        category = categories.get(model, "Unknown")
+        if category not in selected:
+            selected[category] = 0  # Initialize counter for the category
+
+        if selected[category] < n_models_per_category:  # Allow additional models up to the limit
+            selected[category] += 1
+            result.append(model)
+
+        if len(result) == n_top_models:  # Stop when the desired number of models is reached
+            break
+
+    return result

 def predict(
     x_train: pd.DataFrame,
     y_train: pd.Series,
     x_true: pd.DataFrame = None,
     y_true: Optional[pd.Series] = None,
+    fill_missing: bool = True,
+    scaler: str = "standard",  # ["standard", "minmax", "robust", "maxabs"]
     backward: bool = False,  # backward_regression
+    backward_thr: float = 0.05,  # p-value threshold; only used when backward is True
     common_features: set = None,
     purpose: str = "classification",  # 'classification' or 'regression'
     cls: Optional[Dict[str, Any]] = None,
     metrics: Optional[List[str]] = None,
-
+    stack: bool = True,  # run stacking
+    stacking_cv: bool = False,  # stacking cross_validate; default (False) keeps it simple
+    vote: bool = True,  # run voting
+    voting: str = "hard",  # only for classification voting
+    n_top_models: int = 5,  # for stacking models
+    n_models_per_category: int = 1,  # for stacking models; allows up to this many models per category
     smote: bool = False,
     n_jobs: int = -1,
     plot_: bool = True,
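A quick sketch of how the two new helpers compose (the metric values below are made up; the column names follow the weights dict above):

from py2ls import ml2ls
import pandas as pd

df_metrics = pd.DataFrame(
    {"mse": [1.2, 0.8, 2.0], "rmse": [1.1, 0.9, 1.4],
     "mae": [0.9, 0.7, 1.2], "r2": [0.71, 0.80, 0.55]},
    index=["Linear Regression", "Random Forest", "KNN"],
)
ranked = ml2ls.rank_models_reg(df_metrics, ascending=False)  # best model first
top = ml2ls.select_top_models(list(ranked.index),
                              ml2ls.models_support["regression"],
                              n_top_models=2, n_models_per_category=1)

select_top_models walks the ranked list and caps how many models may come from the same category, so the stacking ensemble mixes model families rather than taking, say, three tree-based learners.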
@@ -2117,6 +2228,7 @@ def predict(
     cv_folds: int = 5,  # more cv_folds gives more stable results, but AUC may be lower
     cv_level: str = "l",  # "s": 'low', "m": 'medium', "l": "high"
     class_weight: str = "balanced",
+    random_state: int = 1,
     verbose: bool = False,
 ) -> pd.DataFrame:
     """
@@ -2184,10 +2296,17 @@ def predict(
         RidgeClassifierCV,
         Perceptron,
         SGDClassifier,
+        RidgeCV,
+        Ridge,
+        TheilSenRegressor,
+        HuberRegressor,
+        PoissonRegressor,
+
     )
+    from sklearn.compose import TransformedTargetRegressor
     from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
     from sklearn.naive_bayes import GaussianNB, BernoulliNB
-    from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
+    from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor, StackingClassifier, StackingRegressor
    import xgboost as xgb
    import lightgbm as lgb
    import catboost as cb
@@ -2198,6 +2317,7 @@ def predict(
         QuadraticDiscriminantAnalysis,
     )
     from sklearn.preprocessing import PolynomialFeatures
+    from sklearn.model_selection import train_test_split

     # spelling check
     purpose = ips.strcmp(purpose, ["classification", "regression"])[0]
@@ -2206,7 +2326,7 @@ def predict(
     if purpose == "classification":
         model_ = {
             "Random Forest": RandomForestClassifier(
-                random_state=random_state, class_weight=class_weight
+                random_state=random_state, class_weight=class_weight, n_jobs=n_jobs
             ),
             # SVC (Support Vector Classification)
             "SVM": SVC(
@@ -2217,7 +2337,7 @@ def predict(
             ),
             # fit the best model without enforcing sparsity, which means it does not directly perform feature selection.
             "Logistic Regression": LogisticRegression(
-                class_weight=class_weight, random_state=random_state
+                class_weight=class_weight, random_state=random_state, n_jobs=n_jobs
             ),
             # Logistic Regression with L1 Regularization (Lasso)
             "Lasso Logistic Regression": LogisticRegression(
@@ -2228,53 +2348,54 @@ def predict(
                 eval_metric="logloss",
                 random_state=random_state,
             ),
-            "KNN": KNeighborsClassifier(n_neighbors=5),
+            "KNN": KNeighborsClassifier(n_neighbors=5, n_jobs=n_jobs),
             "Naive Bayes": GaussianNB(),
             "Linear Discriminant Analysis": LinearDiscriminantAnalysis(),
             "AdaBoost": AdaBoostClassifier(
                 algorithm="SAMME", random_state=random_state
             ),
-
+            "LightGBM": lgb.LGBMClassifier(random_state=random_state, class_weight=class_weight, n_jobs=n_jobs),
             "CatBoost": cb.CatBoostClassifier(verbose=0, random_state=random_state),
             "Extra Trees": ExtraTreesClassifier(
-                random_state=random_state, class_weight=class_weight
+                random_state=random_state, class_weight=class_weight, n_jobs=n_jobs
             ),
-            "Bagging": BaggingClassifier(random_state=random_state),
+            "Bagging": BaggingClassifier(random_state=random_state, n_jobs=n_jobs),
             "Neural Network": MLPClassifier(max_iter=500, random_state=random_state),
             "DecisionTree": DecisionTreeClassifier(),
             "Quadratic Discriminant Analysis": QuadraticDiscriminantAnalysis(),
             "Ridge": RidgeClassifierCV(
                 class_weight=class_weight, store_cv_results=True
             ),
-            "Perceptron": Perceptron(random_state=random_state),
+            "Perceptron": Perceptron(random_state=random_state, n_jobs=n_jobs),
             "Bernoulli Naive Bayes": BernoulliNB(),
-            "SGDClassifier": SGDClassifier(random_state=random_state),
+            "SGDClassifier": SGDClassifier(random_state=random_state, n_jobs=n_jobs),
         }
     elif purpose == "regression":
         model_ = {
-            "Random Forest": RandomForestRegressor(random_state=random_state),
+            "Random Forest": RandomForestRegressor(random_state=random_state, n_jobs=n_jobs),
             "SVM": SVR(),  # SVR (Support Vector Regression)
             # "Lasso": Lasso(random_state=random_state),  # same as LassoCV (but alpha must be provided),
             "LassoCV": LassoCV(
-                cv=cv_folds, random_state=random_state
+                cv=cv_folds, random_state=random_state, n_jobs=n_jobs
             ),  # LassoCV finds the best alpha automatically, so it is preferable to Lasso
             "Gradient Boosting": GradientBoostingRegressor(random_state=random_state),
-            "XGBoost": xgb.XGBRegressor(eval_metric="rmse", random_state=random_state),
-            "Linear Regression": LinearRegression(),
-            "Lasso": Lasso(random_state=random_state),
+            "XGBoost": xgb.XGBRegressor(eval_metric="rmse", random_state=random_state, n_jobs=n_jobs),
+            "Linear Regression": LinearRegression(n_jobs=n_jobs),
             "AdaBoost": AdaBoostRegressor(random_state=random_state),
-
+            "LightGBM": lgb.LGBMRegressor(random_state=random_state, n_jobs=n_jobs,
+                                          force_row_wise=True  # Or use force_col_wise=True if memory is a concern
+                                          ),
             "CatBoost": cb.CatBoostRegressor(verbose=0, random_state=random_state),
-            "Extra Trees": ExtraTreesRegressor(random_state=random_state),
-            "Bagging": BaggingRegressor(random_state=random_state),
+            "Extra Trees": ExtraTreesRegressor(random_state=random_state, n_jobs=n_jobs),
+            "Bagging": BaggingRegressor(random_state=random_state, n_jobs=n_jobs),
             "Neural Network": MLPRegressor(max_iter=500, random_state=random_state),
             "ElasticNet": ElasticNet(random_state=random_state),
             "Ridge": Ridge(),
-            "KNN": KNeighborsRegressor(),
+            "KNN": KNeighborsRegressor(n_jobs=n_jobs),
+            "TheilSen": TheilSenRegressor(n_jobs=n_jobs),
+            "Huber": HuberRegressor(),
+            "Poisson": PoissonRegressor()
         }
-    # indicate cls:
-    if ips.run_once_within(30):  # 10 min
-        print(f"supported models: {list(model_.keys())}")
     if cls is None:
         models = model_
     else:
@@ -2290,6 +2411,10 @@ def predict(
         ips.df_special_characters_cleaner(x_true) if x_true is not None else None
     )

+    # indicate cls:
+    if ips.run_once_within(30):  # 10 min
+        print(f"processing: {list(models.keys())}")
+    print(isinstance(y_train, str) and y_train in x_train.columns)
     if isinstance(y_train, str) and y_train in x_train.columns:
         y_train_col_name = y_train
         y_train = x_train[y_train]
@@ -2297,6 +2422,7 @@ def predict(
         x_train = x_train.drop(y_train_col_name, axis=1)
     # else:
     #     y_train = ips.df_encoder(pd.DataFrame(y_train), method="dummy").values.ravel()
+
     y_train = pd.DataFrame(y_train)
     if y_train.select_dtypes(include=np.number).empty:
         y_train_ = ips.df_encoder(y_train, method="dummy", drop=None)
@@ -2309,9 +2435,12 @@ def predict(
         y_train = ips.df_encoder(pd.DataFrame(y_train), method="label")
     print("is_binary:", is_binary)

+    if fill_missing:
+        ips.df_fillna(data=x_train, method="knn", inplace=True, axis=0)
+        ips.df_fillna(data=y_train, method="knn", inplace=True, axis=0)
     # Perform backward feature selection
     if backward:
-        selected_features = backward_regression(x_train, y_train,
+        selected_features = backward_regression(x_train, y_train, thr=backward_thr)
         x_train = x_train[selected_features]

     if x_true is None:
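The "knn" fill is presumably a KNN-based imputation; the plain scikit-learn equivalent of what ips.df_fillna does here would look roughly like this (a sketch, not the py2ls implementation):

import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer

df = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [4.0, 5.0, np.nan]})
imputer = KNNImputer(n_neighbors=2)  # fill NaNs from the nearest rows
df_filled = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)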
@@ -2337,6 +2466,8 @@ def predict(
             pd.DataFrame(y_train), method="label"
         ).values.ravel()

+    if fill_missing:
+        ips.df_fillna(data=x_true, method="knn", inplace=True, axis=0)
     if y_true is not None:
         if isinstance(y_true, str) and y_true in x_true.columns:
             y_true_col_name = y_true
@@ -2369,11 +2500,16 @@ def predict(
     # Ensure common features are selected
     if common_features is not None:
         x_train, x_true = x_train[common_features], x_true[common_features]
+        share_col_names = common_features
     else:
         share_col_names = ips.shared(x_train.columns, x_true.columns, verbose=verbose)
         x_train, x_true = x_train[share_col_names], x_true[share_col_names]

-
+    #! scaler
+    # fit the scaler on x_train and export it to transform x_true
+    x_train, scaler_ = ips.df_scaler(x_train, method=scaler, return_scaler=True)
+    #
+    x_true = ips.df_scaler(x_true, scaler=scaler_)  # make sure the same scaler is used
     x_train, x_true = ips.df_encoder(x_train, method="dummy"), ips.df_encoder(
         x_true, method="dummy"
     )
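The scaler change follows the usual fit-on-train / transform-both convention; a standalone sketch with scikit-learn instead of ips.df_scaler (toy frames for illustration):

import pandas as pd
from sklearn.preprocessing import StandardScaler

x_train_df = pd.DataFrame({"f1": [1.0, 2.0, 3.0], "f2": [10.0, 20.0, 30.0]})
x_true_df = pd.DataFrame({"f1": [2.5], "f2": [15.0]})

scaler_ = StandardScaler().fit(x_train_df)  # fit on training data only
x_train_scaled = pd.DataFrame(scaler_.transform(x_train_df),
                              columns=x_train_df.columns, index=x_train_df.index)
x_true_scaled = pd.DataFrame(scaler_.transform(x_true_df),  # reuse the same fitted scaler
                             columns=x_true_df.columns, index=x_true_df.index)

Reusing the fitted scaler on x_true avoids leaking test-set statistics into preprocessing.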
@@ -2395,7 +2531,19 @@ def predict(
     if isinstance(y_train, np.ndarray):
         y_true = ips.df_encoder(data=pd.DataFrame(y_true), method="label")
         y_true = np.asarray(y_true)
+
     # Hyperparameter grids for tuning
+    param_grid_common_xgb = {
+        'learning_rate': [0.01, 0.1, 0.2, 0.3],
+        'max_depth': [3, 5, 7, 10],
+        'n_estimators': [50, 100, 200, 300],
+        'subsample': [0.6, 0.8, 1.0],
+        'colsample_bytree': [0.6, 0.8, 1.0],
+        'gamma': [0, 0.1, 0.2, 0.5],
+        'min_child_weight': [1, 5, 10],
+        'reg_alpha': [0, 0.1, 0.5, 1],  # L1 regularization term
+        'reg_lambda': [1, 1.5, 2],  # L2 regularization term
+    }
     if cv_level in ["low", "simple", "s", "l"]:
         param_grids = {
             "Random Forest": (
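A grid like param_grid_common_xgb plugs into GridSearchCV in the standard way; a shrunken, runnable sketch on synthetic data (not the full grid above, which would be expensive):

import xgboost as xgb
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV

X, y = make_classification(n_samples=200, random_state=0)
grid = {"learning_rate": [0.01, 0.1], "max_depth": [3, 5], "n_estimators": [50]}
gs = GridSearchCV(xgb.XGBClassifier(eval_metric="logloss"), param_grid=grid,
                  scoring="roc_auc", cv=3, n_jobs=-1)
gs.fit(X, y)
print(gs.best_params_)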
@@ -2440,12 +2588,17 @@ def predict(
                 "min_samples_split": [2],
                 "subsample": [0.8],
             },
-            "XGBoost":
-
-
-
-
-
+            "XGBoost": {
+                'learning_rate': [0.01],
+                'max_depth': [3],
+                'n_estimators': [50],
+                'subsample': [0.6],
+                'colsample_bytree': [0.6],
+                'gamma': [0, 0.1],
+                'min_child_weight': [1],
+                'reg_alpha': [0, 0.1],
+                'reg_lambda': [1],
+                'objective': ['binary:logistic'] if purpose == "classification" else ['reg:squarederror']
             },
             "KNN": (
                 {
@@ -2552,6 +2705,14 @@ def predict(
                 "random_state": [random_state],
                 "learning_rate": ["constant"],
             },
+            "TheilSen": {'max_iter': [100],
+                         'tol': [1e-4],
+                         'n_subsamples': [100 + x_train.shape[1]]},
+            "Huber": {'epsilon': [1.35],
+                      'alpha': [0.1],
+                      'max_iter': [100],},
+            "Poisson": {'alpha': [0.1],
+                        'max_iter': [100],}
         }
     elif cv_level in ["high", "advanced", "h"]:
         param_grids = {
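The n_subsamples offset by x_train.shape[1] appears to account for scikit-learn's constraint that TheilSenRegressor's n_subsamples be at least the number of features (plus one when fitting an intercept). A minimal sketch of the three newly gridded regressors:

from sklearn.datasets import make_regression
from sklearn.linear_model import TheilSenRegressor, HuberRegressor, PoissonRegressor

X, y = make_regression(n_samples=200, n_features=5, noise=1.0, random_state=0)
ts = TheilSenRegressor(n_subsamples=100 + X.shape[1]).fit(X, y)
hb = HuberRegressor(epsilon=1.35, alpha=0.1, max_iter=100).fit(X, y)
ps = PoissonRegressor(alpha=0.1, max_iter=100).fit(X, y - y.min() + 1)  # Poisson needs non-negative targets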
@@ -2613,12 +2774,30 @@ def predict(
                 "subsample": [0.8, 1.0],
             },
             "XGBoost": {
-
-
-
-
-
-
+                'learning_rate': [0.01, 0.1, 0.2, 0.3],
+                'max_depth': [3, 5, 7, 10],
+                'n_estimators': [50, 100, 200, 300],
+                'subsample': [0.6, 0.8, 1.0],
+                'gamma': [0, 0.1, 0.2, 0.5],
+                'min_child_weight': [1, 5, 10],
+                'reg_alpha': [0, 0.1, 0.5, 1],
+                'reg_lambda': [1, 1.5, 2],
+                **{
+                    'objective': ['binary:logistic', 'multi:softmax', 'multi:softprob'],
+                }} if purpose == "classification"
+            else {
+                'learning_rate': [0.01, 0.1, 0.2, 0.3],
+                'max_depth': [3, 5, 7, 10],
+                'n_estimators': [50, 100, 200, 300],
+                'subsample': [0.6, 0.8, 1.0],
+                'colsample_bytree': [0.6, 0.8, 1.0],
+                'gamma': [0, 0.1, 0.2, 0.5],
+                'min_child_weight': [1, 5, 10],
+                'reg_alpha': [0, 0.1, 0.5, 1],
+                'reg_lambda': [1, 1.5, 2],
+                **{
+                    'objective': ['reg:squarederror', 'reg:squaredlogerror', 'reg:gamma'],
+                }},
             "KNN": (
                 {
                     "n_neighbors": [1, 3, 5, 10, 15, 20],
@@ -2731,6 +2910,14 @@ def predict(
                 ],  # If True, the regressors X will be normalized
             }
         ),
+            "TheilSen": {'max_iter': [100, 200, 300],
+                         'tol': [1e-4, 1e-3, 1e-2],
+                         'n_subsamples': [100 + x_train.shape[1], 200 + x_train.shape[1], 300 + x_train.shape[1]]},
+            "Huber": {'epsilon': [1.35, 1.5, 2.0],
+                      'alpha': [0.1, 1.0, 10.0],
+                      'max_iter': [100, 200, 300],},
+            "Poisson": {'alpha': [0.1, 1.0, 10.0],
+                        'max_iter': [100, 200, 300],}
         }
     else:  # medium level
         param_grids = {
@@ -2790,12 +2977,30 @@ def predict(
                 "subsample": [0.8, 1.0],
             },
             "XGBoost": {
-
-
-
-
-
-
+                'learning_rate': [0.01, 0.1],
+                'max_depth': [3, 5],
+                'n_estimators': [50, 100],
+                'subsample': [0.6, 0.8],
+                'gamma': [0, 0.1],
+                'min_child_weight': [1, 5],
+                'reg_alpha': [0, 0.1],
+                'reg_lambda': [1,],
+                **{
+                    'objective': ['binary:logistic', 'multi:softmax'],
+                }} if purpose == "classification"
+            else {
+                'learning_rate': [0.01, 0.1],
+                'max_depth': [3, 5,],
+                'n_estimators': [50, 100],
+                'subsample': [0.6, 0.8],
+                'colsample_bytree': [0.6, 0.8],
+                'gamma': [0, 0.1],
+                'min_child_weight': [1, 5],
+                'reg_alpha': [0, 0.1],
+                'reg_lambda': [1, 1.5],
+                **{
+                    'objective': ['reg:squarederror', 'reg:squaredlogerror'],
+                }},
             "KNN": (
                 {
                     "n_neighbors": [3, 5, 7, 10],
@@ -2952,6 +3157,14 @@ def predict(
                 ],  # Solver for optimization
             }
         ),
+            "TheilSen": {'max_iter': [100, 200],
+                         'tol': [1e-4, 1e-3],
+                         'n_subsamples': [100 + x_train.shape[1], 200 + x_train.shape[1]]},
+            "Huber": {'epsilon': [1.35, 1.5],
+                      'alpha': [0.1, 1.0],
+                      'max_iter': [100, 200],},
+            "Poisson": {'alpha': [0.1, 1.0],
+                        'max_iter': [100, 200],}
         }

     results = {}
@@ -2971,83 +3184,124 @@ def predict(
     ):
         if verbose:
             print(f"\nTraining and validating {name}:")
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            )
-
-            gs.fit(x_train, y_train)
-            best_clf = gs.best_estimator_
-            # make sure x_train and x_test have the same column names
-            x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
-            y_pred = best_clf.predict(x_true)
-            if hasattr(best_clf, "predict_proba"):
-                y_pred_proba = best_clf.predict_proba(x_true)
-                print("Shape of predicted probabilities:", y_pred_proba.shape)
-                if y_pred_proba.shape[1] == 1:
-                    y_pred_proba = np.hstack(
-                        [1 - y_pred_proba, y_pred_proba]
-                    )  # Add missing class probabilities
-                y_pred_proba = y_pred_proba[:, 1]
-            elif hasattr(best_clf, "decision_function"):
-                # If predict_proba is not available, use decision_function (e.g., for SVM)
-                y_pred_proba = best_clf.decision_function(x_true)
-                # Ensure y_pred_proba is within 0 and 1 bounds
-                y_pred_proba = (y_pred_proba - y_pred_proba.min()) / (
-                    y_pred_proba.max() - y_pred_proba.min()
+        try:
+            # Grid search with KFold or StratifiedKFold
+            if is_binary:
+                gs = GridSearchCV(
+                    clf,
+                    param_grid=param_grids.get(name, {}),
+                    scoring=(
+                        "roc_auc"
+                        if purpose == "classification"
+                        else "neg_mean_squared_error"
+                    ),
+                    cv=cv,
+                    n_jobs=n_jobs,
+                    verbose=verbose,
                 )
-            else:
-                y_pred_proba = None  # No probability output for certain models
-        else:
-            gs = GridSearchCV(
-                clf,
-                param_grid=param_grids.get(name, {}),
-                scoring=(
-                    "roc_auc_ovr"
-                    if purpose == "classification"
-                    else "neg_mean_squared_error"
-                ),
-                cv=cv,
-                n_jobs=n_jobs,
-                verbose=verbose,
-            )
-
-            # Fit GridSearchCV
-            gs.fit(x_train, y_train)
-            best_clf = gs.best_estimator_
-
-            # Ensure x_true aligns with x_train columns
-            x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
-            y_pred = best_clf.predict(x_true)
-
-            # Handle prediction probabilities for multiclass
-            if hasattr(best_clf, "predict_proba"):
-                y_pred_proba = best_clf.predict_proba(x_true)
-            elif hasattr(best_clf, "decision_function"):
-                y_pred_proba = best_clf.decision_function(x_true)
-
-
-
-
-
-
-
-
+                gs.fit(x_train, y_train)
+                best_clf = gs.best_estimator_
+                # make sure x_train and x_test have the same column names
+                x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
+                y_pred = best_clf.predict(x_true)
+                if hasattr(best_clf, "predict_proba"):
+                    y_pred_proba = best_clf.predict_proba(x_true)
+                    print("Shape of predicted probabilities:", y_pred_proba.shape)
+                    if y_pred_proba.shape[1] == 1:
+                        y_pred_proba = np.hstack(
+                            [1 - y_pred_proba, y_pred_proba]
+                        )  # Add missing class probabilities
+                    y_pred_proba = y_pred_proba[:, 1]
+                elif hasattr(best_clf, "decision_function"):
+                    # If predict_proba is not available, use decision_function (e.g., for SVM)
+                    y_pred_proba = best_clf.decision_function(x_true)
+                    # Ensure y_pred_proba is within 0 and 1 bounds
+                    y_pred_proba = (y_pred_proba - y_pred_proba.min()) / (
+                        y_pred_proba.max() - y_pred_proba.min()
                    )
+                else:
+                    y_pred_proba = None  # No probability output for certain models
+                # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+                if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                    if hasattr(best_clf, "alphas_"):
+                        alphas_ = best_clf.alphas_
+                    elif hasattr(best_clf, "alpha_"):
+                        alphas_ = best_clf.alpha_
+                    elif hasattr(best_clf, "Cs_"):
+                        alphas_ = best_clf.Cs_
+                else:
+                    alphas_ = None
+                coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
             else:
-
+                gs = GridSearchCV(
+                    clf,
+                    param_grid=param_grids.get(name, {}),
+                    scoring=(
+                        "roc_auc_ovr"
+                        if purpose == "classification"
+                        else "neg_mean_squared_error"
+                    ),
+                    cv=cv,
+                    n_jobs=n_jobs,
+                    verbose=verbose,
+                )

+                # Fit GridSearchCV
+                gs.fit(x_train, y_train)
+                best_clf = gs.best_estimator_
+
+                # Ensure x_true aligns with x_train columns
+                x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
+
+                # do i need to fit the x_train, y_train again?
+                best_clf = best_clf.fit(x_train, y_train)
+                y_pred = best_clf.predict(x_true)
+
+                # Handle prediction probabilities for multiclass
+                if hasattr(best_clf, "predict_proba"):
+                    y_pred_proba = best_clf.predict_proba(x_true)
+                elif hasattr(best_clf, "decision_function"):
+                    y_pred_proba = best_clf.decision_function(x_true)
+
+                # Normalize for multiclass if necessary
+                if y_pred_proba.ndim == 2:
+                    y_pred_proba = (
+                        y_pred_proba - y_pred_proba.min(axis=1, keepdims=True)
+                    ) / (
+                        y_pred_proba.max(axis=1, keepdims=True)
+                        - y_pred_proba.min(axis=1, keepdims=True)
+                    )
+                else:
+                    y_pred_proba = None  # No probability output for certain models
+                # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+                if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                    if hasattr(best_clf, "alphas_"):
+                        alphas_ = best_clf.alphas_
+                    elif hasattr(best_clf, "alpha_"):
+                        alphas_ = best_clf.alpha_
+                    elif hasattr(best_clf, "Cs_"):
+                        alphas_ = best_clf.Cs_
+                else:
+                    alphas_ = None
+                coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
+        except Exception as e:
+            alphas_, coef_ = None, None
+            print(f"skipped {clf}: {e}")
+            continue
+        # try to make the prediction format consistent
+        try:
+            y_pred = [i[0] for i in y_pred]
+        except:
+            pass
+        try:
+            y_true = [i[0] for i in y_true]
+        except:
+            pass
+        try:
+            y_train = [i[0] for i in y_train]
+        except:
+            pass
         validation_scores = {}

         if y_true is not None and y_pred_proba is not None:
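The decision_function fallback min-max rescales raw margins into [0, 1] so they can stand in for probabilities downstream; a tiny numpy illustration of the same transform:

import numpy as np

scores = np.array([-2.0, 0.5, 3.0])  # raw SVM-style margins
proba_like = (scores - scores.min()) / (scores.max() - scores.min())
print(proba_like)  # [0.  0.5 1. ]

Note this is only a monotone rescaling, not a calibrated probability.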
@@ -3097,20 +3351,26 @@ def predict(
                     "roc_curve": roc_info,
                     "pr_curve": pr_info,
                     "confusion_matrix": confusion_matrix(y_true, y_pred),
-                    "predictions": y_pred
+                    "predictions": y_pred,  # .tolist(),
                     "predictions_proba": (
                         y_pred_proba.tolist() if y_pred_proba is not None else None
                     ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
                 }
             else:  # "regression"
                 results[name] = {
                     "best_clf": gs.best_estimator_,
                     "best_params": gs.best_params_,
                     "scores": validation_scores,  # e.g., neg_MSE, R², etc.
-                    "predictions": y_pred
+                    "predictions": y_pred,  # .tolist(),
                     "predictions_proba": (
                         y_pred_proba.tolist() if y_pred_proba is not None else None
                     ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
                 }
         else:  # multi-classes
             if y_pred_proba is not None:
@@ -3149,20 +3409,26 @@ def predict(
                     "roc_curve": roc_info,
                     "pr_curve": pr_info,
                     "confusion_matrix": confusion_matrix(y_true, y_pred),
-                    "predictions": y_pred
+                    "predictions": y_pred,  # .tolist(),
                     "predictions_proba": (
                         y_pred_proba.tolist() if y_pred_proba is not None else None
                     ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
                 }
             else:  # "regression"
                 results[name] = {
                     "best_clf": gs.best_estimator_,
                     "best_params": gs.best_params_,
                     "scores": validation_scores,  # e.g., neg_MSE, R², etc.
-                    "predictions": y_pred
+                    "predictions": y_pred,  # .tolist(),
                     "predictions_proba": (
                         y_pred_proba.tolist() if y_pred_proba is not None else None
                     ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
                 }

     else:
@@ -3181,23 +3447,32 @@ def predict(
                 "best_clf": gs.best_estimator_,
                 "best_params": gs.best_params_,
                 "scores": validation_scores,
-                "predictions": y_pred
+                "predictions": y_pred,  # .tolist(),
                 "predictions_proba": (
                     y_pred_proba.tolist() if y_pred_proba is not None else None
                 ),
+                "features": share_col_names,
                 "y_train": y_train if y_train is not None else [],
                 "y_true": y_true if y_true is not None else [],
+                "coef": coef_,
+                "alphas": alphas_
             }

     # Convert results to DataFrame
     df_results = pd.DataFrame.from_dict(results, orient="index")
     # sort
-    if y_true is not None
-
-
-
+    if y_true is not None:
+        if purpose == "classification":
+            df_scores = pd.DataFrame(
+                df_results["scores"].tolist(), index=df_results["scores"].index
+            ).sort_values(by="roc_auc", ascending=False)
+        elif purpose == 'regression':
+            df_scores = rank_models_reg(
+                pd.DataFrame(df_results["scores"].tolist(), index=df_results["scores"].index),
+                ascending=False)
     df_results = df_results.loc[df_scores.index]

+    if y_true is not None and purpose == "classification":
         if plot_:
             from datetime import datetime

@@ -3215,18 +3490,565 @@ def predict(
             plot.figsets(xangle=30)
             if dir_save:
                 ips.figsave(dir_save + f"scores_clus{now_}.pdf")
+    # if all([plot_, y_true is not None, purpose == "classification"]):
+    #     # try:
+    #     if len(models) > 3:
+    #         plot_validate_features(df_results, is_binary=is_binary)
+    #     else:
+    #         plot_validate_features_single(df_results, is_binary=is_binary)
+    #     if dir_save:
+    #         ips.figsave(dir_save + f"validate_features{now_}.pdf")
+    #     # except Exception as e:
+    #     #     print(f"Error: a problem occurred while plotting: {e}")
+    if stack:
+        #! stacking classifier/regressor
+        from sklearn.metrics import make_scorer, accuracy_score
+        from sklearn.model_selection import cross_val_score
+
+        #* cap n_top_models so it does not exceed the available index
+        n_top_models = min(n_top_models, df_results.shape[0])
+
+        #* select the top-ranked n estimators
+        models_selecte = select_top_models(models=list(df_results.index),
+                                           categories=models_support[purpose],
+                                           n_top_models=n_top_models,
+                                           n_models_per_category=n_models_per_category)
+        top_models = df_results.loc[models_selecte]["best_clf"]
+        base_estimators = []
+        for i, j in top_models.to_dict().items():
+            base_estimators.append((i, j))
+        if stacking_cv:
+            print(f"⤵ stacking_cv is processing...")
+            #* define a few representative final_estimators
+            # the candidate choices
+            if purpose == "classification":
+                kadt_estimators = ["XGBoost", "SVM", "Logistic Regression", "Neural Network"]
+            else:
+                kadt_estimators = ["XGBoost", "LassoCV"]
+            final_estimators = {}
+            for name in kadt_estimators:
+                param_grid = param_grids.get(name, {})
+                print(param_grid)
+                if is_binary:
+                    gs = GridSearchCV(
+                        model_[name],
+                        param_grid=param_grid,
+                        scoring=(
+                            "roc_auc"
+                            if purpose == "classification"
+                            else "neg_mean_squared_error"
+                        ),
+                        cv=cv,
+                        n_jobs=n_jobs,
+                        verbose=verbose,
+                    )
+                else:
+                    gs = GridSearchCV(
+                        model_[name],
+                        param_grid=param_grid,
+                        scoring=(
+                            "roc_auc_ovr"
+                            if purpose == "classification"
+                            else "neg_mean_squared_error"
+                        ),
+                        cv=cv,
+                        n_jobs=n_jobs,
+                        verbose=verbose,
+                    )
+                # Fit GridSearchCV
+                gs.fit(x_train, y_train)
+                final_estimators[name] = gs.best_estimator_
+
+            #* Set up cross-validation and performance evaluation
+            scorer = make_scorer(accuracy_score)
+            cv_results = []
+
+            #* Cross-validate stacking models with different final estimators
+            for final_name, final_estimator in final_estimators.items():
+                print(f"Evaluating Stacking Classifier with {final_name} as final estimator...")
+                if purpose == "classification":
+                    stacking_model = StackingClassifier(estimators=base_estimators, final_estimator=final_estimator, cv=cv)
+                else:
+                    stacking_model = StackingRegressor(estimators=base_estimators, final_estimator=final_estimator, cv=cv)
+
+                scores = cross_val_score(stacking_model, x_train, y_train, cv=StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_state), scoring=scorer)
+
+                # Store the result
+                cv_results.append({
+                    'final_estimator': final_estimator,
+                    'Final Estimator': final_name,
+                    'Mean Accuracy': np.mean(scores),
+                    'Standard Deviation': np.std(scores)
+                })
+
+            #* Convert the results into a DataFrame for easy comparison
+            cv_results_df = pd.DataFrame(cv_results)
+
+            #* Sort and display the best model
+            cv_results_df = cv_results_df.sort_values(by='Mean Accuracy', ascending=False)
+
+            # Optionally: Select the final estimator that gives the best performance
+            best_final_estimator = cv_results_df.iloc[0]['final_estimator']
+            print(f"Best final estimator based on cross-validation: {best_final_estimator}")
+        else:
+            print(f"⤵ trying to find the best_final_estimator for stacking...")
+            if purpose == "classification":
+                best_final_estimator = LogisticRegression(class_weight=class_weight,
+                                                          random_state=random_state,
+                                                          max_iter=1000)
+            else:
+                best_final_estimator = RidgeCV(cv=5)
+        print(f"⤵ the best best_final_estimator: {best_final_estimator}")
+        #! apply stacking
+        if purpose == "classification":
+            print(f"⤵ StackingClassifier...")
+            stacking_model = StackingClassifier(estimators=base_estimators,
+                                                final_estimator=best_final_estimator,
+                                                cv=cv)
+        else:
+            print(f"⤵ StackingRegressor...")
+            stacking_model = StackingRegressor(estimators=base_estimators,
+                                               final_estimator=best_final_estimator,
+                                               cv=cv)
+
+        # Train the Stacking Classifier
+        print(f"⤵ fit & predict...")
+        stacking_model.fit(x_train, y_train)
+        y_pred_final = stacking_model.predict(x_true)
+        print(f"⤵ collecting results...")
+        # pred_proba
+        if is_binary:
+            if hasattr(stacking_model, "predict_proba"):
+                y_pred_proba_final = stacking_model.predict_proba(x_true)
+                if y_pred_proba_final.shape[1] == 1:
+                    y_pred_proba_final = np.hstack(
+                        [1 - y_pred_proba_final, y_pred_proba_final]
+                    )  # Add missing class probabilities
+                y_pred_proba_final = y_pred_proba_final[:, 1]
+            elif hasattr(stacking_model, "decision_function"):
+                # If predict_proba is not available, use decision_function (e.g., for SVM)
+                y_pred_proba_final = stacking_model.decision_function(x_true)
+                # Ensure y_pred_proba_final is within 0 and 1 bounds
+                y_pred_proba_final = (y_pred_proba_final - y_pred_proba_final.min()) / (
+                    y_pred_proba_final.max() - y_pred_proba_final.min()
+                )
+            else:
+                y_pred_proba_final = None  # No probability output for certain models
+            # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+            if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                if hasattr(best_clf, "alphas_"):
+                    alphas_ = best_clf.alphas_
+                elif hasattr(best_clf, "alpha_"):
+                    alphas_ = best_clf.alpha_
+                elif hasattr(best_clf, "Cs_"):
+                    alphas_ = best_clf.Cs_
+            else:
+                alphas_ = None
+            coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
+        if not is_binary:
+            # Handle prediction probabilities for multiclass
+            if hasattr(stacking_model, "predict_proba"):
+                y_pred_proba_final = stacking_model.predict_proba(x_true)
+            elif hasattr(stacking_model, "decision_function"):
+                y_pred_proba_final = stacking_model.decision_function(x_true)
+
+            # Normalize for multiclass if necessary
+            if y_pred_proba_final.ndim == 2:
+                y_pred_proba_final = (
+                    y_pred_proba_final - y_pred_proba_final.min(axis=1, keepdims=True)
+                ) / (
+                    y_pred_proba_final.max(axis=1, keepdims=True)
+                    - y_pred_proba_final.min(axis=1, keepdims=True)
+                )
+            else:
+                y_pred_proba_final = None  # No probability output for certain models
+            # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+            if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                if hasattr(best_clf, "alphas_"):
+                    alphas_ = best_clf.alphas_
+                elif hasattr(best_clf, "alpha_"):
+                    alphas_ = best_clf.alpha_
+                elif hasattr(best_clf, "Cs_"):
+                    alphas_ = best_clf.Cs_
+            else:
+                alphas_ = None
+            coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
+        #! dict_pred_stack
+        dict_pred_stack = {}
+        validation_scores_final = {}
+        if y_true is not None and y_pred_proba_final is not None:
+            validation_scores_final = cal_metrics(
+                y_true,
+                y_pred_final,
+                y_pred_proba=y_pred_proba_final,
+                is_binary=is_binary,
+                purpose=purpose,
+                average="weighted",
+            )
+            if is_binary:
+                # Calculate ROC curve
+                # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
+                if y_pred_proba_final is not None:
+                    fpr, tpr, _ = roc_curve(y_true, y_pred_proba_final)
+                    lower_ci, upper_ci = cal_auc_ci(
+                        y_true, y_pred_proba_final, verbose=False, is_binary=is_binary
+                    )
+                    roc_auc = auc(fpr, tpr)
+                    roc_info = {
+                        "fpr": fpr.tolist(),
+                        "tpr": tpr.tolist(),
+                        "auc": roc_auc,
+                        "ci95": (lower_ci, upper_ci),
+                    }
+                    # precision-recall curve
+                    precision_, recall_, _ = cal_precision_recall(y_true, y_pred_proba_final)
+                    avg_precision_ = average_precision_score(y_true, y_pred_proba_final)
+                    pr_info = {
+                        "precision": precision_,
+                        "recall": recall_,
+                        "avg_precision": avg_precision_,
+                    }
+                else:
+                    roc_info, pr_info = None, None
+                if purpose == "classification":
+                    dict_pred_stack = {
+                        "best_clf": stacking_model,
+                        "best_params": None,
+                        "auc_indiv": None,
+                        "scores": validation_scores_final,
+                        "roc_curve": roc_info,
+                        "pr_curve": pr_info,
+                        "confusion_matrix": confusion_matrix(y_true, y_pred_final),
+                        "predictions": y_pred_final.tolist(),
+                        "predictions_proba": (
+                            y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
+                        ),
+                        "features": share_col_names,
+                        "coef": coef_,
+                        "alphas": alphas_
+                    }
+                else:  # "regression"
+                    dict_pred_stack = {
+                        "best_clf": stacking_model,
+                        "best_params": None,
+                        "scores": validation_scores_final,  # e.g., neg_MSE, R², etc.
+                        "predictions": y_pred_final.tolist(),
+                        "predictions_proba": (
+                            y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
+                        ),
+                        "features": share_col_names,
+                        "coef": coef_,
+                        "alphas": alphas_
+                    }
+            else:  # multi-classes
+                if y_pred_proba_final is not None:
+                    # fpr, tpr, roc_auc = dict(), dict(), dict()
+                    # fpr, tpr, _ = roc_curve(y_true, y_pred_proba_final)
+                    confidence_intervals = cal_auc_ci(
+                        y_true, y_pred_proba_final, verbose=False, is_binary=is_binary
+                    )
+                    roc_info = {
+                        "fpr": validation_scores_final["fpr"],
+                        "tpr": validation_scores_final["tpr"],
+                        "auc": validation_scores_final["roc_auc_by_class"],
+                        "ci95": confidence_intervals,
+                    }
+                    # precision-recall curve
+                    precision_, recall_, avg_precision_ = cal_precision_recall(
+                        y_true, y_pred_proba_final, is_binary=is_binary
+                    )
+                    pr_info = {
+                        "precision": precision_,
+                        "recall": recall_,
+                        "avg_precision": avg_precision_,
+                    }
+                else:
+                    roc_info, pr_info = None, None
+
+                if purpose == "classification":
+                    dict_pred_stack = {
+                        "best_clf": stacking_model,
+                        "best_params": None,
+                        "auc_indiv": None,
+                        "scores": validation_scores_final,
+                        "roc_curve": roc_info,
+                        "pr_curve": pr_info,
+                        "confusion_matrix": confusion_matrix(y_true, y_pred_final),
+                        "predictions": y_pred_final.tolist(),
+                        "predictions_proba": (
+                            y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
+                        ),
+                        "features": share_col_names,
+                        "coef": coef_,
+                        "alphas": alphas_
+                    }
+                else:  # "regression"
+                    dict_pred_stack = {
+                        "best_clf": stacking_model,
+                        "best_params": None,
+                        "scores": validation_scores_final,  # e.g., neg_MSE, R², etc.
+                        "predictions": y_pred_final.tolist(),
+                        "predictions_proba": (
+                            y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
+                        ),
+                        "features": share_col_names,
+                        "coef": coef_,
+                        "alphas": alphas_
+                    }
+
+        else:
+            if y_true is None:
+                validation_scores_final = []
+            else:
+                validation_scores_final = cal_metrics(
+                    y_true,
+                    y_pred,
+                    y_pred_proba=y_pred_proba_final,
+                    is_binary=is_binary,
+                    purpose=purpose,
+                    average="weighted",
+                )
+            dict_pred_stack = {
+                "best_clf": stacking_model,
+                "best_params": None,
+                "scores": validation_scores_final,
+                "predictions": y_pred_final.tolist(),
+                "predictions_proba": (
+                    y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
+                ),
+                "features": share_col_names,
+                "y_train": y_train if y_train is not None else [],
+                "y_true": y_true if y_true is not None else [],
+                "coef": coef_,
+                "alphas": alphas_
+            }
+        # merge together
+        df_pred = pd.DataFrame(
+            [None] * len(df_results.columns), index=df_results.columns, columns=["stack"]).T
+        for k, v in dict_pred_stack.items():
+            if k in df_pred.columns:
+                df_pred[k] = [v]
+
+        # # plot the stacking
+        # if all([plot_, y_true is not None, purpose == "classification"]):
+        #     plot_validate_features_single(df_pred, is_binary=is_binary)
+        #     if dir_save:
+        #         ips.figsave(dir_save + f"validate_features_stacking_{now_}.pdf")
+    if vote:
+        print(f"⤵ voting...")
+        from sklearn.ensemble import VotingClassifier, VotingRegressor
+        #! voting
+        n_top_models = min(n_top_models, df_results.shape[0])
+        base_estimators = []
+        for name, cls in zip(list(df_results.iloc[:n_top_models, :].index), df_results.iloc[:n_top_models, :]["best_clf"].tolist()):
+            base_estimators.append((name, cls))
+        # Apply Voting Classifier/Regressor
+        if purpose == "classification":
+            print(f"⤵ VotingClassifier...via {voting}")
+            if voting == 'hard':
+                # Hard voting does not support `predict_proba`
+                voting_model = VotingClassifier(estimators=base_estimators)
+            else:
+                # Soft voting supports `predict_proba`
+                voting_model = VotingClassifier(estimators=base_estimators, voting="soft")
+        else:
+            print(f"⤵ VotingRegressor...")
+            voting_model = VotingRegressor(estimators=base_estimators)
+
+        # Train the Voting Classifier/Regressor
+        try:
+            voting_model.fit(x_train, y_train)
+            y_pred_vote = voting_model.predict(x_true)
+        except Exception as e:
+            if purpose == "classification" and not voting == 'hard':
+                voting_model = VotingClassifier(estimators=base_estimators)
+                voting_model.fit(x_train, y_train)
+                y_pred_vote = voting_model.predict(x_true)
+
+        # Calculate predicted probabilities if applicable
+        if purpose == "classification":
+            if hasattr(voting_model, "predict_proba"):
+                y_pred_proba_vote = voting_model.predict_proba(x_true)
+                print("Shape of predicted probabilities:", y_pred_proba_vote.shape)
+                if y_pred_proba_vote.shape[1] == 1:
+                    y_pred_proba_vote = np.hstack(
+                        [1 - y_pred_proba_vote, y_pred_proba_vote]
+                    )  # Add missing class probabilities
+                y_pred_proba_vote = y_pred_proba_vote[:, 1]
+            else:
+                y_pred_proba_vote = None
+
+            # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+            if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                if hasattr(best_clf, "alphas_"):
+                    alphas_ = best_clf.alphas_
+                elif hasattr(best_clf, "alpha_"):
+                    alphas_ = best_clf.alpha_
+                elif hasattr(best_clf, "Cs_"):
+                    alphas_ = best_clf.Cs_
+            else:
+                alphas_ = None
+            coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
+        else:  # Regression
+            y_pred_proba_vote = None
+            coef_, alphas_ = None, None
+
+        print(f"⤵ collecting voting results...")
+        #! dict_pred_vote
+        dict_pred_vote = {}
+        validation_scores_vote = {}
+        if y_true is not None and y_pred_proba_vote is not None:
+            validation_scores_vote = cal_metrics(
+                y_true,
+                y_pred_vote,
+                y_pred_proba=y_pred_proba_vote,
+                is_binary=is_binary,
+                purpose=purpose,
+                average="weighted",
+            )
+
+            if is_binary:
+                if y_pred_proba_vote is not None:
+                    fpr, tpr, _ = roc_curve(y_true, y_pred_proba_vote)
+                    lower_ci, upper_ci = cal_auc_ci(
+                        y_true, y_pred_proba_vote, verbose=False, is_binary=is_binary
+                    )
+                    roc_auc = auc(fpr, tpr)
+                    roc_info = {
+                        "fpr": fpr.tolist(),
+                        "tpr": tpr.tolist(),
+                        "auc": roc_auc,
+                        "ci95": (lower_ci, upper_ci),
+                    }
+                    precision_, recall_, _ = cal_precision_recall(y_true, y_pred_proba_vote)
+                    avg_precision_ = average_precision_score(y_true, y_pred_proba_vote)
+                    pr_info = {
+                        "precision": precision_,
+                        "recall": recall_,
+                        "avg_precision": avg_precision_,
+                    }
+                else:
+                    roc_info, pr_info = None, None
+
+                dict_pred_vote = {
+                    "best_clf": voting_model,
+                    "best_params": None,
+                    "auc_indiv": None,
+                    "scores": validation_scores_vote,
+                    "roc_curve": roc_info,
+                    "pr_curve": pr_info,
+                    "confusion_matrix": confusion_matrix(y_true, y_pred_vote),
+                    "predictions": y_pred_vote.tolist(),
+                    "predictions_proba": (
+                        y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
+                    ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
+                }
+            else:  # Multi-class
+                if y_pred_proba_vote is not None:
+                    confidence_intervals = cal_auc_ci(
+                        y_true, y_pred_proba_vote, verbose=False, is_binary=is_binary
+                    )
+                    roc_info = {
+                        "fpr": validation_scores_vote["fpr"],
+                        "tpr": validation_scores_vote["tpr"],
+                        "auc": validation_scores_vote["roc_auc_by_class"],
+                        "ci95": confidence_intervals,
+                    }
+                    precision_, recall_, avg_precision_ = cal_precision_recall(
+                        y_true, y_pred_proba_vote, is_binary=is_binary
+                    )
+                    pr_info = {
+                        "precision": precision_,
+                        "recall": recall_,
+                        "avg_precision": avg_precision_,
+                    }
+                else:
+                    roc_info, pr_info = None, None
+
+                dict_pred_vote = {
+                    "best_clf": voting_model,
+                    "best_params": None,
+                    "scores": validation_scores_vote,
+                    "roc_curve": roc_info,
+                    "pr_curve": pr_info,
+                    "confusion_matrix": confusion_matrix(y_true, y_pred_vote),
+                    "predictions": y_pred_vote.tolist(),
+                    "predictions_proba": (
+                        y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
+                    ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
+                }
+        else:
+            if y_true is None:
+                validation_scores_vote = []
+            else:
+                validation_scores_vote = cal_metrics(
+                    y_true,
+                    y_pred,
+                    y_pred_proba=y_pred_proba_vote,
+                    is_binary=is_binary,
+                    purpose=purpose,
+                    average="weighted",
+                )
+            dict_pred_vote = {
+                "best_clf": voting_model,
+                "best_params": None,
+                "scores": validation_scores_vote,
+                "predictions": y_pred_vote.tolist(),
+                "predictions_proba": (
+                    y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
+                ),
+                "features": share_col_names,
+                "y_train": y_train if y_train is not None else [],
+                "y_true": y_true if y_true is not None else [],
+            }
+        df_vote = pd.DataFrame(
+            [None] * len(df_results.columns), index=df_results.columns, columns=["vote"]).T
+        for k, v in dict_pred_vote.items():
+            if k in df_vote.columns:
+                df_vote[k] = [v]
+
+        # if all([plot_, y_true is not None, purpose == "classification"]):
+        #     try:
+        #         plot_validate_features_single(df_vote, is_binary=is_binary)
+        #         if dir_save:
+        #             ips.figsave(dir_save + f"validate_features_vote_{now_}.pdf")
+        #     except Exception as e:
+        #         print(e)
+    print("Done")
+    if vote and stack:
+        df_res = pd.concat([df_pred, df_vote, df_results], ignore_index=False, axis=0)
+    elif vote:
+        df_res = pd.concat([df_vote, df_results], ignore_index=False, axis=0)
+    elif stack:
+        df_res = pd.concat([df_pred, df_results], ignore_index=False, axis=0)
+
     if all([plot_, y_true is not None, purpose == "classification"]):
+        from datetime import datetime
+
+        now_ = datetime.now().strftime("%y%m%d_%H%M%S")
         # try:
-        if
-
+        if df_res.shape[0] > 3:
+            try:
+                plot_validate_features(df_res, is_binary=is_binary)
+            except Exception as e:
+                print(e)
         else:
-
+            try:
+                plot_validate_features_single(df_res, is_binary=is_binary)
+            except Exception as e:
+                print(e)
         if dir_save:
             ips.figsave(dir_save + f"validate_features{now_}.pdf")
-
-
-    return
-
+    # except Exception as e:
+    #     print(f"Error: a problem occurred while plotting: {e}")
+    return df_res

 def cal_metrics(
     y_true,
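A minimal standalone sketch of the stacking + voting pattern this release adds (two base models and a logistic-regression meta-learner on synthetic data; not the py2ls ranking logic):

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

X, y = make_classification(n_samples=300, random_state=0)
base = [("rf", RandomForestClassifier(random_state=0)),
        ("svm", SVC(probability=True))]  # probability=True enables predict_proba
stack = StackingClassifier(estimators=base,
                           final_estimator=LogisticRegression(max_iter=1000), cv=5)
vote = VotingClassifier(estimators=base, voting="soft")  # "hard" voting has no predict_proba
stack.fit(X, y)
vote.fit(X, y)
print(stack.score(X, y), vote.score(X, y))

This mirrors the fallback in the diff: when soft voting fails (e.g., a base estimator lacks predict_proba), the code retries with the hard-voting default.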
@@ -3368,7 +4190,7 @@ def cal_metrics(


 def plot_trees(
-    X, y, cls, max_trees=500, test_size=0.2, random_state=42, early_stopping_rounds=None
+    X, y, cls: str = 'random', max_trees=500, test_size=0.2, random_state=42, early_stopping_rounds=None
 ):
     """
     # # Example usage:
@@ -3414,10 +4236,14 @@ def plot_trees(
     train_error_rate = []
     test_error_rate = []
     validation_error = None
-
+    if isinstance(cls, str):
+        cls = ips.strcmp(cls, ["RandomForestClassifier", "ExtraTreesClassifier", "AdaBoostClassifier", "GradientBoostingClassifier"])
     # Configure classifier based on type
     oob_enabled = False  # Default to no OOB error unless explicitly set
-
+    clf_support = {"RandomForestClassifier": RandomForestClassifier(),
+                   "ExtraTreesClassifier": ExtraTreesClassifier(),
+                   "AdaBoostClassifier": AdaBoostClassifier(),
+                   "GradientBoostingClassifier": GradientBoostingClassifier()}
     if isinstance(cls, (RandomForestClassifier, ExtraTreesClassifier)):
         # Enable OOB if cls supports it and is using bootstrapping
         cls.set_params(warm_start=True, n_estimators=1)
|
|
3679
4505
|
|
3680
4506
|
|
3681
4507
|
def backward_regression(
|
3682
|
-
X: pd.DataFrame, y: pd.Series, initial_list=[],
|
4508
|
+
X: pd.DataFrame, y: pd.Series, initial_list=[], thr=0.05, verbose=True
|
3683
4509
|
):
|
3684
4510
|
"""
|
3685
4511
|
# awesome bit of code from https://www.kaggle.com/code/adibouayjan/house-price-step-by-step-modeling
|
@@ -3691,31 +4517,46 @@ def backward_regression(
     X -- features values
     y -- target variable
     initial_list -- features header
-
+    thr -- p-value threshold of features to drop
     verbose -- true to produce lots of logging output

     Returns:
     list of selected features for modeling
     """
     import statsmodels.api as sm
-
-
-
-
-
+    if isinstance(y, str):
+        if y in X.columns:
+            y_col_name = y
+            y = X[y]
+            X = X.drop(y_col_name, axis=1)
+        else:
+            raise ValueError(f"Cannot find {y}; the y argument is set incorrectly")
+    X = X.select_dtypes(include=[np.number])

     included = list(X.columns)
+    try:
+        X = X.astype(float)
+        y = y.astype(float)
+    except Exception as e:
+        raise ValueError(f"Unable to convert the data to float, so the statistical analysis cannot proceed: {e}")
+
+
     while True:
         changed = False
+        if not included:
+            print("No features remain in the model.")
+            break
+
         model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
         # exclude the intercept for p-value checking
         pvalues = model.pvalues.iloc[1:]
         worst_pval = pvalues.max()
-        if worst_pval >
+        if worst_pval > thr:
             changed = True
             worst_feature = pvalues.idxmax()
             included.remove(worst_feature)
             if verbose:
-                print(f"Removing
+                print(f"Removing '{worst_feature}' with p-value={round(worst_pval,2)}")
         if not changed:
             break
     print(f"\nSelected Features:\n{included}")