py2ls 0.2.4.23__py3-none-any.whl → 0.2.4.24__py3-none-any.whl

py2ls/ml2ls.py CHANGED
@@ -5,7 +5,6 @@ from sklearn.ensemble import (
5
5
  BaggingClassifier,
6
6
  )
7
7
  from sklearn.svm import SVC, SVR
8
- from sklearn.calibration import CalibratedClassifierCV
9
8
  from sklearn.model_selection import GridSearchCV, StratifiedKFold
10
9
  from sklearn.linear_model import (
11
10
  LassoCV,
@@ -16,12 +15,7 @@ from sklearn.linear_model import (
16
15
  RidgeClassifierCV,
17
16
  ElasticNet,
18
17
  )
19
- from sklearn.feature_selection import RFE
20
- from sklearn.naive_bayes import GaussianNB
21
- from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
22
- import xgboost as xgb # Make sure you have xgboost installed
23
-
24
- from sklearn.model_selection import train_test_split, cross_val_score
18
+
25
19
  from sklearn.metrics import (
26
20
  accuracy_score,
27
21
  precision_score,
@@ -36,18 +30,12 @@ from sklearn.metrics import (
36
30
  precision_recall_curve,
37
31
  average_precision_score,
38
32
  )
39
- from imblearn.over_sampling import SMOTE
40
- from sklearn.pipeline import Pipeline
41
- from collections import defaultdict
42
- from sklearn.preprocessing import StandardScaler, OneHotEncoder
43
33
  from typing import Dict, Any, Optional, List, Union
44
34
  import numpy as np
45
35
  import pandas as pd
46
36
  from . import ips
47
37
  from . import plot
48
38
  import matplotlib.pyplot as plt
49
- import seaborn as sns
50
-
51
39
  plt.style.use(str(ips.get_cwd()) + "/data/styles/stylelib/paper.mplstyle")
52
40
  import logging
53
41
  import warnings
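
These import hunks move optional, heavyweight dependencies (xgboost, imblearn, seaborn, several sklearn extras) off the module top level and into the functions that actually use them, so importing py2ls.ml2ls stays cheap. A minimal sketch of the deferred-import pattern, with an illustrative function name and parameters that are not part of the package:

    def rfe_select_sketch(x, y, n_features=5):
        # Heavy imports happen only when this feature is actually requested.
        from sklearn.svm import SVC
        from sklearn.feature_selection import RFE

        svc = SVC(kernel="linear")  # linear kernel exposes coef_, which RFE needs
        rfe = RFE(estimator=svc, n_features_to_select=n_features).fit(x, y)
        # x is assumed to be a pandas DataFrame
        return [col for col, keep in zip(x.columns, rfe.support_) if keep]
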
@@ -314,6 +302,8 @@ def features_svm(
314
302
  - Use case: It’s not as widely used as the RBF or linear kernel but can be explored when there is some evidence of non-linear
315
303
  S-shaped relationships.
316
304
  """
305
+ from sklearn.feature_selection import RFE
306
+ from sklearn.svm import SVC
317
307
  # SVM (Support Vector Machines)
318
308
  svc = SVC(kernel=rfe_params["kernel"]) # ["linear", "rbf", "poly", "sigmoid"]
319
309
  # RFE(Recursive Feature Elimination)
@@ -450,6 +440,7 @@ def validate_classifier(
450
440
  Returns:
451
441
  - results: Dictionary containing average cv_train_scores and cv_test_scores.
452
442
  """
443
+ from sklearn.model_selection import cross_val_score
453
444
  cv_train_scores = {metric: [] for metric in metrics}
454
445
  skf = StratifiedKFold(n_splits=cv_folds)
455
446
  # Perform cross-validation
@@ -982,6 +973,8 @@ def validate_features(
982
973
 
983
974
  """
984
975
  from tqdm import tqdm
976
+ from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
977
+ from sklearn.calibration import CalibratedClassifierCV
985
978
 
986
979
  # Ensure common features are selected
987
980
  common_features = ips.shared(
@@ -1001,6 +994,7 @@ def validate_features(
1001
994
 
1002
995
  # Handle class imbalance using SMOTE
1003
996
  if smote:
997
+ from imblearn.over_sampling import SMOTE
1004
998
  if (
1005
999
  y_train.value_counts(normalize=True).max() < 0.8
1006
1000
  ): # Threshold to decide if data is imbalanced
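
The SMOTE import is likewise deferred until smote=True and the imbalance check fires; the body of that branch is not shown in this hunk. A minimal sketch of the resampling step, assuming pandas inputs:

    from imblearn.over_sampling import SMOTE

    def balance_with_smote(x_train, y_train, random_state=1):
        # Oversample minority classes so the training set is roughly balanced.
        sm = SMOTE(random_state=random_state)
        return sm.fit_resample(x_train, y_train)
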
@@ -2096,7 +2090,116 @@ def rank_models(
2096
2090
  # )
2097
2091
 
2098
2092
  # figsave("classifier_performance.pdf")
2093
+ def rank_models_reg(df, ascending=False):
2094
+ """
2095
+ Sorts models based on MSE, RMSE, MAE, and R² with custom priority logic.
2096
+
2097
+ Parameters:
2098
+ df (pd.DataFrame): DataFrame containing the regression metrics.
2099
+ ascending (bool): Whether to sort in ascending order of ranking score.
2100
+
2101
+ Returns:
2102
+ pd.DataFrame: Sorted DataFrame with an added "Ranking_Score" column.
2103
+ """
2104
+ # Define weights for the 4 metrics
2105
+ weights = {
2106
+ "mse": -1, # Lower is better
2107
+ "rmse": -1, # Lower is better
2108
+ "mae": -1, # Lower is better
2109
+ "r2": 1, # Higher is better
2110
+ }
2111
+
2112
+ # Normalize the selected metrics
2113
+ df = df.copy() # Work on a copy of the DataFrame
2114
+ for metric, weight in weights.items():
2115
+ if metric in df.columns:
2116
+ if weight > 0: # Higher is better; normalize 0-1
2117
+ df[metric + "_normalized"] = (df[metric] - df[metric].min()) / (
2118
+ df[metric].max() - df[metric].min()
2119
+ )
2120
+ else: # Lower is better; reverse normalize 0-1
2121
+ df[metric + "_normalized"] = (df[metric].max() - df[metric]) / (
2122
+ df[metric].max() - df[metric].min()
2123
+ )
2099
2124
 
2125
+ # Calculate ranking score as a weighted sum
2126
+ df["Ranking_Score"] = sum(
2127
+ df[metric + "_normalized"] * abs(weights[metric])
2128
+ for metric in weights.keys()
2129
+ if metric + "_normalized" in df.columns
2130
+ )
2131
+
2132
+ # Sort models based on the ranking score
2133
+ sorted_df = df.sort_values(by="Ranking_Score", ascending=ascending)
2134
+ return sorted_df
2135
+
2136
+ models_support = {
2137
+ "classification": {
2138
+ "Random Forest": "Tree-Based",
2139
+ "SVM": "Kernel-Based",
2140
+ "Logistic Regression": "Linear",
2141
+ "Lasso Logistic Regression": "Linear",
2142
+ "Gradient Boosting": "Tree-Based",
2143
+ "XGBoost": "Tree-Based",
2144
+ "KNN": "Instance-Based",
2145
+ "Naive Bayes": "Probabilistic",
2146
+ "Linear Discriminant Analysis": "Linear",
2147
+ "AdaBoost": "Tree-Based",
2148
+ "CatBoost": "Tree-Based",
2149
+ "Extra Trees": "Tree-Based",
2150
+ "Bagging": "Tree-Based",
2151
+ "Neural Network": "Neural Network",
2152
+ "DecisionTree": "Tree-Based",
2153
+ "Quadratic Discriminant Analysis": "Probabilistic",
2154
+ "Ridge": "Linear",
2155
+ "Perceptron": "Linear",
2156
+ "Bernoulli Naive Bayes": "Probabilistic",
2157
+ "SGDClassifier": "Linear",
2158
+ },
2159
+ "regression": {
2160
+ "Linear Regression": "Linear",
2161
+ "Ridge": "Linear",
2162
+ "RidgeCV": "Linear",
2163
+ "TheilSenRegressor": "Linear",
2164
+ "HuberRegressor": "Linear",
2165
+ "PoissonRegressor": "Linear",
2166
+ "LassoCV": "Linear",
2167
+ "Bagging": "Tree-Based",
2168
+ "ElasticNet": "Linear",
2169
+ "Random Forest": "Tree-Based",
2170
+ "Gradient Boosting": "Tree-Based",
2171
+ "XGBoost": "Tree-Based",
2172
+ "CatBoost": "Tree-Based",
2173
+ "Extra Trees": "Tree-Based",
2174
+ "SVM": "Kernel-Based",
2175
+ "KNN": "Instance-Based",
2176
+ "Neural Network": "Neural Network",
2177
+ "AdaBoost": "Linear",
2178
+ },
2179
+ }
2180
+ def select_top_models(models, categories, n_top_models, n_models_per_category=1):
2181
+ """
2182
+ models = list_sort
2183
+ purpose = "regression"
2184
+ categories = models_support[purpose]
2185
+ n_top_models = 3
2186
+ select_top_models(models, categories, n_top_models)
2187
+ """
2188
+ selected = {}
2189
+ result = []
2190
+ for model in models:
2191
+ category = categories.get(model, "Unknown")
2192
+ if category not in selected:
2193
+ selected[category] = 0 # Initialize counter for the category
2194
+
2195
+ if selected[category] < n_models_per_category: # Allow additional models up to the limit
2196
+ selected[category] += 1
2197
+ result.append(model)
2198
+
2199
+ if len(result) == n_top_models: # Stop when the desired number of models is reached
2200
+ break
2201
+
2202
+ return result
2100
2203
 
2101
2204
  def predict(
2102
2205
  x_train: pd.DataFrame,
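
How the two new helpers fit together, on made-up numbers (this assumes rank_models_reg, select_top_models and models_support are reachable from the calling scope, which the flattened diff does not make explicit):

    import pandas as pd

    scores = pd.DataFrame(
        {"mse": [1.2, 0.8, 1.5], "rmse": [1.1, 0.9, 1.2],
         "mae": [0.9, 0.7, 1.0], "r2": [0.71, 0.80, 0.65]},
        index=["Ridge", "Random Forest", "KNN"],
    )
    ranked = rank_models_reg(scores)  # adds "Ranking_Score"; best model first
    top = select_top_models(
        models=list(ranked.index),
        categories=models_support["regression"],
        n_top_models=3,
        n_models_per_category=1,  # at most one model per category
    )
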
@@ -2104,11 +2207,17 @@ def predict(
2104
2207
  x_true: pd.DataFrame = None,
2105
2208
  y_true: Optional[pd.Series] = None,
2106
2209
  backward: bool = False, # backward_regression
2210
+ backward_thr:float = 0.05,  # p-value threshold; only used when backward is True
2107
2211
  common_features: set = None,
2108
2212
  purpose: str = "classification", # 'classification' or 'regression'
2109
2213
  cls: Optional[Dict[str, Any]] = None,
2110
2214
  metrics: Optional[List[str]] = None,
2111
- random_state: int = 1,
2215
+ stack:bool=True,# run stacking
2216
+ stacking_cv:bool=False,  # cross-validate the stacking final estimator; default False keeps it simple
2217
+ vote:bool=True,# run voting
2218
+ voting:str="hard", # only for classification purporse of voting
2219
+ n_top_models:int=5, #for stacking models
2220
+ n_models_per_category:int=1,  # for stacking models; e.g. 2 allows two models from the same category
2112
2221
  smote: bool = False,
2113
2222
  n_jobs: int = -1,
2114
2223
  plot_: bool = True,
@@ -2117,6 +2226,7 @@ def predict(
2117
2226
  cv_folds: int = 5, # more cv_folds gives more stable results, but AUC may be lower
2118
2227
  cv_level: str = "l", # "s":'low',"m":'medium',"l":"high"
2119
2228
  class_weight: str = "balanced",
2229
+ random_state: int = 1,
2120
2230
  verbose: bool = False,
2121
2231
  ) -> pd.DataFrame:
2122
2232
  """
@@ -2184,10 +2294,17 @@ def predict(
2184
2294
  RidgeClassifierCV,
2185
2295
  Perceptron,
2186
2296
  SGDClassifier,
2297
+ RidgeCV,
2298
+ Ridge,
2299
+ TheilSenRegressor,
2300
+ HuberRegressor,
2301
+ PoissonRegressor,
2302
+
2187
2303
  )
2304
+ from sklearn.compose import TransformedTargetRegressor
2188
2305
  from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
2189
2306
  from sklearn.naive_bayes import GaussianNB, BernoulliNB
2190
- from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
2307
+ from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor,StackingClassifier,StackingRegressor
2191
2308
  import xgboost as xgb
2192
2309
  import lightgbm as lgb
2193
2310
  import catboost as cb
@@ -2198,6 +2315,7 @@ def predict(
2198
2315
  QuadraticDiscriminantAnalysis,
2199
2316
  )
2200
2317
  from sklearn.preprocessing import PolynomialFeatures
2318
+ from sklearn.model_selection import train_test_split
2201
2319
 
2202
2320
  # spelling check (fuzzy-match the purpose string)
2203
2321
  purpose = ips.strcmp(purpose, ["classification", "regression"])[0]
@@ -2261,7 +2379,6 @@ def predict(
2261
2379
  "Gradient Boosting": GradientBoostingRegressor(random_state=random_state),
2262
2380
  "XGBoost": xgb.XGBRegressor(eval_metric="rmse", random_state=random_state),
2263
2381
  "Linear Regression": LinearRegression(),
2264
- "Lasso": Lasso(random_state=random_state),
2265
2382
  "AdaBoost": AdaBoostRegressor(random_state=random_state),
2266
2383
  # "LightGBM": lgb.LGBMRegressor(random_state=random_state),
2267
2384
  "CatBoost": cb.CatBoostRegressor(verbose=0, random_state=random_state),
@@ -2271,10 +2388,10 @@ def predict(
2271
2388
  "ElasticNet": ElasticNet(random_state=random_state),
2272
2389
  "Ridge": Ridge(),
2273
2390
  "KNN": KNeighborsRegressor(),
2391
+ "TheilSen":TheilSenRegressor(),
2392
+ "Huber":HuberRegressor(),
2393
+ "Poisson":PoissonRegressor()
2274
2394
  }
2275
- # indicate cls:
2276
- if ips.run_once_within(30): # 10 min
2277
- print(f"supported models: {list(model_.keys())}")
2278
2395
  if cls is None:
2279
2396
  models = model_
2280
2397
  else:
@@ -2290,6 +2407,10 @@ def predict(
2290
2407
  ips.df_special_characters_cleaner(x_true) if x_true is not None else None
2291
2408
  )
2292
2409
 
2410
+ # indicate cls:
2411
+ if ips.run_once_within(30): # 10 min
2412
+ print(f"processing: {list(models.keys())}")
2413
+
2293
2414
  if isinstance(y_train, str) and y_train in x_train.columns:
2294
2415
  y_train_col_name = y_train
2295
2416
  y_train = x_train[y_train]
@@ -2311,7 +2432,7 @@ def predict(
2311
2432
 
2312
2433
  # Perform backward feature selection
2313
2434
  if backward:
2314
- selected_features = backward_regression(x_train, y_train, threshold_out=0.05)
2435
+ selected_features = backward_regression(x_train, y_train, thr=backward_thr)
2315
2436
  x_train = x_train[selected_features]
2316
2437
 
2317
2438
  if x_true is None:
@@ -2396,6 +2517,17 @@ def predict(
2396
2517
  y_true = ips.df_encoder(data=pd.DataFrame(y_true), method="label")
2397
2518
  y_true = np.asarray(y_true)
2398
2519
  # Hyperparameter grids for tuning
2520
+ param_grid_common_xgb = {
2521
+ 'learning_rate': [0.01, 0.1, 0.2, 0.3],
2522
+ 'max_depth': [3, 5, 7, 10],
2523
+ 'n_estimators': [50, 100, 200, 300],
2524
+ 'subsample': [0.6, 0.8, 1.0],
2525
+ 'colsample_bytree': [0.6, 0.8, 1.0],
2526
+ 'gamma': [0, 0.1, 0.2, 0.5],
2527
+ 'min_child_weight': [1, 5, 10],
2528
+ 'reg_alpha': [0, 0.1, 0.5, 1], # L1 regularization term
2529
+ 'reg_lambda': [1, 1.5, 2], # L2 regularization term
2530
+ }
2399
2531
  if cv_level in ["low", "simple", "s", "l"]:
2400
2532
  param_grids = {
2401
2533
  "Random Forest": (
@@ -2440,12 +2572,17 @@ def predict(
2440
2572
  "min_samples_split": [2],
2441
2573
  "subsample": [0.8],
2442
2574
  },
2443
- "XGBoost": {
2444
- "n_estimators": [100],
2445
- "max_depth": [3],
2446
- "learning_rate": [0.1],
2447
- "subsample": [0.8],
2448
- "colsample_bytree": [0.8],
2575
+ "XGBoost":{
2576
+ 'learning_rate': [0.01],
2577
+ 'max_depth': [3],
2578
+ 'n_estimators': [50],
2579
+ 'subsample': [0.6],
2580
+ 'colsample_bytree': [0.6],
2581
+ 'gamma': [0, 0.1],
2582
+ 'min_child_weight': [1],
2583
+ 'reg_alpha': [0, 0.1],
2584
+ 'reg_lambda': [1],
2585
+ 'objective': ['binary:logistic'] if purpose == "classification" else ['reg:squarederror']
2449
2586
  },
2450
2587
  "KNN": (
2451
2588
  {
@@ -2552,6 +2689,14 @@ def predict(
2552
2689
  "random_state": [random_state],
2553
2690
  "learning_rate": ["constant"],
2554
2691
  },
2692
+ "TheilSen":{'max_iter': [100],
2693
+ 'tol': [1e-4],
2694
+ 'n_subsamples': [100+x_train.shape[1]]},
2695
+ "Huber":{'epsilon': [1.35],
2696
+ 'alpha': [0.1],
2697
+ 'max_iter': [100],},
2698
+ "Poisson":{'alpha': [0.1],
2699
+ 'max_iter': [100],}
2555
2700
  }
2556
2701
  elif cv_level in ["high", "advanced", "h"]:
2557
2702
  param_grids = {
@@ -2613,12 +2758,30 @@ def predict(
2613
2758
  "subsample": [0.8, 1.0],
2614
2759
  },
2615
2760
  "XGBoost": {
2616
- "n_estimators": [100, 200, 500, 700],
2617
- "max_depth": [3, 5, 7, 10],
2618
- "learning_rate": [0.01, 0.1, 0.2, 0.3],
2619
- "subsample": [0.8, 1.0],
2620
- "colsample_bytree": [0.8, 0.9, 1.0],
2621
- },
2761
+ 'learning_rate': [0.01, 0.1, 0.2, 0.3],
2762
+ 'max_depth': [3, 5, 7, 10],
2763
+ 'n_estimators': [50, 100, 200, 300],
2764
+ 'subsample': [0.6, 0.8, 1.0],
2765
+ 'gamma': [0, 0.1, 0.2, 0.5],
2766
+ 'min_child_weight': [1, 5, 10],
2767
+ 'reg_alpha': [0, 0.1, 0.5, 1],
2768
+ 'reg_lambda': [1, 1.5, 2],
2769
+ **{
2770
+ 'objective': ['binary:logistic', 'multi:softmax', 'multi:softprob'],
2771
+ }} if purpose== "classification"
2772
+ else{
2773
+ 'learning_rate': [0.01, 0.1, 0.2, 0.3],
2774
+ 'max_depth': [3, 5, 7, 10],
2775
+ 'n_estimators': [50, 100, 200, 300],
2776
+ 'subsample': [0.6, 0.8, 1.0],
2777
+ 'colsample_bytree': [0.6, 0.8, 1.0],
2778
+ 'gamma': [0, 0.1, 0.2, 0.5],
2779
+ 'min_child_weight': [1, 5, 10],
2780
+ 'reg_alpha': [0, 0.1, 0.5, 1],
2781
+ 'reg_lambda': [1, 1.5, 2],
2782
+ **{
2783
+ 'objective': ['reg:squarederror', 'reg:squaredlogerror', 'reg:gamma'],
2784
+ }},
2622
2785
  "KNN": (
2623
2786
  {
2624
2787
  "n_neighbors": [1, 3, 5, 10, 15, 20],
@@ -2731,6 +2894,14 @@ def predict(
2731
2894
  ], # If True, the regressors X will be normalized
2732
2895
  }
2733
2896
  ),
2897
+ "TheilSen":{'max_iter': [100, 200, 300],
2898
+ 'tol': [1e-4, 1e-3, 1e-2],
2899
+ 'n_subsamples': [100+x_train.shape[1], 200+x_train.shape[1], 300+x_train.shape[1]]},
2900
+ "Huber":{'epsilon': [1.35, 1.5, 2.0],
2901
+ 'alpha': [0.1, 1.0, 10.0],
2902
+ 'max_iter': [100, 200, 300],},
2903
+ "Poisson":{'alpha': [0.1, 1.0, 10.0],
2904
+ 'max_iter': [100, 200, 300],}
2734
2905
  }
2735
2906
  else: # median level
2736
2907
  param_grids = {
@@ -2790,12 +2961,30 @@ def predict(
2790
2961
  "subsample": [0.8, 1.0],
2791
2962
  },
2792
2963
  "XGBoost": {
2793
- "n_estimators": [100, 200, 500],
2794
- "max_depth": [3, 5, 7],
2795
- "learning_rate": [0.01, 0.1, 0.2],
2796
- "subsample": [0.8, 1.0],
2797
- "colsample_bytree": [0.8, 1.0],
2798
- },
2964
+ 'learning_rate': [0.01, 0.1],
2965
+ 'max_depth': [3, 5],
2966
+ 'n_estimators': [50, 100],
2967
+ 'subsample': [0.6, 0.8],
2968
+ 'gamma': [0, 0.1],
2969
+ 'min_child_weight': [1, 5],
2970
+ 'reg_alpha': [0, 0.1],
2971
+ 'reg_lambda': [1,],
2972
+ **{
2973
+ 'objective': ['binary:logistic', 'multi:softmax'],
2974
+ }} if purpose== "classification"
2975
+ else{
2976
+ 'learning_rate': [0.01, 0.1],
2977
+ 'max_depth': [3, 5,],
2978
+ 'n_estimators': [50, 100],
2979
+ 'subsample': [0.6, 0.8],
2980
+ 'colsample_bytree': [0.6, 0.8],
2981
+ 'gamma': [0, 0.1],
2982
+ 'min_child_weight': [1, 5],
2983
+ 'reg_alpha': [0, 0.1],
2984
+ 'reg_lambda': [1, 1.5],
2985
+ **{
2986
+ 'objective': ['reg:squarederror', 'reg:squaredlogerror'],
2987
+ }},
2799
2988
  "KNN": (
2800
2989
  {
2801
2990
  "n_neighbors": [3, 5, 7, 10],
@@ -2952,6 +3141,14 @@ def predict(
2952
3141
  ], # Solver for optimization
2953
3142
  }
2954
3143
  ),
3144
+ "TheilSen":{'max_iter': [100, 200],
3145
+ 'tol': [1e-4, 1e-3],
3146
+ 'n_subsamples': [100+x_train.shape[1], 200+x_train.shape[1]]},
3147
+ "Huber":{'epsilon': [1.35, 1.5],
3148
+ 'alpha': [0.1, 1.0],
3149
+ 'max_iter': [100, 200],},
3150
+ "Poisson":{'alpha': [0.1, 1.0],
3151
+ 'max_iter': [100, 200],}
2955
3152
  }
2956
3153
 
2957
3154
  results = {}
@@ -3192,12 +3389,18 @@ def predict(
3192
3389
  # Convert results to DataFrame
3193
3390
  df_results = pd.DataFrame.from_dict(results, orient="index")
3194
3391
  # sort
3195
- if y_true is not None and purpose == "classification":
3196
- df_scores = pd.DataFrame(
3197
- df_results["scores"].tolist(), index=df_results["scores"].index
3198
- ).sort_values(by="roc_auc", ascending=False)
3392
+ if y_true is not None:
3393
+ if purpose == "classification":
3394
+ df_scores = pd.DataFrame(
3395
+ df_results["scores"].tolist(), index=df_results["scores"].index
3396
+ ).sort_values(by="roc_auc", ascending=False)
3397
+ elif purpose=='regression':
3398
+ df_scores = rank_models_reg(
3399
+ pd.DataFrame(df_results["scores"].tolist(), index=df_results["scores"].index),
3400
+ ascending=False)
3199
3401
  df_results = df_results.loc[df_scores.index]
3200
3402
 
3403
+ if y_true is not None and purpose == "classification":
3201
3404
  if plot_:
3202
3405
  from datetime import datetime
3203
3406
 
@@ -3215,18 +3418,503 @@ def predict(
3215
3418
  plot.figsets(xangle=30)
3216
3419
  if dir_save:
3217
3420
  ips.figsave(dir_save + f"scores_clus{now_}.pdf")
3421
+ # if all([plot_, y_true is not None, purpose == "classification"]):
3422
+ # # try:
3423
+ # if len(models) > 3:
3424
+ # plot_validate_features(df_results, is_binary=is_binary)
3425
+ # else:
3426
+ # plot_validate_features_single(df_results, is_binary=is_binary)
3427
+ # if dir_save:
3428
+ # ips.figsave(dir_save + f"validate_features{now_}.pdf")
3429
+ # # except Exception as e:
3430
+ # # print(f"Error: 在画图的过程中出现了问题:{e}")
3431
+ if stack:
3432
+ #! stacking classifier/regressor
3433
+ from sklearn.metrics import make_scorer, accuracy_score
3434
+ from sklearn.model_selection import cross_val_score
3435
+
3436
+ #* cap n_top_models so it does not exceed the number of available models
3437
+ n_top_models = min(n_top_models, df_results.shape[0])
3438
+
3439
+ #* pick the top-ranked n estimators
3440
+ models_selecte = select_top_models(models=list(df_results.index),
3441
+ categories=models_support[purpose],
3442
+ n_top_models=n_top_models,
3443
+ n_models_per_category=n_models_per_category)
3444
+ top_models = df_results.loc[models_selecte]["best_clf"]
3445
+ base_estimators = []
3446
+ for i, j in top_models.to_dict().items():
3447
+ base_estimators.append((i, j))
3448
+ if stacking_cv:
3449
+ print(f" ⤵ stacking_cv is processing...")
3450
+ #* define a few representative final_estimator candidates
3451
+ # candidate choices
3452
+ if purpose == "classification":
3453
+ kadt_estimators=["XGBoost","SVM","Logistic Regression","Neural Network"]
3454
+ else:
3455
+ kadt_estimators=["XGBoost","LassoCV"]
3456
+ final_estimators={}
3457
+ for name in kadt_estimators:
3458
+ param_grid=param_grids.get(name, {})
3459
+ print(param_grid)
3460
+ if is_binary:
3461
+ gs = GridSearchCV(
3462
+ model_[name],
3463
+ param_grid=param_grid,
3464
+ scoring=(
3465
+ "roc_auc"
3466
+ if purpose == "classification"
3467
+ else "neg_mean_squared_error"
3468
+ ),
3469
+ cv=cv,
3470
+ n_jobs=n_jobs,
3471
+ verbose=verbose,
3472
+ )
3473
+ else:
3474
+ gs = GridSearchCV(
3475
+ model_[name],
3476
+ param_grid=param_grid,
3477
+ scoring=(
3478
+ "roc_auc_ovr"
3479
+ if purpose == "classification"
3480
+ else "neg_mean_squared_error"
3481
+ ),
3482
+ cv=cv,
3483
+ n_jobs=n_jobs,
3484
+ verbose=verbose,
3485
+ )
3486
+ # Fit GridSearchCV
3487
+ gs.fit(x_train, y_train)
3488
+ final_estimators[name]=gs.best_estimator_
3489
+
3490
+ #* Set up cross-validation and performance evaluation
3491
+ scorer = make_scorer(accuracy_score)
3492
+ cv_results = []
3493
+
3494
+ #*Cross-validate stacking models with different final estimators
3495
+ for final_name, final_estimator in final_estimators.items():
3496
+ print(f"Evaluating Stacking Classifier with {final_name} as final estimator...")
3497
+ if purpose == "classification":
3498
+ stacking_model = StackingClassifier(estimators=base_estimators, final_estimator=final_estimator,cv=cv)
3499
+ else:
3500
+ stacking_model = StackingRegressor(estimators=base_estimators, final_estimator=final_estimator, cv=cv)
3501
+
3502
+ scores = cross_val_score(stacking_model, x_train, y_train, cv=StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_state), scoring=scorer)
3503
+
3504
+ # Store the result
3505
+ cv_results.append({
3506
+ 'final_estimator':final_estimator,
3507
+ 'Final Estimator': final_name,
3508
+ 'Mean Accuracy': np.mean(scores),
3509
+ 'Standard Deviation': np.std(scores)
3510
+ })
3511
+
3512
+ #* Convert the results into a DataFrame for easy comparison
3513
+ cv_results_df = pd.DataFrame(cv_results)
3514
+
3515
+ #* Sort and display the best model
3516
+ cv_results_df = cv_results_df.sort_values(by='Mean Accuracy', ascending=False)
3517
+
3518
+
3519
+ # Optionally: Select the final estimator that gives the best performance
3520
+ best_final_estimator = cv_results_df.iloc[0]['final_estimator']
3521
+ print(f"Best final estimator based on cross-validation: {best_final_estimator}")
3522
+ else:
3523
+ print(f" ⤵ trying to find the best_final_estimator for stacking...")
3524
+ if purpose=="classification":
3525
+ best_final_estimator = LogisticRegression(class_weight=class_weight,
3526
+ random_state=random_state,
3527
+ max_iter=1000)
3528
+ else:
3529
+ best_final_estimator = RidgeCV(cv=5)
3530
+ print(f"⤵ the best best_final_estimator: {best_final_estimator}")
3531
+ #! apply stacking
3532
+ if purpose == "classification":
3533
+ print(f" ⤵ StackingClassifier...")
3534
+ stacking_model = StackingClassifier(estimators=base_estimators,
3535
+ final_estimator=best_final_estimator,
3536
+ cv=cv)
3537
+ else:
3538
+ print(f" ⤵ StackingRegressor...")
3539
+ stacking_model = StackingRegressor(estimators=base_estimators,
3540
+ final_estimator=best_final_estimator,
3541
+ cv=cv)
3542
+
3543
+ # Train the Stacking Classifier
3544
+ print(f" ⤵ fit & predict...")
3545
+ stacking_model.fit(x_train, y_train)
3546
+ y_pred_final = stacking_model.predict(x_true)
3547
+ print(f" ⤵ collecting results...")
3548
+ # pred_proba
3549
+ if is_binary:
3550
+ if hasattr(stacking_model, "predict_proba"):
3551
+ y_pred_proba_final = stacking_model.predict_proba(x_true)
3552
+ print("Shape of predicted probabilities:", y_pred_proba_final.shape)
3553
+ if y_pred_proba_final.shape[1] == 1:
3554
+ y_pred_proba_final = np.hstack(
3555
+ [1 - y_pred_proba_final, y_pred_proba_final]
3556
+ ) # Add missing class probabilities
3557
+ y_pred_proba_final = y_pred_proba_final[:, 1]
3558
+ elif hasattr(stacking_model, "decision_function"):
3559
+ # If predict_proba is not available, use decision_function (e.g., for SVM)
3560
+ y_pred_proba_final = stacking_model.decision_function(x_true)
3561
+ # Ensure y_pred_proba_final is within 0 and 1 bounds
3562
+ y_pred_proba_final = (y_pred_proba_final - y_pred_proba_final.min()) / (
3563
+ y_pred_proba_final.max() - y_pred_proba_final.min()
3564
+ )
3565
+ else:
3566
+ y_pred_proba_final = None # No probability output for certain models
3567
+ if not is_binary:
3568
+ # Handle prediction probabilities for multiclass
3569
+ if hasattr(stacking_model, "predict_proba"):
3570
+ y_pred_proba_final = stacking_model.predict_proba(x_true)
3571
+ elif hasattr(stacking_model, "decision_function"):
3572
+ y_pred_proba_final = stacking_model.decision_function(x_true)
3573
+
3574
+ # Normalize for multiclass if necessary
3575
+ if y_pred_proba_final.ndim == 2:
3576
+ y_pred_proba_final = (
3577
+ y_pred_proba_final - y_pred_proba_final.min(axis=1, keepdims=True)
3578
+ ) / (
3579
+ y_pred_proba_final.max(axis=1, keepdims=True)
3580
+ - y_pred_proba_final.min(axis=1, keepdims=True)
3581
+ )
3582
+ else:
3583
+ y_pred_proba_final = None # No probability output for certain models
3584
+ #! dict_pred_stack
3585
+ dict_pred_stack={}
3586
+ validation_scores_final = {}
3587
+ if y_true is not None and y_pred_proba_final is not None:
3588
+ validation_scores_final = cal_metrics(
3589
+ y_true,
3590
+ y_pred_final,
3591
+ y_pred_proba=y_pred_proba_final,
3592
+ is_binary=is_binary,
3593
+ purpose=purpose,
3594
+ average="weighted",
3595
+ )
3596
+ if is_binary:
3597
+ # Calculate ROC curve
3598
+ # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
3599
+ if y_pred_proba_final is not None:
3600
+ fpr, tpr, _ = roc_curve(y_true, y_pred_proba_final)
3601
+ lower_ci, upper_ci = cal_auc_ci(
3602
+ y_true, y_pred_proba_final, verbose=False, is_binary=is_binary
3603
+ )
3604
+ roc_auc = auc(fpr, tpr)
3605
+ roc_info = {
3606
+ "fpr": fpr.tolist(),
3607
+ "tpr": tpr.tolist(),
3608
+ "auc": roc_auc,
3609
+ "ci95": (lower_ci, upper_ci),
3610
+ }
3611
+ # precision-recall curve
3612
+ precision_, recall_, _ = cal_precision_recall(y_true, y_pred_proba_final)
3613
+ avg_precision_ = average_precision_score(y_true, y_pred_proba_final)
3614
+ pr_info = {
3615
+ "precision": precision_,
3616
+ "recall": recall_,
3617
+ "avg_precision": avg_precision_,
3618
+ }
3619
+ else:
3620
+ roc_info, pr_info = None, None
3621
+ if purpose == "classification":
3622
+ dict_pred_stack = {
3623
+ "best_clf": stacking_model,
3624
+ "best_params": None,
3625
+ "auc_indiv": None,
3626
+ "scores": validation_scores_final,
3627
+ "roc_curve": roc_info,
3628
+ "pr_curve": pr_info,
3629
+ "confusion_matrix": confusion_matrix(y_true, y_pred_final),
3630
+ "predictions": y_pred_final.tolist(),
3631
+ "predictions_proba": (
3632
+ y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
3633
+ ),
3634
+ }
3635
+ else: # "regression"
3636
+ dict_pred_stack = {
3637
+ "best_clf": stacking_model,
3638
+ "best_params": None,
3639
+ "scores": validation_scores_final, # e.g., neg_MSE, R², etc.
3640
+ "predictions": y_pred_final.tolist(),
3641
+ "predictions_proba": (
3642
+ y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
3643
+ ),
3644
+ }
3645
+ else: # multi-classes
3646
+ if y_pred_proba_final is not None:
3647
+ # fpr, tpr, roc_auc = dict(), dict(), dict()
3648
+ # fpr, tpr, _ = roc_curve(y_true, y_pred_proba_final)
3649
+ confidence_intervals = cal_auc_ci(
3650
+ y_true, y_pred_proba_final, verbose=False, is_binary=is_binary
3651
+ )
3652
+ roc_info = {
3653
+ "fpr": validation_scores_final["fpr"],
3654
+ "tpr": validation_scores_final["tpr"],
3655
+ "auc": validation_scores_final["roc_auc_by_class"],
3656
+ "ci95": confidence_intervals,
3657
+ }
3658
+ # precision-recall curve
3659
+ precision_, recall_, avg_precision_ = cal_precision_recall(
3660
+ y_true, y_pred_proba_final, is_binary=is_binary
3661
+ )
3662
+ pr_info = {
3663
+ "precision": precision_,
3664
+ "recall": recall_,
3665
+ "avg_precision": avg_precision_,
3666
+ }
3667
+ else:
3668
+ roc_info, pr_info = None, None
3669
+
3670
+ if purpose == "classification":
3671
+ dict_pred_stack = {
3672
+ "best_clf": stacking_model,
3673
+ "best_params": None,
3674
+ "auc_indiv": None,
3675
+ "scores": validation_scores_final,
3676
+ "roc_curve": roc_info,
3677
+ "pr_curve": pr_info,
3678
+ "confusion_matrix": confusion_matrix(y_true, y_pred_final),
3679
+ "predictions": y_pred_final.tolist(),
3680
+ "predictions_proba": (
3681
+ y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
3682
+ ),
3683
+ }
3684
+ else: # "regression"
3685
+ dict_pred_stack = {
3686
+ "best_clf": stacking_model,
3687
+ "best_params": None,
3688
+ "scores": validation_scores_final, # e.g., neg_MSE, R², etc.
3689
+ "predictions": y_pred_final.tolist(),
3690
+ "predictions_proba": (
3691
+ y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
3692
+ ),
3693
+ }
3694
+
3695
+ else:
3696
+ if y_true is None:
3697
+ validation_scores_final = []
3698
+ else:
3699
+ validation_scores_final = cal_metrics(
3700
+ y_true,
3701
+ y_pred,
3702
+ y_pred_proba=y_pred_proba_final,
3703
+ is_binary=is_binary,
3704
+ purpose=purpose,
3705
+ average="weighted",
3706
+ )
3707
+ dict_pred_stack = {
3708
+ "best_clf": stacking_model,
3709
+ "best_params": None,
3710
+ "scores": validation_scores_final,
3711
+ "predictions": y_pred_final.tolist(),
3712
+ "predictions_proba": (
3713
+ y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
3714
+ ),
3715
+ "y_train": y_train if y_train is not None else [],
3716
+ "y_true": y_true if y_true is not None else [],
3717
+ }
3718
+ # merge together
3719
+ df_pred = pd.DataFrame(
3720
+ [None] * len(df_results.columns), index=df_results.columns, columns=["stack"]).T
3721
+ for k, v in dict_pred_stack.items():
3722
+ if k in df_pred.columns:
3723
+ df_pred[k] = [v]
3724
+
3725
+ # # plot the stacking
3726
+ # if all([plot_, y_true is not None, purpose == "classification"]):
3727
+ # plot_validate_features_single(df_pred, is_binary=is_binary)
3728
+ # if dir_save:
3729
+ # ips.figsave(dir_save + f"validate_features_stacking_{now_}.pdf")
3730
+ if vote:
3731
+ print(f" ⤵ voting...")
3732
+ from sklearn.ensemble import VotingClassifier, VotingRegressor
3733
+ #! Voting
3734
+ n_top_models = min(n_top_models, df_results.shape[0])
3735
+ base_estimators=[]
3736
+ for name, cls in zip(list(df_results.iloc[:n_top_models, :].index),df_results.iloc[:n_top_models, :]["best_clf"].tolist()):
3737
+ base_estimators.append((name,cls))
3738
+ # Apply Voting Classifier/Regressor
3739
+ if purpose == "classification":
3740
+ print(f" ⤵ VotingClassifier...via{votting}")
3741
+ if voting=='hard':
3742
+ # Hard voting does not support `predict_proba`
3743
+ voting_model = VotingClassifier(estimators=base_estimators)
3744
+ else:
3745
+ # Soft voting supports `predict_proba`
3746
+ voting_model = VotingClassifier(estimators=base_estimators, voting="soft")
3747
+ else:
3748
+ print(f" ⤵ VotingRegressor...")
3749
+ voting_model = VotingRegressor(estimators=base_estimators)
3750
+
3751
+ # Train the Voting Classifier/Regressor
3752
+ try:
3753
+ voting_model.fit(x_train, y_train)
3754
+ y_pred_vote = voting_model.predict(x_true)
3755
+ except Exception as e:
3756
+ if purpose == "classification" and not voting=='hard':
3757
+ voting_model = VotingClassifier(estimators=base_estimators)
3758
+ voting_model.fit(x_train, y_train)
3759
+ y_pred_vote = voting_model.predict(x_true)
3760
+
3761
+ # Calculate predicted probabilities if applicable
3762
+ if purpose == "classification":
3763
+ if hasattr(voting_model, "predict_proba"):
3764
+ y_pred_proba_vote = voting_model.predict_proba(x_true)
3765
+ print("Shape of predicted probabilities:", y_pred_proba_vote.shape)
3766
+ if y_pred_proba_vote.shape[1] == 1:
3767
+ y_pred_proba_vote = np.hstack(
3768
+ [1 - y_pred_proba_vote, y_pred_proba_vote]
3769
+ ) # Add missing class probabilities
3770
+ y_pred_proba_vote = y_pred_proba_vote[:, 1]
3771
+ else:
3772
+ y_pred_proba_vote = None
3773
+ else: # Regression
3774
+ y_pred_proba_vote = None
3775
+
3776
+ print(f" ⤵ collecting voting results...")
3777
+ #! dict_pred_vote
3778
+ dict_pred_vote = {}
3779
+ validation_scores_vote = {}
3780
+ if y_true is not None and y_pred_proba_vote is not None:
3781
+ validation_scores_vote = cal_metrics(
3782
+ y_true,
3783
+ y_pred_vote,
3784
+ y_pred_proba=y_pred_proba_vote,
3785
+ is_binary=is_binary,
3786
+ purpose=purpose,
3787
+ average="weighted",
3788
+ )
3789
+
3790
+ if is_binary:
3791
+ if y_pred_proba_vote is not None:
3792
+ fpr, tpr, _ = roc_curve(y_true, y_pred_proba_vote)
3793
+ lower_ci, upper_ci = cal_auc_ci(
3794
+ y_true, y_pred_proba_vote, verbose=False, is_binary=is_binary
3795
+ )
3796
+ roc_auc = auc(fpr, tpr)
3797
+ roc_info = {
3798
+ "fpr": fpr.tolist(),
3799
+ "tpr": tpr.tolist(),
3800
+ "auc": roc_auc,
3801
+ "ci95": (lower_ci, upper_ci),
3802
+ }
3803
+ precision_, recall_, _ = cal_precision_recall(y_true, y_pred_proba_vote)
3804
+ avg_precision_ = average_precision_score(y_true, y_pred_proba_vote)
3805
+ pr_info = {
3806
+ "precision": precision_,
3807
+ "recall": recall_,
3808
+ "avg_precision": avg_precision_,
3809
+ }
3810
+ else:
3811
+ roc_info, pr_info = None, None
3812
+
3813
+ dict_pred_vote = {
3814
+ "best_clf": voting_model,
3815
+ "best_params": None,
3816
+ "auc_indiv": None,
3817
+ "scores": validation_scores_vote,
3818
+ "roc_curve": roc_info,
3819
+ "pr_curve": pr_info,
3820
+ "confusion_matrix": confusion_matrix(y_true, y_pred_vote),
3821
+ "predictions": y_pred_vote.tolist(),
3822
+ "predictions_proba": (
3823
+ y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
3824
+ ),
3825
+ }
3826
+ else: # Multi-class
3827
+ if y_pred_proba_vote is not None:
3828
+ confidence_intervals = cal_auc_ci(
3829
+ y_true, y_pred_proba_vote, verbose=False, is_binary=is_binary
3830
+ )
3831
+ roc_info = {
3832
+ "fpr": validation_scores_vote["fpr"],
3833
+ "tpr": validation_scores_vote["tpr"],
3834
+ "auc": validation_scores_vote["roc_auc_by_class"],
3835
+ "ci95": confidence_intervals,
3836
+ }
3837
+ precision_, recall_, avg_precision_ = cal_precision_recall(
3838
+ y_true, y_pred_proba_vote, is_binary=is_binary
3839
+ )
3840
+ pr_info = {
3841
+ "precision": precision_,
3842
+ "recall": recall_,
3843
+ "avg_precision": avg_precision_,
3844
+ }
3845
+ else:
3846
+ roc_info, pr_info = None, None
3847
+
3848
+ dict_pred_vote = {
3849
+ "best_clf": voting_model,
3850
+ "best_params": None,
3851
+ "scores": validation_scores_vote,
3852
+ "roc_curve": roc_info,
3853
+ "pr_curve": pr_info,
3854
+ "confusion_matrix": confusion_matrix(y_true, y_pred_vote),
3855
+ "predictions": y_pred_vote.tolist(),
3856
+ "predictions_proba": (
3857
+ y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
3858
+ ),
3859
+ }
3860
+ else:
3861
+ if y_true is None:
3862
+ validation_scores_vote = []
3863
+ else:
3864
+ validation_scores_vote = cal_metrics(
3865
+ y_true,
3866
+ y_pred,
3867
+ y_pred_proba=y_pred_proba_vote,
3868
+ is_binary=is_binary,
3869
+ purpose=purpose,
3870
+ average="weighted",
3871
+ )
3872
+ dict_pred_vote = {
3873
+ "best_clf": voting_model,
3874
+ "best_params": None,
3875
+ "scores": validation_scores_vote,
3876
+ "predictions": y_pred_vote.tolist(),
3877
+ "predictions_proba": (
3878
+ y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
3879
+ ),
3880
+ "y_train": y_train if y_train is not None else [],
3881
+ "y_true": y_true if y_true is not None else [],
3882
+ }
3883
+ df_vote = pd.DataFrame(
3884
+ [None] * len(df_results.columns), index=df_results.columns, columns=["vote"]).T
3885
+ for k, v in dict_pred_vote.items():
3886
+ if k in df_vote.columns:
3887
+ df_vote[k] = [v]
3888
+
3889
+ # if all([plot_, y_true is not None, purpose == "classification"]):
3890
+ # try:
3891
+ # plot_validate_features_single(df_vote, is_binary=is_binary)
3892
+ # if dir_save:
3893
+ # ips.figsave(dir_save + f"validate_features_vote_{now_}.pdf")
3894
+ # except Exception as e:
3895
+ # print(e)
3896
+ print("Done")
3897
+ if vote and stack:
3898
+ df_res=pd.concat([df_pred,df_vote, df_results],ignore_index=False,axis=0)
3899
+ elif vote:
3900
+ df_res=pd.concat([df_vote, df_results],ignore_index=False,axis=0)
3901
+ elif stack:
3902
+ df_res=pd.concat([df_pred,df_results],ignore_index=False,axis=0)
3903
+
3218
3904
  if all([plot_, y_true is not None, purpose == "classification"]):
3905
+ from datetime import datetime
3906
+
3907
+ now_ = datetime.now().strftime("%y%m%d_%H%M%S")
3219
3908
  # try:
3220
- if len(models) > 3:
3221
- plot_validate_features(df_results, is_binary=is_binary)
3909
+ if df_res.shape[0] > 3:
3910
+ plot_validate_features(df_res, is_binary=is_binary)
3222
3911
  else:
3223
- plot_validate_features_single(df_results, is_binary=is_binary)
3912
+ plot_validate_features_single(df_res, is_binary=is_binary)
3224
3913
  if dir_save:
3225
3914
  ips.figsave(dir_save + f"validate_features{now_}.pdf")
3226
- # except Exception as e:
3227
- # print(f"Error: 在画图的过程中出现了问题:{e}")
3228
- return df_results
3229
-
3915
+ # except Exception as e:
3916
+ # print(f"Error: 在画图的过程中出现了问题:{e}")
3917
+ return df_res
3230
3918
 
3231
3919
  def cal_metrics(
3232
3920
  y_true,
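
The stacking branch above reduces to roughly the following standalone sketch (base_estimators, x_train, y_train and x_true are assumed to exist; the package additionally falls back to RidgeCV for regression and can grid-search the final estimator when stacking_cv=True):

    from sklearn.ensemble import StackingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import StratifiedKFold

    final_est = LogisticRegression(class_weight="balanced", max_iter=1000, random_state=1)
    stacker = StackingClassifier(
        estimators=base_estimators,   # [(name, estimator), ...] from the top-ranked models
        final_estimator=final_est,
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=1),
    )
    stacker.fit(x_train, y_train)
    y_pred_stack = stacker.predict(x_true)
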
@@ -3368,7 +4056,7 @@ def cal_metrics(
3368
4056
 
3369
4057
 
3370
4058
  def plot_trees(
3371
- X, y, cls, max_trees=500, test_size=0.2, random_state=42, early_stopping_rounds=None
4059
+ X, y, cls:str='random', max_trees=500, test_size=0.2, random_state=42, early_stopping_rounds=None
3372
4060
  ):
3373
4061
  """
3374
4062
  # # Example usage:
@@ -3414,10 +4102,14 @@ def plot_trees(
3414
4102
  train_error_rate = []
3415
4103
  test_error_rate = []
3416
4104
  validation_error = None
3417
-
4105
+ if isinstance(cls, str):
4106
+ cls=ips.strcmp(cls, ["RandomForestClassifier","ExtraTreesClassifier","AdaBoostClassifier","GradientBoostingClassifier"])
3418
4107
  # Configure classifier based on type
3419
4108
  oob_enabled = False # Default to no OOB error unless explicitly set
3420
-
4109
+ clf_support = {"RandomForestClassifier":RandomForestClassifier(),
4110
+ "ExtraTreesClassifier":ExtraTreesClassifier(),
4111
+ "AdaBoostClassifier":AdaBoostClassifier(),
4112
+ "GradientBoostingClassifier":GradientBoostingClassifier()}
3421
4113
  if isinstance(cls, (RandomForestClassifier, ExtraTreesClassifier)):
3422
4114
  # Enable OOB if cls supports it and is using bootstrapping
3423
4115
  cls.set_params(warm_start=True, n_estimators=1)
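
With cls accepted as a string, a call can look like the sketch below (synthetic data; the string is fuzzy-matched via ips.strcmp to one of the four supported ensembles):

    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=300, n_features=10, random_state=42)
    plot_trees(X, y, cls="random", max_trees=100)  # resolves to RandomForestClassifier
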
@@ -3679,7 +4371,7 @@ def img_datasets_preprocessing(
3679
4371
 
3680
4372
 
3681
4373
  def backward_regression(
3682
- X: pd.DataFrame, y: pd.Series, initial_list=[], threshold_out=0.05, verbose=True
4374
+ X: pd.DataFrame, y: pd.Series, initial_list=[], thr=0.05, verbose=True
3683
4375
  ):
3684
4376
  """
3685
4377
  # awesome bit of code from https://www.kaggle.com/code/adibouayjan/house-price-step-by-step-modeling
@@ -3691,31 +4383,46 @@ def backward_regression(
3691
4383
  X -- features values
3692
4384
  y -- target variable
3693
4385
  initial_list -- features header
3694
- threshold_out -- pvalue threshold of features to drop
4386
+ thr -- pvalue threshold of features to drop
3695
4387
  verbose -- true to produce lots of logging output
3696
4388
 
3697
4389
  Returns:
3698
4390
  list of selected features for modeling
3699
4391
  """
3700
4392
  import statsmodels.api as sm
3701
-
3702
- if isinstance(y, str) and y in X.columns:
3703
- y_col_name = y
3704
- y = X[y]
3705
- X = X.drop(y_col_name, axis=1)
4393
+ if isinstance(y, str):
4394
+ if y in X.columns:
4395
+ y_col_name = y
4396
+ y = X[y]
4397
+ X = X.drop(y_col_name, axis=1)
4398
+ else:
4399
+ raise ValueError(f"找不到{y},y设置有误")
4400
+ X = X.select_dtypes(include=[np.number])
4401
+
3706
4402
  included = list(X.columns)
4403
+ try:
4404
+ X=X.astype(float)
4405
+ y=y.astype(float)
4406
+ except Exception as e:
4407
+ raise ValueError(f"无法把数据类型转换成float类型,因而无法进一步进行统计分析: {e}")
4408
+
4409
+
3707
4410
  while True:
3708
4411
  changed = False
4412
+ if not included:
4413
+ print("No features remain in the model.")
4414
+ break
4415
+
3709
4416
  model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
3710
4417
  # exclude the intercept for p-value checking
3711
4418
  pvalues = model.pvalues.iloc[1:]
3712
4419
  worst_pval = pvalues.max()
3713
- if worst_pval > threshold_out:
4420
+ if worst_pval > thr:
3714
4421
  changed = True
3715
4422
  worst_feature = pvalues.idxmax()
3716
4423
  included.remove(worst_feature)
3717
4424
  if verbose:
3718
- print(f"Removing feature '{worst_feature}' with p-value {worst_pval}")
4425
+ print(f"Removing '{worst_feature}' with p-value={round(worst_pval,2)}")
3719
4426
  if not changed:
3720
4427
  break
3721
4428
  print(f"\nSelected Features:\n{included}")