py2ls 0.2.4.22__py3-none-any.whl → 0.2.4.24__py3-none-any.whl

py2ls/ml2ls.py CHANGED
@@ -5,7 +5,6 @@ from sklearn.ensemble import (
  BaggingClassifier,
  )
  from sklearn.svm import SVC, SVR
- from sklearn.calibration import CalibratedClassifierCV
  from sklearn.model_selection import GridSearchCV, StratifiedKFold
  from sklearn.linear_model import (
  LassoCV,
@@ -16,12 +15,7 @@ from sklearn.linear_model import (
  RidgeClassifierCV,
  ElasticNet,
  )
- from sklearn.feature_selection import RFE
- from sklearn.naive_bayes import GaussianNB
- from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
- import xgboost as xgb # Make sure you have xgboost installed
-
- from sklearn.model_selection import train_test_split, cross_val_score
+
  from sklearn.metrics import (
  accuracy_score,
  precision_score,
@@ -36,18 +30,12 @@ from sklearn.metrics import (
  precision_recall_curve,
  average_precision_score,
  )
- from imblearn.over_sampling import SMOTE
- from sklearn.pipeline import Pipeline
- from collections import defaultdict
- from sklearn.preprocessing import StandardScaler, OneHotEncoder
  from typing import Dict, Any, Optional, List, Union
  import numpy as np
  import pandas as pd
  from . import ips
  from . import plot
  import matplotlib.pyplot as plt
- import seaborn as sns
-
  plt.style.use(str(ips.get_cwd()) + "/data/styles/stylelib/paper.mplstyle")
  import logging
  import warnings
@@ -314,6 +302,8 @@ def features_svm(
  - Use case: It’s not as widely used as the RBF or linear kernel but can be explored when there is some evidence of non-linear
  S-shaped relationships.
  """
+ from sklearn.feature_selection import RFE
+ from sklearn.svm import SVC
  # SVM (Support Vector Machines)
  svc = SVC(kernel=rfe_params["kernel"]) # ["linear", "rbf", "poly", "sigmoid"]
  # RFE (Recursive Feature Elimination)
@@ -450,6 +440,7 @@ def validate_classifier(
  Returns:
  - results: Dictionary containing average cv_train_scores and cv_test_scores.
  """
+ from sklearn.model_selection import cross_val_score
  cv_train_scores = {metric: [] for metric in metrics}
  skf = StratifiedKFold(n_splits=cv_folds)
  # Perform cross-validation
@@ -982,6 +973,8 @@ def validate_features(

  """
  from tqdm import tqdm
+ from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
+ from sklearn.calibration import CalibratedClassifierCV

  # Ensure common features are selected
  common_features = ips.shared(
@@ -1001,6 +994,7 @@ def validate_features(

  # Handle class imbalance using SMOTE
  if smote:
+ from imblearn.over_sampling import SMOTE
  if (
  y_train.value_counts(normalize=True).max() < 0.8
  ): # Threshold to decide if data is imbalanced
@@ -2096,7 +2090,116 @@ def rank_models(
2096
2090
  # )
2097
2091
 
2098
2092
  # figsave("classifier_performance.pdf")
2093
+ def rank_models_reg(df, ascending=False):
2094
+ """
2095
+ Sorts models based on MSE, RMSE, MAE, and R² with custom priority logic.
2099
2096
 
2097
+ Parameters:
2098
+ df (pd.DataFrame): DataFrame containing the regression metrics.
2099
+ ascending (bool): Whether to sort in ascending order of ranking score.
2100
+
2101
+ Returns:
2102
+ pd.DataFrame: Sorted DataFrame with an added "Ranking_Score" column.
2103
+ """
2104
+ # Define weights for the 4 metrics
2105
+ weights = {
2106
+ "mse": -1, # Lower is better
2107
+ "rmse": -1, # Lower is better
2108
+ "mae": -1, # Lower is better
2109
+ "r2": 1, # Higher is better
2110
+ }
2111
+
2112
+ # Normalize the selected metrics
2113
+ df = df.copy() # Work on a copy of the DataFrame
2114
+ for metric, weight in weights.items():
2115
+ if metric in df.columns:
2116
+ if weight > 0: # Higher is better; normalize 0-1
2117
+ df[metric + "_normalized"] = (df[metric] - df[metric].min()) / (
2118
+ df[metric].max() - df[metric].min()
2119
+ )
2120
+ else: # Lower is better; reverse normalize 0-1
2121
+ df[metric + "_normalized"] = (df[metric].max() - df[metric]) / (
2122
+ df[metric].max() - df[metric].min()
2123
+ )
2124
+
2125
+ # Calculate ranking score as a weighted sum
2126
+ df["Ranking_Score"] = sum(
2127
+ df[metric + "_normalized"] * abs(weights[metric])
2128
+ for metric in weights.keys()
2129
+ if metric + "_normalized" in df.columns
2130
+ )
2131
+
2132
+ # Sort models based on the ranking score
2133
+ sorted_df = df.sort_values(by="Ranking_Score", ascending=ascending)
2134
+ return sorted_df
2135
+
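The ranking logic above min-max normalizes each metric (inverting MSE, RMSE and MAE so that lower errors score higher) and sums the normalized columns into a single Ranking_Score. A minimal usage sketch, assuming rank_models_reg is accessible at call time and using a hypothetical metrics table (model names and values invented for illustration):

import pandas as pd

df_metrics = pd.DataFrame(
    {"mse": [4.2, 3.1, 5.0], "rmse": [2.05, 1.76, 2.24],
     "mae": [1.6, 1.3, 1.9], "r2": [0.71, 0.80, 0.65]},
    index=["Ridge", "XGBoost", "KNN"],  # hypothetical models
)
ranked = rank_models_reg(df_metrics, ascending=False)  # best Ranking_Score first
print(ranked["Ranking_Score"])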
2136
+ models_support = {
2137
+ "classification": {
2138
+ "Random Forest": "Tree-Based",
2139
+ "SVM": "Kernel-Based",
2140
+ "Logistic Regression": "Linear",
2141
+ "Lasso Logistic Regression": "Linear",
2142
+ "Gradient Boosting": "Tree-Based",
2143
+ "XGBoost": "Tree-Based",
2144
+ "KNN": "Instance-Based",
2145
+ "Naive Bayes": "Probabilistic",
2146
+ "Linear Discriminant Analysis": "Linear",
2147
+ "AdaBoost": "Tree-Based",
2148
+ "CatBoost": "Tree-Based",
2149
+ "Extra Trees": "Tree-Based",
2150
+ "Bagging": "Tree-Based",
2151
+ "Neural Network": "Neural Network",
2152
+ "DecisionTree": "Tree-Based",
2153
+ "Quadratic Discriminant Analysis": "Probabilistic",
2154
+ "Ridge": "Linear",
2155
+ "Perceptron": "Linear",
2156
+ "Bernoulli Naive Bayes": "Probabilistic",
2157
+ "SGDClassifier": "Linear",
2158
+ },
2159
+ "regression": {
2160
+ "Linear Regression": "Linear",
2161
+ "Ridge": "Linear",
2162
+ "RidgeCV": "Linear",
2163
+ "TheilSenRegressor": "Linear",
2164
+ "HuberRegressor": "Linear",
2165
+ "PoissonRegressor": "Linear",
2166
+ "LassoCV": "Linear",
2167
+ "Bagging": "Tree-Based",
2168
+ "ElasticNet": "Linear",
2169
+ "Random Forest": "Tree-Based",
2170
+ "Gradient Boosting": "Tree-Based",
2171
+ "XGBoost": "Tree-Based",
2172
+ "CatBoost": "Tree-Based",
2173
+ "Extra Trees": "Tree-Based",
2174
+ "SVM": "Kernel-Based",
2175
+ "KNN": "Instance-Based",
2176
+ "Neural Network": "Neural Network",
2177
+ "AdaBoost": "Linear",
2178
+ },
2179
+ }
2180
+ def select_top_models(models, categories, n_top_models, n_models_per_category=1):
2181
+ """
2182
+ models = list_sort
2183
+ purpose = "regression"
2184
+ categories = models_support[purpose]
2185
+ n_top_models = 3
2186
+ select_top_models(models, categories, n_top_models)
2187
+ """
2188
+ selected = {}
2189
+ result = []
2190
+ for model in models:
2191
+ category = categories.get(model, "Unknown")
2192
+ if category not in selected:
2193
+ selected[category] = 0 # Initialize counter for the category
2194
+
2195
+ if selected[category] < n_models_per_category: # Allow additional models up to the limit
2196
+ selected[category] += 1
2197
+ result.append(model)
2198
+
2199
+ if len(result) == n_top_models: # Stop when the desired number of models is reached
2200
+ break
2201
+
2202
+ return result
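select_top_models walks a ranked model list and keeps at most n_models_per_category entries per model family until n_top_models have been collected. A short sketch, assuming a hypothetical ranked list together with the models_support mapping defined above:

ranked_list = ["XGBoost", "Random Forest", "SVM", "Gradient Boosting", "KNN"]
picked = select_top_models(
    models=ranked_list,
    categories=models_support["classification"],
    n_top_models=3,
    n_models_per_category=1,
)
print(picked)  # ['XGBoost', 'SVM', 'KNN'] -> one Tree-Based, one Kernel-Based, one Instance-Based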
2100
2203
 
2101
2204
  def predict(
2102
2205
  x_train: pd.DataFrame,
@@ -2104,11 +2207,17 @@ def predict(
  x_true: pd.DataFrame = None,
  y_true: Optional[pd.Series] = None,
  backward: bool = False, # backward_regression
+ backward_thr: float = 0.05, # p-value threshold; only used when backward is True
  common_features: set = None,
  purpose: str = "classification", # 'classification' or 'regression'
  cls: Optional[Dict[str, Any]] = None,
  metrics: Optional[List[str]] = None,
- random_state: int = 1,
+ stack: bool = True, # run stacking
+ stacking_cv: bool = False, # cross-validate the stacking final estimator; default False to keep it simple
+ vote: bool = True, # run voting
+ voting: str = "hard", # voting scheme; only used for classification
+ n_top_models: int = 5, # number of base estimators used for stacking
+ n_models_per_category: int = 1, # for stacking; max number of models allowed per category
  smote: bool = False,
  n_jobs: int = -1,
  plot_: bool = True,
@@ -2117,6 +2226,7 @@ def predict(
  cv_folds: int = 5, # more cv_folds gives more stable estimates, but AUC may be lower
  cv_level: str = "l", # "s":'low', "m":'medium', "l":'high'
  class_weight: str = "balanced",
+ random_state: int = 1,
  verbose: bool = False,
  ) -> pd.DataFrame:
2122
2232
  """
@@ -2184,10 +2294,17 @@ def predict(
2184
2294
  RidgeClassifierCV,
2185
2295
  Perceptron,
2186
2296
  SGDClassifier,
2297
+ RidgeCV,
2298
+ Ridge,
2299
+ TheilSenRegressor,
2300
+ HuberRegressor,
2301
+ PoissonRegressor,
2302
+
2187
2303
  )
2304
+ from sklearn.compose import TransformedTargetRegressor
2188
2305
  from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
2189
2306
  from sklearn.naive_bayes import GaussianNB, BernoulliNB
2190
- from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
2307
+ from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor,StackingClassifier,StackingRegressor
2191
2308
  import xgboost as xgb
2192
2309
  import lightgbm as lgb
2193
2310
  import catboost as cb
@@ -2198,6 +2315,7 @@ def predict(
2198
2315
  QuadraticDiscriminantAnalysis,
2199
2316
  )
2200
2317
  from sklearn.preprocessing import PolynomialFeatures
2318
+ from sklearn.model_selection import train_test_split
2201
2319
 
2202
2320
  # spell-check / normalize the purpose string
2203
2321
  purpose = ips.strcmp(purpose, ["classification", "regression"])[0]
@@ -2261,7 +2379,6 @@ def predict(
2261
2379
  "Gradient Boosting": GradientBoostingRegressor(random_state=random_state),
2262
2380
  "XGBoost": xgb.XGBRegressor(eval_metric="rmse", random_state=random_state),
2263
2381
  "Linear Regression": LinearRegression(),
2264
- "Lasso": Lasso(random_state=random_state),
2265
2382
  "AdaBoost": AdaBoostRegressor(random_state=random_state),
2266
2383
  # "LightGBM": lgb.LGBMRegressor(random_state=random_state),
2267
2384
  "CatBoost": cb.CatBoostRegressor(verbose=0, random_state=random_state),
@@ -2271,10 +2388,10 @@ def predict(
2271
2388
  "ElasticNet": ElasticNet(random_state=random_state),
2272
2389
  "Ridge": Ridge(),
2273
2390
  "KNN": KNeighborsRegressor(),
2391
+ "TheilSen":TheilSenRegressor(),
2392
+ "Huber":HuberRegressor(),
2393
+ "Poisson":PoissonRegressor()
2274
2394
  }
2275
- # indicate cls:
2276
- if ips.run_once_within(30): # 10 min
2277
- print(f"supported models: {list(model_.keys())}")
2278
2395
  if cls is None:
2279
2396
  models = model_
2280
2397
  else:
@@ -2290,6 +2407,10 @@ def predict(
2290
2407
  ips.df_special_characters_cleaner(x_true) if x_true is not None else None
2291
2408
  )
2292
2409
 
2410
+ # indicate cls:
2411
+ if ips.run_once_within(30): # 10 min
2412
+ print(f"processing: {list(models.keys())}")
2413
+
2293
2414
  if isinstance(y_train, str) and y_train in x_train.columns:
2294
2415
  y_train_col_name = y_train
2295
2416
  y_train = x_train[y_train]
@@ -2311,7 +2432,7 @@ def predict(
2311
2432
 
2312
2433
  # Perform backward feature selection
2313
2434
  if backward:
2314
- selected_features = backward_regression(x_train, y_train, threshold_out=0.05)
2435
+ selected_features = backward_regression(x_train, y_train, thr=backward_thr)
2315
2436
  x_train = x_train[selected_features]
2316
2437
 
2317
2438
  if x_true is None:
@@ -2391,10 +2512,22 @@ def predict(
2391
2512
  if isinstance(y_train, np.ndarray):
2392
2513
  y_train = ips.df_encoder(data=pd.DataFrame(y_train), method="label")
2393
2514
  y_train = np.asarray(y_train)
2394
- if isinstance(y_train, np.ndarray):
2395
- y_true = ips.df_encoder(data=pd.DataFrame(y_true), method="label")
2396
- y_true = np.asarray(y_true)
2515
+ if y_true is not None:
2516
+ if isinstance(y_train, np.ndarray):
2517
+ y_true = ips.df_encoder(data=pd.DataFrame(y_true), method="label")
2518
+ y_true = np.asarray(y_true)
2397
2519
  # Hyperparameter grids for tuning
2520
+ param_grid_common_xgb = {
2521
+ 'learning_rate': [0.01, 0.1, 0.2, 0.3],
2522
+ 'max_depth': [3, 5, 7, 10],
2523
+ 'n_estimators': [50, 100, 200, 300],
2524
+ 'subsample': [0.6, 0.8, 1.0],
2525
+ 'colsample_bytree': [0.6, 0.8, 1.0],
2526
+ 'gamma': [0, 0.1, 0.2, 0.5],
2527
+ 'min_child_weight': [1, 5, 10],
2528
+ 'reg_alpha': [0, 0.1, 0.5, 1], # L1 regularization term
2529
+ 'reg_lambda': [1, 1.5, 2], # L2 regularization term
2530
+ }
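Grids like param_grid_common_xgb are the kind of object GridSearchCV consumes. A minimal sketch of that pattern, assuming an existing binary-classification x_train/y_train (hypothetical names) and a reduced grid to keep the search cheap:

import xgboost as xgb
from sklearn.model_selection import GridSearchCV

gs = GridSearchCV(
    xgb.XGBClassifier(eval_metric="logloss", random_state=1),
    param_grid={"learning_rate": [0.01, 0.1], "max_depth": [3, 5], "n_estimators": [50, 100]},
    scoring="roc_auc",  # matches the classification scoring used elsewhere in predict()
    cv=5,
    n_jobs=-1,
)
gs.fit(x_train, y_train)  # x_train / y_train assumed to exist
print(gs.best_params_, gs.best_score_)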
2398
2531
  if cv_level in ["low", "simple", "s", "l"]:
2399
2532
  param_grids = {
2400
2533
  "Random Forest": (
@@ -2416,8 +2549,8 @@ def predict(
2416
2549
  }
2417
2550
  ),
2418
2551
  "SVM": {
2419
- "C": [1],
2420
- "gamma": ["scale"],
2552
+ "C": [0.1, 1, 10],
2553
+ "gamma": ["scale", 0.1, 1],
2421
2554
  "kernel": ["rbf"],
2422
2555
  },
2423
2556
  "Lasso": {
@@ -2439,12 +2572,17 @@ def predict(
2439
2572
  "min_samples_split": [2],
2440
2573
  "subsample": [0.8],
2441
2574
  },
2442
- "XGBoost": {
2443
- "n_estimators": [100],
2444
- "max_depth": [3],
2445
- "learning_rate": [0.1],
2446
- "subsample": [0.8],
2447
- "colsample_bytree": [0.8],
2575
+ "XGBoost":{
2576
+ 'learning_rate': [0.01],
2577
+ 'max_depth': [3],
2578
+ 'n_estimators': [50],
2579
+ 'subsample': [0.6],
2580
+ 'colsample_bytree': [0.6],
2581
+ 'gamma': [0, 0.1],
2582
+ 'min_child_weight': [1],
2583
+ 'reg_alpha': [0, 0.1],
2584
+ 'reg_lambda': [1],
2585
+ 'objective': ['binary:logistic'] if purpose == "classification" else ['reg:squarederror']
2448
2586
  },
2449
2587
  "KNN": (
2450
2588
  {
@@ -2551,6 +2689,14 @@ def predict(
2551
2689
  "random_state": [random_state],
2552
2690
  "learning_rate": ["constant"],
2553
2691
  },
2692
+ "TheilSen":{'max_iter': [100],
2693
+ 'tol': [1e-4],
2694
+ 'n_subsamples': [100+x_train.shape[1]]},
2695
+ "Huber":{'epsilon': [1.35],
2696
+ 'alpha': [0.1],
2697
+ 'max_iter': [100],},
2698
+ "Poisson":{'alpha': [0.1],
2699
+ 'max_iter': [100],}
2554
2700
  }
2555
2701
  elif cv_level in ["high", "advanced", "h"]:
2556
2702
  param_grids = {
@@ -2612,12 +2758,30 @@ def predict(
2612
2758
  "subsample": [0.8, 1.0],
2613
2759
  },
2614
2760
  "XGBoost": {
2615
- "n_estimators": [100, 200, 500, 700],
2616
- "max_depth": [3, 5, 7, 10],
2617
- "learning_rate": [0.01, 0.1, 0.2, 0.3],
2618
- "subsample": [0.8, 1.0],
2619
- "colsample_bytree": [0.8, 0.9, 1.0],
2620
- },
2761
+ 'learning_rate': [0.01, 0.1, 0.2, 0.3],
2762
+ 'max_depth': [3, 5, 7, 10],
2763
+ 'n_estimators': [50, 100, 200, 300],
2764
+ 'subsample': [0.6, 0.8, 1.0],
2765
+ 'gamma': [0, 0.1, 0.2, 0.5],
2766
+ 'min_child_weight': [1, 5, 10],
2767
+ 'reg_alpha': [0, 0.1, 0.5, 1],
2768
+ 'reg_lambda': [1, 1.5, 2],
2769
+ **{
2770
+ 'objective': ['binary:logistic', 'multi:softmax', 'multi:softprob'],
2771
+ }} if purpose== "classification"
2772
+ else{
2773
+ 'learning_rate': [0.01, 0.1, 0.2, 0.3],
2774
+ 'max_depth': [3, 5, 7, 10],
2775
+ 'n_estimators': [50, 100, 200, 300],
2776
+ 'subsample': [0.6, 0.8, 1.0],
2777
+ 'colsample_bytree': [0.6, 0.8, 1.0],
2778
+ 'gamma': [0, 0.1, 0.2, 0.5],
2779
+ 'min_child_weight': [1, 5, 10],
2780
+ 'reg_alpha': [0, 0.1, 0.5, 1],
2781
+ 'reg_lambda': [1, 1.5, 2],
2782
+ **{
2783
+ 'objective': ['reg:squarederror', 'reg:squaredlogerror', 'reg:gamma'],
2784
+ }},
2621
2785
  "KNN": (
2622
2786
  {
2623
2787
  "n_neighbors": [1, 3, 5, 10, 15, 20],
@@ -2730,6 +2894,14 @@ def predict(
2730
2894
  ], # If True, the regressors X will be normalized
2731
2895
  }
2732
2896
  ),
2897
+ "TheilSen":{'max_iter': [100, 200, 300],
2898
+ 'tol': [1e-4, 1e-3, 1e-2],
2899
+ 'n_subsamples': [100+x_train.shape[1], 200+x_train.shape[1], 300+x_train.shape[1]]},
2900
+ "Huber":{'epsilon': [1.35, 1.5, 2.0],
2901
+ 'alpha': [0.1, 1.0, 10.0],
2902
+ 'max_iter': [100, 200, 300],},
2903
+ "Poisson":{'alpha': [0.1, 1.0, 10.0],
2904
+ 'max_iter': [100, 200, 300],}
2733
2905
  }
2734
2906
  else: # median level
2735
2907
  param_grids = {
@@ -2789,12 +2961,30 @@ def predict(
2789
2961
  "subsample": [0.8, 1.0],
2790
2962
  },
2791
2963
  "XGBoost": {
2792
- "n_estimators": [100, 200, 500],
2793
- "max_depth": [3, 5, 7],
2794
- "learning_rate": [0.01, 0.1, 0.2],
2795
- "subsample": [0.8, 1.0],
2796
- "colsample_bytree": [0.8, 1.0],
2797
- },
2964
+ 'learning_rate': [0.01, 0.1],
2965
+ 'max_depth': [3, 5],
2966
+ 'n_estimators': [50, 100],
2967
+ 'subsample': [0.6, 0.8],
2968
+ 'gamma': [0, 0.1],
2969
+ 'min_child_weight': [1, 5],
2970
+ 'reg_alpha': [0, 0.1],
2971
+ 'reg_lambda': [1,],
2972
+ **{
2973
+ 'objective': ['binary:logistic', 'multi:softmax'],
2974
+ }} if purpose== "classification"
2975
+ else{
2976
+ 'learning_rate': [0.01, 0.1],
2977
+ 'max_depth': [3, 5,],
2978
+ 'n_estimators': [50, 100],
2979
+ 'subsample': [0.6, 0.8],
2980
+ 'colsample_bytree': [0.6, 0.8],
2981
+ 'gamma': [0, 0.1],
2982
+ 'min_child_weight': [1, 5],
2983
+ 'reg_alpha': [0, 0.1],
2984
+ 'reg_lambda': [1, 1.5],
2985
+ **{
2986
+ 'objective': ['reg:squarederror', 'reg:squaredlogerror'],
2987
+ }},
2798
2988
  "KNN": (
2799
2989
  {
2800
2990
  "n_neighbors": [3, 5, 7, 10],
@@ -2951,6 +3141,14 @@ def predict(
2951
3141
  ], # Solver for optimization
2952
3142
  }
2953
3143
  ),
3144
+ "TheilSen":{'max_iter': [100, 200],
3145
+ 'tol': [1e-4, 1e-3],
3146
+ 'n_subsamples': [100+x_train.shape[1], 200+x_train.shape[1]]},
3147
+ "Huber":{'epsilon': [1.35, 1.5],
3148
+ 'alpha': [0.1, 1.0],
3149
+ 'max_iter': [100, 200],},
3150
+ "Poisson":{'alpha': [0.1, 1.0],
3151
+ 'max_iter': [100, 200],}
2954
3152
  }
2955
3153
 
2956
3154
  results = {}
@@ -3191,12 +3389,18 @@ def predict(
3191
3389
  # Convert results to DataFrame
3192
3390
  df_results = pd.DataFrame.from_dict(results, orient="index")
3193
3391
  # sort
3194
- if y_true is not None and purpose == "classification":
3195
- df_scores = pd.DataFrame(
3196
- df_results["scores"].tolist(), index=df_results["scores"].index
3197
- ).sort_values(by="roc_auc", ascending=False)
3392
+ if y_true is not None:
3393
+ if purpose == "classification":
3394
+ df_scores = pd.DataFrame(
3395
+ df_results["scores"].tolist(), index=df_results["scores"].index
3396
+ ).sort_values(by="roc_auc", ascending=False)
3397
+ elif purpose=='regression':
3398
+ df_scores = rank_models_reg(
3399
+ pd.DataFrame(df_results["scores"].tolist(), index=df_results["scores"].index),
3400
+ ascending=False)
3198
3401
  df_results = df_results.loc[df_scores.index]
3199
3402
 
3403
+ if y_true is not None and purpose == "classification":
3200
3404
  if plot_:
3201
3405
  from datetime import datetime
3202
3406
 
@@ -3214,18 +3418,503 @@ def predict(
3214
3418
  plot.figsets(xangle=30)
3215
3419
  if dir_save:
3216
3420
  ips.figsave(dir_save + f"scores_clus{now_}.pdf")
3421
+ # if all([plot_, y_true is not None, purpose == "classification"]):
3422
+ # # try:
3423
+ # if len(models) > 3:
3424
+ # plot_validate_features(df_results, is_binary=is_binary)
3425
+ # else:
3426
+ # plot_validate_features_single(df_results, is_binary=is_binary)
3427
+ # if dir_save:
3428
+ # ips.figsave(dir_save + f"validate_features{now_}.pdf")
3429
+ # # except Exception as e:
3430
+ # # print(f"Error: 在画图的过程中出现了问题:{e}")
3431
+ if stack:
3432
+ #! stacking classifier/regressor
3433
+ from sklearn.metrics import make_scorer, accuracy_score
3434
+ from sklearn.model_selection import cross_val_score
3435
+
3436
+ #* cap n_top_models so it does not exceed the number of ranked models
3437
+ n_top_models = min(n_top_models, df_results.shape[0])
3438
+
3439
+ #* pick the top-ranked estimators, at most n_models_per_category per family
3440
+ models_selecte = select_top_models(models=list(df_results.index),
3441
+ categories=models_support[purpose],
3442
+ n_top_models=n_top_models,
3443
+ n_models_per_category=n_models_per_category)
3444
+ top_models = df_results.loc[models_selecte]["best_clf"]
3445
+ base_estimators = []
3446
+ for i, j in top_models.to_dict().items():
3447
+ base_estimators.append((i, j))
3448
+ if stacking_cv:
3449
+ print(f" ⤵ stacking_cv is processing...")
3450
+ #* define a few candidate final estimators to try
3452
+ if purpose == "classification":
3453
+ kadt_estimators=["XGBoost","SVM","Logistic Regression","Neural Network"]
3454
+ else:
3455
+ kadt_estimators=["XGBoost","LassoCV"]
3456
+ final_estimators={}
3457
+ for name in kadt_estimators:
3458
+ param_grid=param_grids.get(name, {})
3459
+ print(param_grid)
3460
+ if is_binary:
3461
+ gs = GridSearchCV(
3462
+ model_[name],
3463
+ param_grid=param_grid,
3464
+ scoring=(
3465
+ "roc_auc"
3466
+ if purpose == "classification"
3467
+ else "neg_mean_squared_error"
3468
+ ),
3469
+ cv=cv,
3470
+ n_jobs=n_jobs,
3471
+ verbose=verbose,
3472
+ )
3473
+ else:
3474
+ gs = GridSearchCV(
3475
+ model_[name],
3476
+ param_grid=param_grid,
3477
+ scoring=(
3478
+ "roc_auc_ovr"
3479
+ if purpose == "classification"
3480
+ else "neg_mean_squared_error"
3481
+ ),
3482
+ cv=cv,
3483
+ n_jobs=n_jobs,
3484
+ verbose=verbose,
3485
+ )
3486
+ # Fit GridSearchCV
3487
+ gs.fit(x_train, y_train)
3488
+ final_estimators[name]=gs.best_estimator_
3489
+
3490
+ #* Set up cross-validation and performance evaluation
3491
+ scorer = make_scorer(accuracy_score)
3492
+ cv_results = []
3493
+
3494
+ #*Cross-validate stacking models with different final estimators
3495
+ for final_name, final_estimator in final_estimators.items():
3496
+ print(f"Evaluating Stacking Classifier with {final_name} as final estimator...")
3497
+ if purpose == "classification":
3498
+ stacking_model = StackingClassifier(estimators=base_estimators, final_estimator=final_estimator,cv=cv)
3499
+ else:
3500
+ stacking_model = StackingRegressor(estimators=base_estimators, final_estimator=final_estimator, cv=cv)
3501
+
3502
+ scores = cross_val_score(stacking_model, x_train, y_train, cv=StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_state), scoring=scorer)
3503
+
3504
+ # Store the result
3505
+ cv_results.append({
3506
+ 'final_estimator':final_estimator,
3507
+ 'Final Estimator': final_name,
3508
+ 'Mean Accuracy': np.mean(scores),
3509
+ 'Standard Deviation': np.std(scores)
3510
+ })
3511
+
3512
+ #* Convert the results into a DataFrame for easy comparison
3513
+ cv_results_df = pd.DataFrame(cv_results)
3514
+
3515
+ #* Sort and display the best model
3516
+ cv_results_df = cv_results_df.sort_values(by='Mean Accuracy', ascending=False)
3517
+
3518
+
3519
+ # Optionally: Select the final estimator that gives the best performance
3520
+ best_final_estimator = cv_results_df.iloc[0]['final_estimator']
3521
+ print(f"Best final estimator based on cross-validation: {best_final_estimator}")
3522
+ else:
3523
+ print(f" ⤵ trying to find the best_final_estimator for stacking...")
3524
+ if purpose=="classification":
3525
+ best_final_estimator = LogisticRegression(class_weight=class_weight,
3526
+ random_state=random_state,
3527
+ max_iter=1000)
3528
+ else:
3529
+ best_final_estimator = RidgeCV(cv=5)
3530
+ print(f"⤵ the best best_final_estimator: {best_final_estimator}")
3531
+ #! apply stacking
3532
+ if purpose == "classification":
3533
+ print(f" ⤵ StackingClassifier...")
3534
+ stacking_model = StackingClassifier(estimators=base_estimators,
3535
+ final_estimator=best_final_estimator,
3536
+ cv=cv)
3537
+ else:
3538
+ print(f" ⤵ StackingRegressor...")
3539
+ stacking_model = StackingRegressor(estimators=base_estimators,
3540
+ final_estimator=best_final_estimator,
3541
+ cv=cv)
3542
+
3543
+ # Train the Stacking Classifier
3544
+ print(f" ⤵ fit & predict...")
3545
+ stacking_model.fit(x_train, y_train)
3546
+ y_pred_final = stacking_model.predict(x_true)
3547
+ print(f" ⤵ collecting results...")
3548
+ # pred_proba
3549
+ if is_binary:
3550
+ if hasattr(stacking_model, "predict_proba"):
3551
+ y_pred_proba_final = stacking_model.predict_proba(x_true)
3552
+ print("Shape of predicted probabilities:", y_pred_proba_final.shape)
3553
+ if y_pred_proba_final.shape[1] == 1:
3554
+ y_pred_proba_final = np.hstack(
3555
+ [1 - y_pred_proba_final, y_pred_proba_final]
3556
+ ) # Add missing class probabilities
3557
+ y_pred_proba_final = y_pred_proba_final[:, 1]
3558
+ elif hasattr(stacking_model, "decision_function"):
3559
+ # If predict_proba is not available, use decision_function (e.g., for SVM)
3560
+ y_pred_proba_final = stacking_model.decision_function(x_true)
3561
+ # Ensure y_pred_proba_final is within 0 and 1 bounds
3562
+ y_pred_proba_final = (y_pred_proba_final - y_pred_proba_final.min()) / (
3563
+ y_pred_proba_final.max() - y_pred_proba_final.min()
3564
+ )
3565
+ else:
3566
+ y_pred_proba_final = None # No probability output for certain models
3567
+ if not is_binary:
3568
+ # Handle prediction probabilities for multiclass
3569
+ if hasattr(stacking_model, "predict_proba"):
3570
+ y_pred_proba_final = stacking_model.predict_proba(x_true)
3571
+ elif hasattr(stacking_model, "decision_function"):
3572
+ y_pred_proba_final = stacking_model.decision_function(x_true)
3573
+
3574
+ # Normalize for multiclass if necessary
3575
+ if y_pred_proba_final.ndim == 2:
3576
+ y_pred_proba_final = (
3577
+ y_pred_proba_final - y_pred_proba_final.min(axis=1, keepdims=True)
3578
+ ) / (
3579
+ y_pred_proba_final.max(axis=1, keepdims=True)
3580
+ - y_pred_proba_final.min(axis=1, keepdims=True)
3581
+ )
3582
+ else:
3583
+ y_pred_proba_final = None # No probability output for certain models
3584
+ #! dict_pred_stack
3585
+ dict_pred_stack={}
3586
+ validation_scores_final = {}
3587
+ if y_true is not None and y_pred_proba_final is not None:
3588
+ validation_scores_final = cal_metrics(
3589
+ y_true,
3590
+ y_pred_final,
3591
+ y_pred_proba=y_pred_proba_final,
3592
+ is_binary=is_binary,
3593
+ purpose=purpose,
3594
+ average="weighted",
3595
+ )
3596
+ if is_binary:
3597
+ # Calculate ROC curve
3598
+ # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
3599
+ if y_pred_proba_final is not None:
3600
+ fpr, tpr, _ = roc_curve(y_true, y_pred_proba_final)
3601
+ lower_ci, upper_ci = cal_auc_ci(
3602
+ y_true, y_pred_proba_final, verbose=False, is_binary=is_binary
3603
+ )
3604
+ roc_auc = auc(fpr, tpr)
3605
+ roc_info = {
3606
+ "fpr": fpr.tolist(),
3607
+ "tpr": tpr.tolist(),
3608
+ "auc": roc_auc,
3609
+ "ci95": (lower_ci, upper_ci),
3610
+ }
3611
+ # precision-recall curve
3612
+ precision_, recall_, _ = cal_precision_recall(y_true, y_pred_proba_final)
3613
+ avg_precision_ = average_precision_score(y_true, y_pred_proba_final)
3614
+ pr_info = {
3615
+ "precision": precision_,
3616
+ "recall": recall_,
3617
+ "avg_precision": avg_precision_,
3618
+ }
3619
+ else:
3620
+ roc_info, pr_info = None, None
3621
+ if purpose == "classification":
3622
+ dict_pred_stack = {
3623
+ "best_clf": stacking_model,
3624
+ "best_params": None,
3625
+ "auc_indiv": None,
3626
+ "scores": validation_scores_final,
3627
+ "roc_curve": roc_info,
3628
+ "pr_curve": pr_info,
3629
+ "confusion_matrix": confusion_matrix(y_true, y_pred_final),
3630
+ "predictions": y_pred_final.tolist(),
3631
+ "predictions_proba": (
3632
+ y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
3633
+ ),
3634
+ }
3635
+ else: # "regression"
3636
+ dict_pred_stack = {
3637
+ "best_clf": stacking_model,
3638
+ "best_params": None,
3639
+ "scores": validation_scores_final, # e.g., neg_MSE, R², etc.
3640
+ "predictions": y_pred_final.tolist(),
3641
+ "predictions_proba": (
3642
+ y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
3643
+ ),
3644
+ }
3645
+ else: # multi-classes
3646
+ if y_pred_proba_final is not None:
3647
+ # fpr, tpr, roc_auc = dict(), dict(), dict()
3648
+ # fpr, tpr, _ = roc_curve(y_true, y_pred_proba_final)
3649
+ confidence_intervals = cal_auc_ci(
3650
+ y_true, y_pred_proba_final, verbose=False, is_binary=is_binary
3651
+ )
3652
+ roc_info = {
3653
+ "fpr": validation_scores_final["fpr"],
3654
+ "tpr": validation_scores_final["tpr"],
3655
+ "auc": validation_scores_final["roc_auc_by_class"],
3656
+ "ci95": confidence_intervals,
3657
+ }
3658
+ # precision-recall curve
3659
+ precision_, recall_, avg_precision_ = cal_precision_recall(
3660
+ y_true, y_pred_proba_final, is_binary=is_binary
3661
+ )
3662
+ pr_info = {
3663
+ "precision": precision_,
3664
+ "recall": recall_,
3665
+ "avg_precision": avg_precision_,
3666
+ }
3667
+ else:
3668
+ roc_info, pr_info = None, None
3669
+
3670
+ if purpose == "classification":
3671
+ dict_pred_stack = {
3672
+ "best_clf": stacking_model,
3673
+ "best_params": None,
3674
+ "auc_indiv": None,
3675
+ "scores": validation_scores_final,
3676
+ "roc_curve": roc_info,
3677
+ "pr_curve": pr_info,
3678
+ "confusion_matrix": confusion_matrix(y_true, y_pred_final),
3679
+ "predictions": y_pred_final.tolist(),
3680
+ "predictions_proba": (
3681
+ y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
3682
+ ),
3683
+ }
3684
+ else: # "regression"
3685
+ dict_pred_stack = {
3686
+ "best_clf": stacking_model,
3687
+ "best_params": None,
3688
+ "scores": validation_scores_final, # e.g., neg_MSE, R², etc.
3689
+ "predictions": y_pred_final.tolist(),
3690
+ "predictions_proba": (
3691
+ y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
3692
+ ),
3693
+ }
3694
+
3695
+ else:
3696
+ if y_true is None:
3697
+ validation_scores_final = []
3698
+ else:
3699
+ validation_scores_final = cal_metrics(
3700
+ y_true,
3701
+ y_pred,
3702
+ y_pred_proba=y_pred_proba_final,
3703
+ is_binary=is_binary,
3704
+ purpose=purpose,
3705
+ average="weighted",
3706
+ )
3707
+ dict_pred_stack = {
3708
+ "best_clf": stacking_model,
3709
+ "best_params": None,
3710
+ "scores": validation_scores_final,
3711
+ "predictions": y_pred_final.tolist(),
3712
+ "predictions_proba": (
3713
+ y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
3714
+ ),
3715
+ "y_train": y_train if y_train is not None else [],
3716
+ "y_true": y_true if y_true is not None else [],
3717
+ }
3718
+ # merge together
3719
+ df_pred = pd.DataFrame(
3720
+ [None] * len(df_results.columns), index=df_results.columns, columns=["stack"]).T
3721
+ for k, v in dict_pred_stack.items():
3722
+ if k in df_pred.columns:
3723
+ df_pred[k] = [v]
3724
+
3725
+ # # plot the stacking
3726
+ # if all([plot_, y_true is not None, purpose == "classification"]):
3727
+ # plot_validate_features_single(df_pred, is_binary=is_binary)
3728
+ # if dir_save:
3729
+ # ips.figsave(dir_save + f"validate_features_stacking_{now_}.pdf")
3730
+ if vote:
3731
+ print(f" ⤵ voting...")
3732
+ from sklearn.ensemble import VotingClassifier, VotingRegressor
3733
+ #! Voting
3734
+ n_top_models = min(n_top_models, df_results.shape[0])
3735
+ base_estimators=[]
3736
+ for name, cls in zip(list(df_results.iloc[:n_top_models, :].index),df_results.iloc[:n_top_models, :]["best_clf"].tolist()):
3737
+ base_estimators.append((name,cls))
3738
+ # Apply Voting Classifier/Regressor
3739
+ if purpose == "classification":
3740
+ print(f" ⤵ VotingClassifier...via{votting}")
3741
+ if voting=='hard':
3742
+ # Hard voting does not support `predict_proba`
3743
+ voting_model = VotingClassifier(estimators=base_estimators)
3744
+ else:
3745
+ # Soft voting supports `predict_proba`
3746
+ voting_model = VotingClassifier(estimators=base_estimators, voting="soft")
3747
+ else:
3748
+ print(f" ⤵ VotingRegressor...")
3749
+ voting_model = VotingRegressor(estimators=base_estimators)
3750
+
3751
+ # Train the Voting Classifier/Regressor
3752
+ try:
3753
+ voting_model.fit(x_train, y_train)
3754
+ y_pred_vote = voting_model.predict(x_true)
3755
+ except Exception as e:
3756
+ if purpose == "classification" and not voting=='hard':
3757
+ voting_model = VotingClassifier(estimators=base_estimators)
3758
+ voting_model.fit(x_train, y_train)
3759
+ y_pred_vote = voting_model.predict(x_true)
3760
+
3761
+ # Calculate predicted probabilities if applicable
3762
+ if purpose == "classification":
3763
+ if hasattr(voting_model, "predict_proba"):
3764
+ y_pred_proba_vote = voting_model.predict_proba(x_true)
3765
+ print("Shape of predicted probabilities:", y_pred_proba_vote.shape)
3766
+ if y_pred_proba_vote.shape[1] == 1:
3767
+ y_pred_proba_vote = np.hstack(
3768
+ [1 - y_pred_proba_vote, y_pred_proba_vote]
3769
+ ) # Add missing class probabilities
3770
+ y_pred_proba_vote = y_pred_proba_vote[:, 1]
3771
+ else:
3772
+ y_pred_proba_vote = None
3773
+ else: # Regression
3774
+ y_pred_proba_vote = None
3775
+
3776
+ print(f" ⤵ collecting voting results...")
3777
+ #! dict_pred_vote
3778
+ dict_pred_vote = {}
3779
+ validation_scores_vote = {}
3780
+ if y_true is not None and y_pred_proba_vote is not None:
3781
+ validation_scores_vote = cal_metrics(
3782
+ y_true,
3783
+ y_pred_vote,
3784
+ y_pred_proba=y_pred_proba_vote,
3785
+ is_binary=is_binary,
3786
+ purpose=purpose,
3787
+ average="weighted",
3788
+ )
3789
+
3790
+ if is_binary:
3791
+ if y_pred_proba_vote is not None:
3792
+ fpr, tpr, _ = roc_curve(y_true, y_pred_proba_vote)
3793
+ lower_ci, upper_ci = cal_auc_ci(
3794
+ y_true, y_pred_proba_vote, verbose=False, is_binary=is_binary
3795
+ )
3796
+ roc_auc = auc(fpr, tpr)
3797
+ roc_info = {
3798
+ "fpr": fpr.tolist(),
3799
+ "tpr": tpr.tolist(),
3800
+ "auc": roc_auc,
3801
+ "ci95": (lower_ci, upper_ci),
3802
+ }
3803
+ precision_, recall_, _ = cal_precision_recall(y_true, y_pred_proba_vote)
3804
+ avg_precision_ = average_precision_score(y_true, y_pred_proba_vote)
3805
+ pr_info = {
3806
+ "precision": precision_,
3807
+ "recall": recall_,
3808
+ "avg_precision": avg_precision_,
3809
+ }
3810
+ else:
3811
+ roc_info, pr_info = None, None
3812
+
3813
+ dict_pred_vote = {
3814
+ "best_clf": voting_model,
3815
+ "best_params": None,
3816
+ "auc_indiv": None,
3817
+ "scores": validation_scores_vote,
3818
+ "roc_curve": roc_info,
3819
+ "pr_curve": pr_info,
3820
+ "confusion_matrix": confusion_matrix(y_true, y_pred_vote),
3821
+ "predictions": y_pred_vote.tolist(),
3822
+ "predictions_proba": (
3823
+ y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
3824
+ ),
3825
+ }
3826
+ else: # Multi-class
3827
+ if y_pred_proba_vote is not None:
3828
+ confidence_intervals = cal_auc_ci(
3829
+ y_true, y_pred_proba_vote, verbose=False, is_binary=is_binary
3830
+ )
3831
+ roc_info = {
3832
+ "fpr": validation_scores_vote["fpr"],
3833
+ "tpr": validation_scores_vote["tpr"],
3834
+ "auc": validation_scores_vote["roc_auc_by_class"],
3835
+ "ci95": confidence_intervals,
3836
+ }
3837
+ precision_, recall_, avg_precision_ = cal_precision_recall(
3838
+ y_true, y_pred_proba_vote, is_binary=is_binary
3839
+ )
3840
+ pr_info = {
3841
+ "precision": precision_,
3842
+ "recall": recall_,
3843
+ "avg_precision": avg_precision_,
3844
+ }
3845
+ else:
3846
+ roc_info, pr_info = None, None
3847
+
3848
+ dict_pred_vote = {
3849
+ "best_clf": voting_model,
3850
+ "best_params": None,
3851
+ "scores": validation_scores_vote,
3852
+ "roc_curve": roc_info,
3853
+ "pr_curve": pr_info,
3854
+ "confusion_matrix": confusion_matrix(y_true, y_pred_vote),
3855
+ "predictions": y_pred_vote.tolist(),
3856
+ "predictions_proba": (
3857
+ y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
3858
+ ),
3859
+ }
3860
+ else:
3861
+ if y_true is None:
3862
+ validation_scores_vote = []
3863
+ else:
3864
+ validation_scores_vote = cal_metrics(
3865
+ y_true,
3866
+ y_pred,
3867
+ y_pred_proba=y_pred_proba_vote,
3868
+ is_binary=is_binary,
3869
+ purpose=purpose,
3870
+ average="weighted",
3871
+ )
3872
+ dict_pred_vote = {
3873
+ "best_clf": voting_model,
3874
+ "best_params": None,
3875
+ "scores": validation_scores_vote,
3876
+ "predictions": y_pred_vote.tolist(),
3877
+ "predictions_proba": (
3878
+ y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
3879
+ ),
3880
+ "y_train": y_train if y_train is not None else [],
3881
+ "y_true": y_true if y_true is not None else [],
3882
+ }
3883
+ df_vote = pd.DataFrame(
3884
+ [None] * len(df_results.columns), index=df_results.columns, columns=["vote"]).T
3885
+ for k, v in dict_pred_vote.items():
3886
+ if k in df_vote.columns:
3887
+ df_vote[k] = [v]
3888
+
3889
+ # if all([plot_, y_true is not None, purpose == "classification"]):
3890
+ # try:
3891
+ # plot_validate_features_single(df_vote, is_binary=is_binary)
3892
+ # if dir_save:
3893
+ # ips.figsave(dir_save + f"validate_features_vote_{now_}.pdf")
3894
+ # except Exception as e:
3895
+ # print(e)
3896
+ print("Done")
3897
+ if vote and stack:
3898
+ df_res=pd.concat([df_pred,df_vote, df_results],ignore_index=False,axis=0)
3899
+ elif vote:
3900
+ df_res=pd.concat([df_vote, df_results],ignore_index=False,axis=0)
3901
+ elif stack:
3902
+ df_res=pd.concat([df_pred,df_results],ignore_index=False,axis=0)
3903
+
3217
3904
  if all([plot_, y_true is not None, purpose == "classification"]):
3905
+ from datetime import datetime
3906
+
3907
+ now_ = datetime.now().strftime("%y%m%d_%H%M%S")
3218
3908
  # try:
3219
- if len(models) > 3:
3220
- plot_validate_features(df_results, is_binary=is_binary)
3909
+ if df_res.shape[0] > 3:
3910
+ plot_validate_features(df_res, is_binary=is_binary)
3221
3911
  else:
3222
- plot_validate_features_single(df_results, is_binary=is_binary)
3912
+ plot_validate_features_single(df_res, is_binary=is_binary)
3223
3913
  if dir_save:
3224
3914
  ips.figsave(dir_save + f"validate_features{now_}.pdf")
3225
- # except Exception as e:
3226
- # print(f"Error: 在画图的过程中出现了问题:{e}")
3227
- return df_results
3228
-
3915
+ # except Exception as e:
3916
+ # print(f"Error: 在画图的过程中出现了问题:{e}")
3917
+ return df_res
3229
3918
 
3230
3919
  def cal_metrics(
3231
3920
  y_true,
@@ -3367,7 +4056,7 @@ def cal_metrics(
3367
4056
 
3368
4057
 
3369
4058
  def plot_trees(
3370
- X, y, cls, max_trees=500, test_size=0.2, random_state=42, early_stopping_rounds=None
4059
+ X, y, cls:str='random', max_trees=500, test_size=0.2, random_state=42, early_stopping_rounds=None
3371
4060
  ):
3372
4061
  """
3373
4062
  # # Example usage:
@@ -3413,10 +4102,14 @@ def plot_trees(
3413
4102
  train_error_rate = []
3414
4103
  test_error_rate = []
3415
4104
  validation_error = None
3416
-
4105
+ if isinstance(cls, str):
4106
+ cls=ips.strcmp(cls, ["RandomForestClassifier","ExtraTreesClassifier","AdaBoostClassifier","GradientBoostingClassifier"])
3417
4107
  # Configure classifier based on type
3418
4108
  oob_enabled = False # Default to no OOB error unless explicitly set
3419
-
4109
+ clf_support = {"RandomForestClassifier":RandomForestClassifier(),
4110
+ "ExtraTreesClassifier":ExtraTreesClassifier(),
4111
+ "AdaBoostClassifier":AdaBoostClassifier(),
4112
+ "GradientBoostingClassifier":GradientBoostingClassifier()}
3420
4113
  if isinstance(cls, (RandomForestClassifier, ExtraTreesClassifier)):
3421
4114
  # Enable OOB if cls supports it and is using bootstrapping
3422
4115
  cls.set_params(warm_start=True, n_estimators=1)
@@ -3678,7 +4371,7 @@ def img_datasets_preprocessing(
3678
4371
 
3679
4372
 
3680
4373
  def backward_regression(
3681
- X: pd.DataFrame, y: pd.Series, initial_list=[], threshold_out=0.05, verbose=True
4374
+ X: pd.DataFrame, y: pd.Series, initial_list=[], thr=0.05, verbose=True
3682
4375
  ):
3683
4376
  """
3684
4377
  # awesome bit of code from https://www.kaggle.com/code/adibouayjan/house-price-step-by-step-modeling
@@ -3690,31 +4383,46 @@ def backward_regression(
3690
4383
  X -- features values
3691
4384
  y -- target variable
3692
4385
  initial_list -- features header
3693
- threshold_out -- pvalue threshold of features to drop
4386
+ thr -- pvalue threshold of features to drop
3694
4387
  verbose -- true to produce lots of logging output
3695
4388
 
3696
4389
  Returns:
3697
4390
  list of selected features for modeling
3698
4391
  """
3699
4392
  import statsmodels.api as sm
3700
-
3701
- if isinstance(y, str) and y in X.columns:
3702
- y_col_name = y
3703
- y = X[y]
3704
- X = X.drop(y_col_name, axis=1)
4393
+ if isinstance(y, str):
4394
+ if y in X.columns:
4395
+ y_col_name = y
4396
+ y = X[y]
4397
+ X = X.drop(y_col_name, axis=1)
4398
+ else:
4399
+ raise ValueError(f"找不到{y},y设置有误")
4400
+ X = X.select_dtypes(include=[np.number])
4401
+
3705
4402
  included = list(X.columns)
4403
+ try:
4404
+ X=X.astype(float)
4405
+ y=y.astype(float)
4406
+ except Exception as e:
4407
+ raise ValueError(f"无法把数据类型转换成float类型,因而无法进一步进行统计分析: {e}")
4408
+
4409
+
3706
4410
  while True:
3707
4411
  changed = False
4412
+ if not included:
4413
+ print("No features remain in the model.")
4414
+ break
4415
+
3708
4416
  model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
3709
4417
  # exclude the intercept for p-value checking
3710
4418
  pvalues = model.pvalues.iloc[1:]
3711
4419
  worst_pval = pvalues.max()
3712
- if worst_pval > threshold_out:
4420
+ if worst_pval > thr:
3713
4421
  changed = True
3714
4422
  worst_feature = pvalues.idxmax()
3715
4423
  included.remove(worst_feature)
3716
4424
  if verbose:
3717
- print(f"Removing feature '{worst_feature}' with p-value {worst_pval}")
4425
+ print(f"Removing '{worst_feature}' with p-value={round(worst_pval,2)}")
3718
4426
  if not changed:
3719
4427
  break
3720
4428
  print(f"\nSelected Features:\n{included}")