py2ls 0.2.4.22__py3-none-any.whl → 0.2.4.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/ml2ls.py CHANGED
@@ -5,7 +5,6 @@ from sklearn.ensemble import (
5
5
  BaggingClassifier,
6
6
  )
7
7
  from sklearn.svm import SVC, SVR
8
- from sklearn.calibration import CalibratedClassifierCV
9
8
  from sklearn.model_selection import GridSearchCV, StratifiedKFold
10
9
  from sklearn.linear_model import (
11
10
  LassoCV,
@@ -16,12 +15,7 @@ from sklearn.linear_model import (
16
15
  RidgeClassifierCV,
17
16
  ElasticNet,
18
17
  )
19
- from sklearn.feature_selection import RFE
20
- from sklearn.naive_bayes import GaussianNB
21
- from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
22
- import xgboost as xgb # Make sure you have xgboost installed
23
-
24
- from sklearn.model_selection import train_test_split, cross_val_score
18
+
25
19
  from sklearn.metrics import (
26
20
  accuracy_score,
27
21
  precision_score,
@@ -36,18 +30,12 @@ from sklearn.metrics import (
36
30
  precision_recall_curve,
37
31
  average_precision_score,
38
32
  )
39
- from imblearn.over_sampling import SMOTE
40
- from sklearn.pipeline import Pipeline
41
- from collections import defaultdict
42
- from sklearn.preprocessing import StandardScaler, OneHotEncoder
43
33
  from typing import Dict, Any, Optional, List, Union
44
34
  import numpy as np
45
35
  import pandas as pd
46
36
  from . import ips
47
37
  from . import plot
48
38
  import matplotlib.pyplot as plt
49
- import seaborn as sns
50
-
51
39
  plt.style.use(str(ips.get_cwd()) + "/data/styles/stylelib/paper.mplstyle")
52
40
  import logging
53
41
  import warnings
@@ -314,6 +302,8 @@ def features_svm(
314
302
  - Use case: It’s not as widely used as the RBF or linear kernel but can be explored when there is some evidence of non-linear
315
303
  S-shaped relationships.
316
304
  """
305
+ from sklearn.feature_selection import RFE
306
+ from sklearn.svm import SVC
317
307
  # SVM (Support Vector Machines)
318
308
  svc = SVC(kernel=rfe_params["kernel"]) # ["linear", "rbf", "poly", "sigmoid"]
319
309
  # RFE(Recursive Feature Elimination)
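For reference, a minimal self-contained sketch of the RFE-with-SVC pattern used above (toy data; the linear kernel is used here because RFE needs an estimator exposing coef_ or feature_importances_, and n_features_to_select is illustrative):

    from sklearn.datasets import make_classification
    from sklearn.feature_selection import RFE
    from sklearn.svm import SVC

    X, y = make_classification(n_samples=200, n_features=10, random_state=0)
    svc = SVC(kernel="linear")                     # linear kernel exposes coef_ for RFE
    rfe = RFE(estimator=svc, n_features_to_select=5)
    rfe.fit(X, y)
    print(rfe.support_)                            # boolean mask of retained features
    print(rfe.ranking_)                            # rank 1 marks the selected features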
@@ -450,6 +440,7 @@ def validate_classifier(
450
440
  Returns:
451
441
  - results: Dictionary containing average cv_train_scores and cv_test_scores.
452
442
  """
443
+ from sklearn.model_selection import cross_val_score
453
444
  cv_train_scores = {metric: [] for metric in metrics}
454
445
  skf = StratifiedKFold(n_splits=cv_folds)
455
446
  # Perform cross-validation
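For reference, the cross_val_score / StratifiedKFold combination used here reduces to the following minimal sketch (toy data; the estimator and scoring string are illustrative):

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import StratifiedKFold, cross_val_score

    X, y = make_classification(n_samples=200, n_features=8, random_state=0)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    clf = LogisticRegression(max_iter=1000)
    scores = cross_val_score(clf, X, y, cv=skf, scoring="roc_auc")  # one score per fold
    print(scores.mean(), scores.std())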
@@ -982,6 +973,8 @@ def validate_features(
982
973
 
983
974
  """
984
975
  from tqdm import tqdm
976
+ from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
977
+ from sklearn.calibration import CalibratedClassifierCV
985
978
 
986
979
  # Ensure common features are selected
987
980
  common_features = ips.shared(
@@ -1001,6 +994,7 @@ def validate_features(
1001
994
 
1002
995
  # Handle class imbalance using SMOTE
1003
996
  if smote:
997
+ from imblearn.over_sampling import SMOTE
1004
998
  if (
1005
999
  y_train.value_counts(normalize=True).max() < 0.8
1006
1000
  ): # Threshold to decide if data is imbalanced
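For reference, a standalone sketch of the SMOTE step (the 0.8 majority-class frequency check above decides whether it runs; the data here are synthetic):

    from collections import Counter
    import pandas as pd
    from imblearn.over_sampling import SMOTE
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=300, weights=[0.9, 0.1], random_state=0)
    y = pd.Series(y)
    print(y.value_counts(normalize=True).max())    # majority-class frequency, ~0.9 here
    X_res, y_res = SMOTE(random_state=1).fit_resample(X, y)
    print(Counter(y_res))                          # minority class oversampled to parity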
@@ -2096,7 +2090,116 @@ def rank_models(
2096
2090
  # )
2097
2091
 
2098
2092
  # figsave("classifier_performance.pdf")
2093
+ def rank_models_reg(df, ascending=False):
2094
+ """
2095
+ Sorts models based on MSE, RMSE, MAE, and R² with custom priority logic.
2099
2096
 
2097
+ Parameters:
2098
+ df (pd.DataFrame): DataFrame containing the regression metrics.
2099
+ ascending (bool): Whether to sort in ascending order of ranking score.
2100
+
2101
+ Returns:
2102
+ pd.DataFrame: Sorted DataFrame with an added "Ranking_Score" column.
2103
+ """
2104
+ # Define weights for the 4 metrics
2105
+ weights = {
2106
+ "mse": -1, # Lower is better
2107
+ "rmse": -1, # Lower is better
2108
+ "mae": -1, # Lower is better
2109
+ "r2": 1, # Higher is better
2110
+ }
2111
+
2112
+ # Normalize the selected metrics
2113
+ df = df.copy() # Work on a copy of the DataFrame
2114
+ for metric, weight in weights.items():
2115
+ if metric in df.columns:
2116
+ if weight > 0: # Higher is better; normalize 0-1
2117
+ df[metric + "_normalized"] = (df[metric] - df[metric].min()) / (
2118
+ df[metric].max() - df[metric].min()
2119
+ )
2120
+ else: # Lower is better; reverse normalize 0-1
2121
+ df[metric + "_normalized"] = (df[metric].max() - df[metric]) / (
2122
+ df[metric].max() - df[metric].min()
2123
+ )
2124
+
2125
+ # Calculate ranking score as a weighted sum
2126
+ df["Ranking_Score"] = sum(
2127
+ df[metric + "_normalized"] * abs(weights[metric])
2128
+ for metric in weights.keys()
2129
+ if metric + "_normalized" in df.columns
2130
+ )
2131
+
2132
+ # Sort models based on the ranking score
2133
+ sorted_df = df.sort_values(by="Ranking_Score", ascending=ascending)
2134
+ return sorted_df
2135
+
2136
+ models_support = {
2137
+ "classification": {
2138
+ "Random Forest": "Tree-Based",
2139
+ "SVM": "Kernel-Based",
2140
+ "Logistic Regression": "Linear",
2141
+ "Lasso Logistic Regression": "Linear",
2142
+ "Gradient Boosting": "Tree-Based",
2143
+ "XGBoost": "Tree-Based",
2144
+ "KNN": "Instance-Based",
2145
+ "Naive Bayes": "Probabilistic",
2146
+ "Linear Discriminant Analysis": "Linear",
2147
+ "AdaBoost": "Tree-Based",
2148
+ "CatBoost": "Tree-Based",
2149
+ "Extra Trees": "Tree-Based",
2150
+ "Bagging": "Tree-Based",
2151
+ "Neural Network": "Neural Network",
2152
+ "DecisionTree": "Tree-Based",
2153
+ "Quadratic Discriminant Analysis": "Probabilistic",
2154
+ "Ridge": "Linear",
2155
+ "Perceptron": "Linear",
2156
+ "Bernoulli Naive Bayes": "Probabilistic",
2157
+ "SGDClassifier": "Linear",
2158
+ },
2159
+ "regression": {
2160
+ "Linear Regression": "Linear",
2161
+ "Ridge": "Linear",
2162
+ "RidgeCV": "Linear",
2163
+ "TheilSenRegressor": "Linear",
2164
+ "HuberRegressor": "Linear",
2165
+ "PoissonRegressor": "Linear",
2166
+ "LassoCV": "Linear",
2167
+ "Bagging": "Tree-Based",
2168
+ "ElasticNet": "Linear",
2169
+ "Random Forest": "Tree-Based",
2170
+ "Gradient Boosting": "Tree-Based",
2171
+ "XGBoost": "Tree-Based",
2172
+ "CatBoost": "Tree-Based",
2173
+ "Extra Trees": "Tree-Based",
2174
+ "SVM": "Kernel-Based",
2175
+ "KNN": "Instance-Based",
2176
+ "Neural Network": "Neural Network",
2177
+ "AdaBoost": "Linear",
2178
+ },
2179
+ }
2180
+ def select_top_models(models, categories, n_top_models, n_models_per_category=1):
2181
+ """
2182
+ models = list_sort
2183
+ purpose = "regression"
2184
+ categories = models_support[purpose]
2185
+ n_top_models = 3
2186
+ select_top_models(models, categories, n_top_models)
2187
+ """
2188
+ selected = {}
2189
+ result = []
2190
+ for model in models:
2191
+ category = categories.get(model, "Unknown")
2192
+ if category not in selected:
2193
+ selected[category] = 0 # Initialize counter for the category
2194
+
2195
+ if selected[category] < n_models_per_category: # Allow additional models up to the limit
2196
+ selected[category] += 1
2197
+ result.append(model)
2198
+
2199
+ if len(result) == n_top_models: # Stop when the desired number of models is reached
2200
+ break
2201
+
2202
+ return result
2100
2203
 
2101
2204
  def predict(
2102
2205
  x_train: pd.DataFrame,
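For reference, a small sketch of how the two helpers added above fit together. rank_models_reg and select_top_models are defined in ml2ls.py in this version and are shown here as if in scope; the metric values are made up:

    import pandas as pd

    df_metrics = pd.DataFrame(
        {"mse": [4.0, 2.5, 3.0],
         "rmse": [2.0, 1.58, 1.73],
         "mae": [1.6, 1.2, 1.4],
         "r2": [0.70, 0.82, 0.78]},
        index=["Linear Regression", "Random Forest", "XGBoost"],
    )
    ranked = rank_models_reg(df_metrics, ascending=False)
    print(ranked["Ranking_Score"])     # best model (highest weighted score) first

    top = select_top_models(models=list(ranked.index),
                            categories=models_support["regression"],
                            n_top_models=2,
                            n_models_per_category=1)
    print(top)                         # e.g. ['Random Forest', 'Linear Regression']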
@@ -2104,11 +2207,17 @@ def predict(
2104
2207
  x_true: pd.DataFrame = None,
2105
2208
  y_true: Optional[pd.Series] = None,
2106
2209
  backward: bool = False, # backward_regression
2210
+ backward_thr:float = 0.05,# p-value threshold; only used when backward is True
2107
2211
  common_features: set = None,
2108
2212
  purpose: str = "classification", # 'classification' or 'regression'
2109
2213
  cls: Optional[Dict[str, Any]] = None,
2110
2214
  metrics: Optional[List[str]] = None,
2111
- random_state: int = 1,
2215
+ stack:bool=True,# run stacking
2216
+ stacking_cv:bool=False,# cross-validate the stacking final estimator; default False keeps it simple
2217
+ vote:bool=True,# run voting
2218
+ voting:str="hard", # voting scheme ("hard"/"soft"); only used for classification
2219
+ n_top_models:int=5, #for stacking models
2220
+ n_models_per_category:int=1, #for stacking models; e.g. 2 allows two models from the same category
2112
2221
  smote: bool = False,
2113
2222
  n_jobs: int = -1,
2114
2223
  plot_: bool = True,
@@ -2117,6 +2226,7 @@ def predict(
2117
2226
  cv_folds: int = 5, # more cv_folds gives more stable results, but AUC may be lower
2118
2227
  cv_level: str = "l", # "s":'low',"m":'medium',"l":"high"
2119
2228
  class_weight: str = "balanced",
2229
+ random_state: int = 1,
2120
2230
  verbose: bool = False,
2121
2231
  ) -> pd.DataFrame:
2122
2232
  """
@@ -2184,10 +2294,17 @@ def predict(
2184
2294
  RidgeClassifierCV,
2185
2295
  Perceptron,
2186
2296
  SGDClassifier,
2297
+ RidgeCV,
2298
+ Ridge,
2299
+ TheilSenRegressor,
2300
+ HuberRegressor,
2301
+ PoissonRegressor,
2302
+
2187
2303
  )
2304
+ from sklearn.compose import TransformedTargetRegressor
2188
2305
  from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
2189
2306
  from sklearn.naive_bayes import GaussianNB, BernoulliNB
2190
- from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
2307
+ from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor,StackingClassifier,StackingRegressor
2191
2308
  import xgboost as xgb
2192
2309
  import lightgbm as lgb
2193
2310
  import catboost as cb
@@ -2198,6 +2315,7 @@ def predict(
2198
2315
  QuadraticDiscriminantAnalysis,
2199
2316
  )
2200
2317
  from sklearn.preprocessing import PolynomialFeatures
2318
+ from sklearn.model_selection import train_test_split
2201
2319
 
2202
2320
  # spelling check (fuzzy-match the purpose string)
2203
2321
  purpose = ips.strcmp(purpose, ["classification", "regression"])[0]
@@ -2261,7 +2379,6 @@ def predict(
2261
2379
  "Gradient Boosting": GradientBoostingRegressor(random_state=random_state),
2262
2380
  "XGBoost": xgb.XGBRegressor(eval_metric="rmse", random_state=random_state),
2263
2381
  "Linear Regression": LinearRegression(),
2264
- "Lasso": Lasso(random_state=random_state),
2265
2382
  "AdaBoost": AdaBoostRegressor(random_state=random_state),
2266
2383
  # "LightGBM": lgb.LGBMRegressor(random_state=random_state),
2267
2384
  "CatBoost": cb.CatBoostRegressor(verbose=0, random_state=random_state),
@@ -2271,10 +2388,10 @@ def predict(
2271
2388
  "ElasticNet": ElasticNet(random_state=random_state),
2272
2389
  "Ridge": Ridge(),
2273
2390
  "KNN": KNeighborsRegressor(),
2391
+ "TheilSen":TheilSenRegressor(),
2392
+ "Huber":HuberRegressor(),
2393
+ "Poisson":PoissonRegressor()
2274
2394
  }
2275
- # indicate cls:
2276
- if ips.run_once_within(30): # 10 min
2277
- print(f"supported models: {list(model_.keys())}")
2278
2395
  if cls is None:
2279
2396
  models = model_
2280
2397
  else:
@@ -2290,6 +2407,10 @@ def predict(
2290
2407
  ips.df_special_characters_cleaner(x_true) if x_true is not None else None
2291
2408
  )
2292
2409
 
2410
+ # indicate cls:
2411
+ if ips.run_once_within(30): # 10 min
2412
+ print(f"processing: {list(models.keys())}")
2413
+
2293
2414
  if isinstance(y_train, str) and y_train in x_train.columns:
2294
2415
  y_train_col_name = y_train
2295
2416
  y_train = x_train[y_train]
@@ -2311,7 +2432,7 @@ def predict(
2311
2432
 
2312
2433
  # Perform backward feature selection
2313
2434
  if backward:
2314
- selected_features = backward_regression(x_train, y_train, threshold_out=0.05)
2435
+ selected_features = backward_regression(x_train, y_train, thr=backward_thr)
2315
2436
  x_train = x_train[selected_features]
2316
2437
 
2317
2438
  if x_true is None:
@@ -2391,10 +2512,22 @@ def predict(
2391
2512
  if isinstance(y_train, np.ndarray):
2392
2513
  y_train = ips.df_encoder(data=pd.DataFrame(y_train), method="label")
2393
2514
  y_train = np.asarray(y_train)
2394
- if isinstance(y_train, np.ndarray):
2395
- y_true = ips.df_encoder(data=pd.DataFrame(y_true), method="label")
2396
- y_true = np.asarray(y_true)
2515
+ if y_true is not None:
2516
+ if isinstance(y_train, np.ndarray):
2517
+ y_true = ips.df_encoder(data=pd.DataFrame(y_true), method="label")
2518
+ y_true = np.asarray(y_true)
2397
2519
  # Hyperparameter grids for tuning
2520
+ param_grid_common_xgb = {
2521
+ 'learning_rate': [0.01, 0.1, 0.2, 0.3],
2522
+ 'max_depth': [3, 5, 7, 10],
2523
+ 'n_estimators': [50, 100, 200, 300],
2524
+ 'subsample': [0.6, 0.8, 1.0],
2525
+ 'colsample_bytree': [0.6, 0.8, 1.0],
2526
+ 'gamma': [0, 0.1, 0.2, 0.5],
2527
+ 'min_child_weight': [1, 5, 10],
2528
+ 'reg_alpha': [0, 0.1, 0.5, 1], # L1 regularization term
2529
+ 'reg_lambda': [1, 1.5, 2], # L2 regularization term
2530
+ }
2398
2531
  if cv_level in ["low", "simple", "s", "l"]:
2399
2532
  param_grids = {
2400
2533
  "Random Forest": (
@@ -2416,8 +2549,8 @@ def predict(
2416
2549
  }
2417
2550
  ),
2418
2551
  "SVM": {
2419
- "C": [1],
2420
- "gamma": ["scale"],
2552
+ "C": [0.1, 1, 10],
2553
+ "gamma": ["scale", 0.1, 1],
2421
2554
  "kernel": ["rbf"],
2422
2555
  },
2423
2556
  "Lasso": {
@@ -2439,12 +2572,17 @@ def predict(
2439
2572
  "min_samples_split": [2],
2440
2573
  "subsample": [0.8],
2441
2574
  },
2442
- "XGBoost": {
2443
- "n_estimators": [100],
2444
- "max_depth": [3],
2445
- "learning_rate": [0.1],
2446
- "subsample": [0.8],
2447
- "colsample_bytree": [0.8],
2575
+ "XGBoost":{
2576
+ 'learning_rate': [0.01],
2577
+ 'max_depth': [3],
2578
+ 'n_estimators': [50],
2579
+ 'subsample': [0.6],
2580
+ 'colsample_bytree': [0.6],
2581
+ 'gamma': [0, 0.1],
2582
+ 'min_child_weight': [1],
2583
+ 'reg_alpha': [0, 0.1],
2584
+ 'reg_lambda': [1],
2585
+ 'objective': ['binary:logistic'] if purpose == "classification" else ['reg:squarederror']
2448
2586
  },
2449
2587
  "KNN": (
2450
2588
  {
@@ -2551,6 +2689,14 @@ def predict(
2551
2689
  "random_state": [random_state],
2552
2690
  "learning_rate": ["constant"],
2553
2691
  },
2692
+ "TheilSen":{'max_iter': [100],
2693
+ 'tol': [1e-4],
2694
+ 'n_subsamples': [100+x_train.shape[1]]},
2695
+ "Huber":{'epsilon': [1.35],
2696
+ 'alpha': [0.1],
2697
+ 'max_iter': [100],},
2698
+ "Poisson":{'alpha': [0.1],
2699
+ 'max_iter': [100],}
2554
2700
  }
2555
2701
  elif cv_level in ["high", "advanced", "h"]:
2556
2702
  param_grids = {
@@ -2612,12 +2758,30 @@ def predict(
2612
2758
  "subsample": [0.8, 1.0],
2613
2759
  },
2614
2760
  "XGBoost": {
2615
- "n_estimators": [100, 200, 500, 700],
2616
- "max_depth": [3, 5, 7, 10],
2617
- "learning_rate": [0.01, 0.1, 0.2, 0.3],
2618
- "subsample": [0.8, 1.0],
2619
- "colsample_bytree": [0.8, 0.9, 1.0],
2620
- },
2761
+ 'learning_rate': [0.01, 0.1, 0.2, 0.3],
2762
+ 'max_depth': [3, 5, 7, 10],
2763
+ 'n_estimators': [50, 100, 200, 300],
2764
+ 'subsample': [0.6, 0.8, 1.0],
2765
+ 'gamma': [0, 0.1, 0.2, 0.5],
2766
+ 'min_child_weight': [1, 5, 10],
2767
+ 'reg_alpha': [0, 0.1, 0.5, 1],
2768
+ 'reg_lambda': [1, 1.5, 2],
2769
+ **{
2770
+ 'objective': ['binary:logistic', 'multi:softmax', 'multi:softprob'],
2771
+ }} if purpose== "classification"
2772
+ else{
2773
+ 'learning_rate': [0.01, 0.1, 0.2, 0.3],
2774
+ 'max_depth': [3, 5, 7, 10],
2775
+ 'n_estimators': [50, 100, 200, 300],
2776
+ 'subsample': [0.6, 0.8, 1.0],
2777
+ 'colsample_bytree': [0.6, 0.8, 1.0],
2778
+ 'gamma': [0, 0.1, 0.2, 0.5],
2779
+ 'min_child_weight': [1, 5, 10],
2780
+ 'reg_alpha': [0, 0.1, 0.5, 1],
2781
+ 'reg_lambda': [1, 1.5, 2],
2782
+ **{
2783
+ 'objective': ['reg:squarederror', 'reg:squaredlogerror', 'reg:gamma'],
2784
+ }},
2621
2785
  "KNN": (
2622
2786
  {
2623
2787
  "n_neighbors": [1, 3, 5, 10, 15, 20],
@@ -2730,6 +2894,14 @@ def predict(
2730
2894
  ], # If True, the regressors X will be normalized
2731
2895
  }
2732
2896
  ),
2897
+ "TheilSen":{'max_iter': [100, 200, 300],
2898
+ 'tol': [1e-4, 1e-3, 1e-2],
2899
+ 'n_subsamples': [100+x_train.shape[1], 200+x_train.shape[1], 300+x_train.shape[1]]},
2900
+ "Huber":{'epsilon': [1.35, 1.5, 2.0],
2901
+ 'alpha': [0.1, 1.0, 10.0],
2902
+ 'max_iter': [100, 200, 300],},
2903
+ "Poisson":{'alpha': [0.1, 1.0, 10.0],
2904
+ 'max_iter': [100, 200, 300],}
2733
2905
  }
2734
2906
  else: # median level
2735
2907
  param_grids = {
@@ -2789,12 +2961,30 @@ def predict(
2789
2961
  "subsample": [0.8, 1.0],
2790
2962
  },
2791
2963
  "XGBoost": {
2792
- "n_estimators": [100, 200, 500],
2793
- "max_depth": [3, 5, 7],
2794
- "learning_rate": [0.01, 0.1, 0.2],
2795
- "subsample": [0.8, 1.0],
2796
- "colsample_bytree": [0.8, 1.0],
2797
- },
2964
+ 'learning_rate': [0.01, 0.1],
2965
+ 'max_depth': [3, 5],
2966
+ 'n_estimators': [50, 100],
2967
+ 'subsample': [0.6, 0.8],
2968
+ 'gamma': [0, 0.1],
2969
+ 'min_child_weight': [1, 5],
2970
+ 'reg_alpha': [0, 0.1],
2971
+ 'reg_lambda': [1,],
2972
+ **{
2973
+ 'objective': ['binary:logistic', 'multi:softmax'],
2974
+ }} if purpose== "classification"
2975
+ else{
2976
+ 'learning_rate': [0.01, 0.1],
2977
+ 'max_depth': [3, 5,],
2978
+ 'n_estimators': [50, 100],
2979
+ 'subsample': [0.6, 0.8],
2980
+ 'colsample_bytree': [0.6, 0.8],
2981
+ 'gamma': [0, 0.1],
2982
+ 'min_child_weight': [1, 5],
2983
+ 'reg_alpha': [0, 0.1],
2984
+ 'reg_lambda': [1, 1.5],
2985
+ **{
2986
+ 'objective': ['reg:squarederror', 'reg:squaredlogerror'],
2987
+ }},
2798
2988
  "KNN": (
2799
2989
  {
2800
2990
  "n_neighbors": [3, 5, 7, 10],
@@ -2951,6 +3141,14 @@ def predict(
2951
3141
  ], # Solver for optimization
2952
3142
  }
2953
3143
  ),
3144
+ "TheilSen":{'max_iter': [100, 200],
3145
+ 'tol': [1e-4, 1e-3],
3146
+ 'n_subsamples': [100+x_train.shape[1], 200+x_train.shape[1]]},
3147
+ "Huber":{'epsilon': [1.35, 1.5],
3148
+ 'alpha': [0.1, 1.0],
3149
+ 'max_iter': [100, 200],},
3150
+ "Poisson":{'alpha': [0.1, 1.0],
3151
+ 'max_iter': [100, 200],}
2954
3152
  }
2955
3153
 
2956
3154
  results = {}
@@ -3191,12 +3389,18 @@ def predict(
3191
3389
  # Convert results to DataFrame
3192
3390
  df_results = pd.DataFrame.from_dict(results, orient="index")
3193
3391
  # sort
3194
- if y_true is not None and purpose == "classification":
3195
- df_scores = pd.DataFrame(
3196
- df_results["scores"].tolist(), index=df_results["scores"].index
3197
- ).sort_values(by="roc_auc", ascending=False)
3392
+ if y_true is not None:
3393
+ if purpose == "classification":
3394
+ df_scores = pd.DataFrame(
3395
+ df_results["scores"].tolist(), index=df_results["scores"].index
3396
+ ).sort_values(by="roc_auc", ascending=False)
3397
+ elif purpose=='regression':
3398
+ df_scores = rank_models_reg(
3399
+ pd.DataFrame(df_results["scores"].tolist(), index=df_results["scores"].index),
3400
+ ascending=False)
3198
3401
  df_results = df_results.loc[df_scores.index]
3199
3402
 
3403
+ if y_true is not None and purpose == "classification":
3200
3404
  if plot_:
3201
3405
  from datetime import datetime
3202
3406
 
@@ -3214,18 +3418,503 @@ def predict(
3214
3418
  plot.figsets(xangle=30)
3215
3419
  if dir_save:
3216
3420
  ips.figsave(dir_save + f"scores_clus{now_}.pdf")
3421
+ # if all([plot_, y_true is not None, purpose == "classification"]):
3422
+ # # try:
3423
+ # if len(models) > 3:
3424
+ # plot_validate_features(df_results, is_binary=is_binary)
3425
+ # else:
3426
+ # plot_validate_features_single(df_results, is_binary=is_binary)
3427
+ # if dir_save:
3428
+ # ips.figsave(dir_save + f"validate_features{now_}.pdf")
3429
+ # # except Exception as e:
3430
+ # # print(f"Error: 在画图的过程中出现了问题:{e}")
3431
+ if stack:
3432
+ #! stacking classifier/regressor
3433
+ from sklearn.metrics import make_scorer, accuracy_score
3434
+ from sklearn.model_selection import cross_val_score
3435
+
3436
+ #* cap n_top_models so it does not exceed the number of ranked models
3437
+ n_top_models = min(n_top_models, df_results.shape[0])
3438
+
3439
+ #* select the top-n estimators
3440
+ models_selecte = select_top_models(models=list(df_results.index),
3441
+ categories=models_support[purpose],
3442
+ n_top_models=n_top_models,
3443
+ n_models_per_category=n_models_per_category)
3444
+ top_models = df_results.loc[models_selecte]["best_clf"]
3445
+ base_estimators = []
3446
+ for i, j in top_models.to_dict().items():
3447
+ base_estimators.append((i, j))
3448
+ if stacking_cv:
3449
+ print(f" ⤵ stacking_cv is processing...")
3450
+ #* define a few representative candidates for the final_estimator
3451
+ # a few candidates
3452
+ if purpose == "classification":
3453
+ kadt_estimators=["XGBoost","SVM","Logistic Regression","Neural Network"]
3454
+ else:
3455
+ kadt_estimators=["XGBoost","LassoCV"]
3456
+ final_estimators={}
3457
+ for name in kadt_estimators:
3458
+ param_grid=param_grids.get(name, {})
3459
+ print(param_grid)
3460
+ if is_binary:
3461
+ gs = GridSearchCV(
3462
+ model_[name],
3463
+ param_grid=param_grid,
3464
+ scoring=(
3465
+ "roc_auc"
3466
+ if purpose == "classification"
3467
+ else "neg_mean_squared_error"
3468
+ ),
3469
+ cv=cv,
3470
+ n_jobs=n_jobs,
3471
+ verbose=verbose,
3472
+ )
3473
+ else:
3474
+ gs = GridSearchCV(
3475
+ model_[name],
3476
+ param_grid=param_grid,
3477
+ scoring=(
3478
+ "roc_auc_ovr"
3479
+ if purpose == "classification"
3480
+ else "neg_mean_squared_error"
3481
+ ),
3482
+ cv=cv,
3483
+ n_jobs=n_jobs,
3484
+ verbose=verbose,
3485
+ )
3486
+ # Fit GridSearchCV
3487
+ gs.fit(x_train, y_train)
3488
+ final_estimators[name]=gs.best_estimator_
3489
+
3490
+ #* Set up cross-validation and performance evaluation
3491
+ scorer = make_scorer(accuracy_score)
3492
+ cv_results = []
3493
+
3494
+ #*Cross-validate stacking models with different final estimators
3495
+ for final_name, final_estimator in final_estimators.items():
3496
+ print(f"Evaluating Stacking Classifier with {final_name} as final estimator...")
3497
+ if purpose == "classification":
3498
+ stacking_model = StackingClassifier(estimators=base_estimators, final_estimator=final_estimator,cv=cv)
3499
+ else:
3500
+ stacking_model = StackingRegressor(estimators=base_estimators, final_estimator=final_estimator, cv=cv)
3501
+
3502
+ scores = cross_val_score(stacking_model, x_train, y_train, cv=StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_state), scoring=scorer)
3503
+
3504
+ # Store the result
3505
+ cv_results.append({
3506
+ 'final_estimator':final_estimator,
3507
+ 'Final Estimator': final_name,
3508
+ 'Mean Accuracy': np.mean(scores),
3509
+ 'Standard Deviation': np.std(scores)
3510
+ })
3511
+
3512
+ #* Convert the results into a DataFrame for easy comparison
3513
+ cv_results_df = pd.DataFrame(cv_results)
3514
+
3515
+ #* Sort and display the best model
3516
+ cv_results_df = cv_results_df.sort_values(by='Mean Accuracy', ascending=False)
3517
+
3518
+
3519
+ # Optionally: Select the final estimator that gives the best performance
3520
+ best_final_estimator = cv_results_df.iloc[0]['final_estimator']
3521
+ print(f"Best final estimator based on cross-validation: {best_final_estimator}")
3522
+ else:
3523
+ print(f" ⤵ trying to find the best_final_estimator for stacking...")
3524
+ if purpose=="classification":
3525
+ best_final_estimator = LogisticRegression(class_weight=class_weight,
3526
+ random_state=random_state,
3527
+ max_iter=1000)
3528
+ else:
3529
+ best_final_estimator = RidgeCV(cv=5)
3530
+ print(f"⤵ the best best_final_estimator: {best_final_estimator}")
3531
+ #! apply stacking
3532
+ if purpose == "classification":
3533
+ print(f" ⤵ StackingClassifier...")
3534
+ stacking_model = StackingClassifier(estimators=base_estimators,
3535
+ final_estimator=best_final_estimator,
3536
+ cv=cv)
3537
+ else:
3538
+ print(f" ⤵ StackingRegressor...")
3539
+ stacking_model = StackingRegressor(estimators=base_estimators,
3540
+ final_estimator=best_final_estimator,
3541
+ cv=cv)
3542
+
3543
+ # Train the Stacking Classifier
3544
+ print(f" ⤵ fit & predict...")
3545
+ stacking_model.fit(x_train, y_train)
3546
+ y_pred_final = stacking_model.predict(x_true)
3547
+ print(f" ⤵ collecting results...")
3548
+ # pred_proba
3549
+ if is_binary:
3550
+ if hasattr(stacking_model, "predict_proba"):
3551
+ y_pred_proba_final = stacking_model.predict_proba(x_true)
3552
+ print("Shape of predicted probabilities:", y_pred_proba_final.shape)
3553
+ if y_pred_proba_final.shape[1] == 1:
3554
+ y_pred_proba_final = np.hstack(
3555
+ [1 - y_pred_proba_final, y_pred_proba_final]
3556
+ ) # Add missing class probabilities
3557
+ y_pred_proba_final = y_pred_proba_final[:, 1]
3558
+ elif hasattr(stacking_model, "decision_function"):
3559
+ # If predict_proba is not available, use decision_function (e.g., for SVM)
3560
+ y_pred_proba_final = stacking_model.decision_function(x_true)
3561
+ # Ensure y_pred_proba_final is within 0 and 1 bounds
3562
+ y_pred_proba_final = (y_pred_proba_final - y_pred_proba_final.min()) / (
3563
+ y_pred_proba_final.max() - y_pred_proba_final.min()
3564
+ )
3565
+ else:
3566
+ y_pred_proba_final = None # No probability output for certain models
3567
+ if not is_binary:
3568
+ # Handle prediction probabilities for multiclass
3569
+ if hasattr(stacking_model, "predict_proba"):
3570
+ y_pred_proba_final = stacking_model.predict_proba(x_true)
3571
+ elif hasattr(stacking_model, "decision_function"):
3572
+ y_pred_proba_final = stacking_model.decision_function(x_true)
3573
+
3574
+ # Normalize for multiclass if necessary
3575
+ if y_pred_proba_final.ndim == 2:
3576
+ y_pred_proba_final = (
3577
+ y_pred_proba_final - y_pred_proba_final.min(axis=1, keepdims=True)
3578
+ ) / (
3579
+ y_pred_proba_final.max(axis=1, keepdims=True)
3580
+ - y_pred_proba_final.min(axis=1, keepdims=True)
3581
+ )
3582
+ else:
3583
+ y_pred_proba_final = None # No probability output for certain models
3584
+ #! dict_pred_stack
3585
+ dict_pred_stack={}
3586
+ validation_scores_final = {}
3587
+ if y_true is not None and y_pred_proba_final is not None:
3588
+ validation_scores_final = cal_metrics(
3589
+ y_true,
3590
+ y_pred_final,
3591
+ y_pred_proba=y_pred_proba_final,
3592
+ is_binary=is_binary,
3593
+ purpose=purpose,
3594
+ average="weighted",
3595
+ )
3596
+ if is_binary:
3597
+ # Calculate ROC curve
3598
+ # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
3599
+ if y_pred_proba_final is not None:
3600
+ fpr, tpr, _ = roc_curve(y_true, y_pred_proba_final)
3601
+ lower_ci, upper_ci = cal_auc_ci(
3602
+ y_true, y_pred_proba_final, verbose=False, is_binary=is_binary
3603
+ )
3604
+ roc_auc = auc(fpr, tpr)
3605
+ roc_info = {
3606
+ "fpr": fpr.tolist(),
3607
+ "tpr": tpr.tolist(),
3608
+ "auc": roc_auc,
3609
+ "ci95": (lower_ci, upper_ci),
3610
+ }
3611
+ # precision-recall curve
3612
+ precision_, recall_, _ = cal_precision_recall(y_true, y_pred_proba_final)
3613
+ avg_precision_ = average_precision_score(y_true, y_pred_proba_final)
3614
+ pr_info = {
3615
+ "precision": precision_,
3616
+ "recall": recall_,
3617
+ "avg_precision": avg_precision_,
3618
+ }
3619
+ else:
3620
+ roc_info, pr_info = None, None
3621
+ if purpose == "classification":
3622
+ dict_pred_stack = {
3623
+ "best_clf": stacking_model,
3624
+ "best_params": None,
3625
+ "auc_indiv": None,
3626
+ "scores": validation_scores_final,
3627
+ "roc_curve": roc_info,
3628
+ "pr_curve": pr_info,
3629
+ "confusion_matrix": confusion_matrix(y_true, y_pred_final),
3630
+ "predictions": y_pred_final.tolist(),
3631
+ "predictions_proba": (
3632
+ y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
3633
+ ),
3634
+ }
3635
+ else: # "regression"
3636
+ dict_pred_stack = {
3637
+ "best_clf": stacking_model,
3638
+ "best_params": None,
3639
+ "scores": validation_scores_final, # e.g., neg_MSE, R², etc.
3640
+ "predictions": y_pred_final.tolist(),
3641
+ "predictions_proba": (
3642
+ y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
3643
+ ),
3644
+ }
3645
+ else: # multi-classes
3646
+ if y_pred_proba_final is not None:
3647
+ # fpr, tpr, roc_auc = dict(), dict(), dict()
3648
+ # fpr, tpr, _ = roc_curve(y_true, y_pred_proba_final)
3649
+ confidence_intervals = cal_auc_ci(
3650
+ y_true, y_pred_proba_final, verbose=False, is_binary=is_binary
3651
+ )
3652
+ roc_info = {
3653
+ "fpr": validation_scores_final["fpr"],
3654
+ "tpr": validation_scores_final["tpr"],
3655
+ "auc": validation_scores_final["roc_auc_by_class"],
3656
+ "ci95": confidence_intervals,
3657
+ }
3658
+ # precision-recall curve
3659
+ precision_, recall_, avg_precision_ = cal_precision_recall(
3660
+ y_true, y_pred_proba_final, is_binary=is_binary
3661
+ )
3662
+ pr_info = {
3663
+ "precision": precision_,
3664
+ "recall": recall_,
3665
+ "avg_precision": avg_precision_,
3666
+ }
3667
+ else:
3668
+ roc_info, pr_info = None, None
3669
+
3670
+ if purpose == "classification":
3671
+ dict_pred_stack = {
3672
+ "best_clf": stacking_model,
3673
+ "best_params": None,
3674
+ "auc_indiv": None,
3675
+ "scores": validation_scores_final,
3676
+ "roc_curve": roc_info,
3677
+ "pr_curve": pr_info,
3678
+ "confusion_matrix": confusion_matrix(y_true, y_pred_final),
3679
+ "predictions": y_pred_final.tolist(),
3680
+ "predictions_proba": (
3681
+ y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
3682
+ ),
3683
+ }
3684
+ else: # "regression"
3685
+ dict_pred_stack = {
3686
+ "best_clf": stacking_model,
3687
+ "best_params": None,
3688
+ "scores": validation_scores_final, # e.g., neg_MSE, R², etc.
3689
+ "predictions": y_pred_final.tolist(),
3690
+ "predictions_proba": (
3691
+ y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
3692
+ ),
3693
+ }
3694
+
3695
+ else:
3696
+ if y_true is None:
3697
+ validation_scores_final = []
3698
+ else:
3699
+ validation_scores_final = cal_metrics(
3700
+ y_true,
3701
+ y_pred,
3702
+ y_pred_proba=y_pred_proba_final,
3703
+ is_binary=is_binary,
3704
+ purpose=purpose,
3705
+ average="weighted",
3706
+ )
3707
+ dict_pred_stack = {
3708
+ "best_clf": stacking_model,
3709
+ "best_params": None,
3710
+ "scores": validation_scores_final,
3711
+ "predictions": y_pred_final.tolist(),
3712
+ "predictions_proba": (
3713
+ y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
3714
+ ),
3715
+ "y_train": y_train if y_train is not None else [],
3716
+ "y_true": y_true if y_true is not None else [],
3717
+ }
3718
+ # merge together
3719
+ df_pred = pd.DataFrame(
3720
+ [None] * len(df_results.columns), index=df_results.columns, columns=["stack"]).T
3721
+ for k, v in dict_pred_stack.items():
3722
+ if k in df_pred.columns:
3723
+ df_pred[k] = [v]
3724
+
3725
+ # # plot the stacking
3726
+ # if all([plot_, y_true is not None, purpose == "classification"]):
3727
+ # plot_validate_features_single(df_pred, is_binary=is_binary)
3728
+ # if dir_save:
3729
+ # ips.figsave(dir_save + f"validate_features_stacking_{now_}.pdf")
3730
+ if vote:
3731
+ print(f" ⤵ voting...")
3732
+ from sklearn.ensemble import VotingClassifier, VotingRegressor
3733
+ #! Voting
3734
+ n_top_models = min(n_top_models, df_results.shape[0])
3735
+ base_estimators=[]
3736
+ for name, cls in zip(list(df_results.iloc[:n_top_models, :].index),df_results.iloc[:n_top_models, :]["best_clf"].tolist()):
3737
+ base_estimators.append((name,cls))
3738
+ # Apply Voting Classifier/Regressor
3739
+ if purpose == "classification":
3740
+ print(f" ⤵ VotingClassifier...via{votting}")
3741
+ if voting=='hard':
3742
+ # Hard voting does not support `predict_proba`
3743
+ voting_model = VotingClassifier(estimators=base_estimators)
3744
+ else:
3745
+ # Soft voting supports `predict_proba`
3746
+ voting_model = VotingClassifier(estimators=base_estimators, voting="soft")
3747
+ else:
3748
+ print(f" ⤵ VotingRegressor...")
3749
+ voting_model = VotingRegressor(estimators=base_estimators)
3750
+
3751
+ # Train the Voting Classifier/Regressor
3752
+ try:
3753
+ voting_model.fit(x_train, y_train)
3754
+ y_pred_vote = voting_model.predict(x_true)
3755
+ except Exception as e:
3756
+ if purpose == "classification" and not voting=='hard':
3757
+ voting_model = VotingClassifier(estimators=base_estimators)
3758
+ voting_model.fit(x_train, y_train)
3759
+ y_pred_vote = voting_model.predict(x_true)
3760
+
3761
+ # Calculate predicted probabilities if applicable
3762
+ if purpose == "classification":
3763
+ if hasattr(voting_model, "predict_proba"):
3764
+ y_pred_proba_vote = voting_model.predict_proba(x_true)
3765
+ print("Shape of predicted probabilities:", y_pred_proba_vote.shape)
3766
+ if y_pred_proba_vote.shape[1] == 1:
3767
+ y_pred_proba_vote = np.hstack(
3768
+ [1 - y_pred_proba_vote, y_pred_proba_vote]
3769
+ ) # Add missing class probabilities
3770
+ y_pred_proba_vote = y_pred_proba_vote[:, 1]
3771
+ else:
3772
+ y_pred_proba_vote = None
3773
+ else: # Regression
3774
+ y_pred_proba_vote = None
3775
+
3776
+ print(f" ⤵ collecting voting results...")
3777
+ #! dict_pred_vote
3778
+ dict_pred_vote = {}
3779
+ validation_scores_vote = {}
3780
+ if y_true is not None and y_pred_proba_vote is not None:
3781
+ validation_scores_vote = cal_metrics(
3782
+ y_true,
3783
+ y_pred_vote,
3784
+ y_pred_proba=y_pred_proba_vote,
3785
+ is_binary=is_binary,
3786
+ purpose=purpose,
3787
+ average="weighted",
3788
+ )
3789
+
3790
+ if is_binary:
3791
+ if y_pred_proba_vote is not None:
3792
+ fpr, tpr, _ = roc_curve(y_true, y_pred_proba_vote)
3793
+ lower_ci, upper_ci = cal_auc_ci(
3794
+ y_true, y_pred_proba_vote, verbose=False, is_binary=is_binary
3795
+ )
3796
+ roc_auc = auc(fpr, tpr)
3797
+ roc_info = {
3798
+ "fpr": fpr.tolist(),
3799
+ "tpr": tpr.tolist(),
3800
+ "auc": roc_auc,
3801
+ "ci95": (lower_ci, upper_ci),
3802
+ }
3803
+ precision_, recall_, _ = cal_precision_recall(y_true, y_pred_proba_vote)
3804
+ avg_precision_ = average_precision_score(y_true, y_pred_proba_vote)
3805
+ pr_info = {
3806
+ "precision": precision_,
3807
+ "recall": recall_,
3808
+ "avg_precision": avg_precision_,
3809
+ }
3810
+ else:
3811
+ roc_info, pr_info = None, None
3812
+
3813
+ dict_pred_vote = {
3814
+ "best_clf": voting_model,
3815
+ "best_params": None,
3816
+ "auc_indiv": None,
3817
+ "scores": validation_scores_vote,
3818
+ "roc_curve": roc_info,
3819
+ "pr_curve": pr_info,
3820
+ "confusion_matrix": confusion_matrix(y_true, y_pred_vote),
3821
+ "predictions": y_pred_vote.tolist(),
3822
+ "predictions_proba": (
3823
+ y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
3824
+ ),
3825
+ }
3826
+ else: # Multi-class
3827
+ if y_pred_proba_vote is not None:
3828
+ confidence_intervals = cal_auc_ci(
3829
+ y_true, y_pred_proba_vote, verbose=False, is_binary=is_binary
3830
+ )
3831
+ roc_info = {
3832
+ "fpr": validation_scores_vote["fpr"],
3833
+ "tpr": validation_scores_vote["tpr"],
3834
+ "auc": validation_scores_vote["roc_auc_by_class"],
3835
+ "ci95": confidence_intervals,
3836
+ }
3837
+ precision_, recall_, avg_precision_ = cal_precision_recall(
3838
+ y_true, y_pred_proba_vote, is_binary=is_binary
3839
+ )
3840
+ pr_info = {
3841
+ "precision": precision_,
3842
+ "recall": recall_,
3843
+ "avg_precision": avg_precision_,
3844
+ }
3845
+ else:
3846
+ roc_info, pr_info = None, None
3847
+
3848
+ dict_pred_vote = {
3849
+ "best_clf": voting_model,
3850
+ "best_params": None,
3851
+ "scores": validation_scores_vote,
3852
+ "roc_curve": roc_info,
3853
+ "pr_curve": pr_info,
3854
+ "confusion_matrix": confusion_matrix(y_true, y_pred_vote),
3855
+ "predictions": y_pred_vote.tolist(),
3856
+ "predictions_proba": (
3857
+ y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
3858
+ ),
3859
+ }
3860
+ else:
3861
+ if y_true is None:
3862
+ validation_scores_vote = []
3863
+ else:
3864
+ validation_scores_vote = cal_metrics(
3865
+ y_true,
3866
+ y_pred,
3867
+ y_pred_proba=y_pred_proba_vote,
3868
+ is_binary=is_binary,
3869
+ purpose=purpose,
3870
+ average="weighted",
3871
+ )
3872
+ dict_pred_vote = {
3873
+ "best_clf": voting_model,
3874
+ "best_params": None,
3875
+ "scores": validation_scores_vote,
3876
+ "predictions": y_pred_vote.tolist(),
3877
+ "predictions_proba": (
3878
+ y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
3879
+ ),
3880
+ "y_train": y_train if y_train is not None else [],
3881
+ "y_true": y_true if y_true is not None else [],
3882
+ }
3883
+ df_vote = pd.DataFrame(
3884
+ [None] * len(df_results.columns), index=df_results.columns, columns=["vote"]).T
3885
+ for k, v in dict_pred_vote.items():
3886
+ if k in df_vote.columns:
3887
+ df_vote[k] = [v]
3888
+
3889
+ # if all([plot_, y_true is not None, purpose == "classification"]):
3890
+ # try:
3891
+ # plot_validate_features_single(df_vote, is_binary=is_binary)
3892
+ # if dir_save:
3893
+ # ips.figsave(dir_save + f"validate_features_vote_{now_}.pdf")
3894
+ # except Exception as e:
3895
+ # print(e)
3896
+ print("Done")
3897
+ if vote and stack:
3898
+ df_res=pd.concat([df_pred,df_vote, df_results],ignore_index=False,axis=0)
3899
+ elif vote:
3900
+ df_res=pd.concat([df_vote, df_results],ignore_index=False,axis=0)
3901
+ elif stack:
3902
+ df_res=pd.concat([df_pred,df_results],ignore_index=False,axis=0)
3903
+
3217
3904
  if all([plot_, y_true is not None, purpose == "classification"]):
3905
+ from datetime import datetime
3906
+
3907
+ now_ = datetime.now().strftime("%y%m%d_%H%M%S")
3218
3908
  # try:
3219
- if len(models) > 3:
3220
- plot_validate_features(df_results, is_binary=is_binary)
3909
+ if df_res.shape[0] > 3:
3910
+ plot_validate_features(df_res, is_binary=is_binary)
3221
3911
  else:
3222
- plot_validate_features_single(df_results, is_binary=is_binary)
3912
+ plot_validate_features_single(df_res, is_binary=is_binary)
3223
3913
  if dir_save:
3224
3914
  ips.figsave(dir_save + f"validate_features{now_}.pdf")
3225
- # except Exception as e:
3226
- # print(f"Error: 在画图的过程中出现了问题:{e}")
3227
- return df_results
3228
-
3915
+ # except Exception as e:
3916
+ # print(f"Error: 在画图的过程中出现了问题:{e}")
3917
+ return df_res
3229
3918
 
3230
3919
  def cal_metrics(
3231
3920
  y_true,
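Stepping back from the diff: the stacking branch added above builds directly on sklearn's StackingClassifier / StackingRegressor. A minimal standalone sketch with toy data; note that the stacker clones and re-fits its base estimators internally, even when already-fitted best_clf objects are passed in, which matches how the code above uses them:

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier, StackingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC

    X, y = make_classification(n_samples=300, n_features=10, random_state=0)
    base_estimators = [
        ("rf", RandomForestClassifier(random_state=1)),
        ("svm", SVC(probability=True, random_state=1)),
    ]
    stacker = StackingClassifier(
        estimators=base_estimators,
        final_estimator=LogisticRegression(max_iter=1000),  # mirrors the default used above
        cv=5,
    )
    stacker.fit(X, y)
    print(stacker.predict_proba(X[:5]))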
@@ -3367,7 +4056,7 @@ def cal_metrics(
3367
4056
 
3368
4057
 
3369
4058
  def plot_trees(
3370
- X, y, cls, max_trees=500, test_size=0.2, random_state=42, early_stopping_rounds=None
4059
+ X, y, cls:str='random', max_trees=500, test_size=0.2, random_state=42, early_stopping_rounds=None
3371
4060
  ):
3372
4061
  """
3373
4062
  # # Example usage:
@@ -3413,10 +4102,14 @@ def plot_trees(
3413
4102
  train_error_rate = []
3414
4103
  test_error_rate = []
3415
4104
  validation_error = None
3416
-
4105
+ if isinstance(cls, str):
4106
+ cls=ips.strcmp(cls, ["RandomForestClassifier","ExtraTreesClassifier","AdaBoostClassifier","GradientBoostingClassifier"])
3417
4107
  # Configure classifier based on type
3418
4108
  oob_enabled = False # Default to no OOB error unless explicitly set
3419
-
4109
+ clf_support = {"RandomForestClassifier":RandomForestClassifier(),
4110
+ "ExtraTreesClassifier":ExtraTreesClassifier(),
4111
+ "AdaBoostClassifier":AdaBoostClassifier(),
4112
+ "GradientBoostingClassifier":GradientBoostingClassifier()}
3420
4113
  if isinstance(cls, (RandomForestClassifier, ExtraTreesClassifier)):
3421
4114
  # Enable OOB if cls supports it and is using bootstrapping
3422
4115
  cls.set_params(warm_start=True, n_estimators=1)
@@ -3678,7 +4371,7 @@ def img_datasets_preprocessing(
3678
4371
 
3679
4372
 
3680
4373
  def backward_regression(
3681
- X: pd.DataFrame, y: pd.Series, initial_list=[], threshold_out=0.05, verbose=True
4374
+ X: pd.DataFrame, y: pd.Series, initial_list=[], thr=0.05, verbose=True
3682
4375
  ):
3683
4376
  """
3684
4377
  # awesome bit of code from https://www.kaggle.com/code/adibouayjan/house-price-step-by-step-modeling
@@ -3690,31 +4383,46 @@ def backward_regression(
3690
4383
  X -- features values
3691
4384
  y -- target variable
3692
4385
  initial_list -- features header
3693
- threshold_out -- pvalue threshold of features to drop
4386
+ thr -- pvalue threshold of features to drop
3694
4387
  verbose -- true to produce lots of logging output
3695
4388
 
3696
4389
  Returns:
3697
4390
  list of selected features for modeling
3698
4391
  """
3699
4392
  import statsmodels.api as sm
3700
-
3701
- if isinstance(y, str) and y in X.columns:
3702
- y_col_name = y
3703
- y = X[y]
3704
- X = X.drop(y_col_name, axis=1)
4393
+ if isinstance(y, str):
4394
+ if y in X.columns:
4395
+ y_col_name = y
4396
+ y = X[y]
4397
+ X = X.drop(y_col_name, axis=1)
4398
+ else:
4399
+ raise ValueError(f"找不到{y},y设置有误")
4400
+ X = X.select_dtypes(include=[np.number])
4401
+
3705
4402
  included = list(X.columns)
4403
+ try:
4404
+ X=X.astype(float)
4405
+ y=y.astype(float)
4406
+ except Exception as e:
4407
+ raise ValueError(f"无法把数据类型转换成float类型,因而无法进一步进行统计分析: {e}")
4408
+
4409
+
3706
4410
  while True:
3707
4411
  changed = False
4412
+ if not included:
4413
+ print("No features remain in the model.")
4414
+ break
4415
+
3708
4416
  model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
3709
4417
  # exclude the intercept for p-value checking
3710
4418
  pvalues = model.pvalues.iloc[1:]
3711
4419
  worst_pval = pvalues.max()
3712
- if worst_pval > threshold_out:
4420
+ if worst_pval > thr:
3713
4421
  changed = True
3714
4422
  worst_feature = pvalues.idxmax()
3715
4423
  included.remove(worst_feature)
3716
4424
  if verbose:
3717
- print(f"Removing feature '{worst_feature}' with p-value {worst_pval}")
4425
+ print(f"Removing '{worst_feature}' with p-value={round(worst_pval,2)}")
3718
4426
  if not changed:
3719
4427
  break
3720
4428
  print(f"\nSelected Features:\n{included}")