py2ls 0.2.4.23__py3-none-any.whl → 0.2.4.25__py3-none-any.whl

py2ls/ml2ls.py CHANGED
@@ -5,7 +5,6 @@ from sklearn.ensemble import (
  BaggingClassifier,
  )
  from sklearn.svm import SVC, SVR
- from sklearn.calibration import CalibratedClassifierCV
  from sklearn.model_selection import GridSearchCV, StratifiedKFold
  from sklearn.linear_model import (
  LassoCV,
@@ -16,12 +15,7 @@ from sklearn.linear_model import (
  RidgeClassifierCV,
  ElasticNet,
  )
- from sklearn.feature_selection import RFE
- from sklearn.naive_bayes import GaussianNB
- from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
- import xgboost as xgb # Make sure you have xgboost installed
-
- from sklearn.model_selection import train_test_split, cross_val_score
+
  from sklearn.metrics import (
  accuracy_score,
  precision_score,
@@ -36,18 +30,12 @@ from sklearn.metrics import (
  precision_recall_curve,
  average_precision_score,
  )
- from imblearn.over_sampling import SMOTE
- from sklearn.pipeline import Pipeline
- from collections import defaultdict
- from sklearn.preprocessing import StandardScaler, OneHotEncoder
  from typing import Dict, Any, Optional, List, Union
  import numpy as np
  import pandas as pd
  from . import ips
  from . import plot
  import matplotlib.pyplot as plt
- import seaborn as sns
-
  plt.style.use(str(ips.get_cwd()) + "/data/styles/stylelib/paper.mplstyle")
  import logging
  import warnings
@@ -314,6 +302,8 @@ def features_svm(
  - Use case: It’s not as widely used as the RBF or linear kernel but can be explored when there is some evidence of non-linear
  S-shaped relationships.
  """
+ from sklearn.feature_selection import RFE
+ from sklearn.svm import SVC
  # SVM (Support Vector Machines)
  svc = SVC(kernel=rfe_params["kernel"]) # ["linear", "rbf", "poly", "sigmoid"]
  # RFE(Recursive Feature Elimination)
@@ -450,6 +440,7 @@ def validate_classifier(
  Returns:
  - results: Dictionary containing average cv_train_scores and cv_test_scores.
  """
+ from sklearn.model_selection import cross_val_score
  cv_train_scores = {metric: [] for metric in metrics}
  skf = StratifiedKFold(n_splits=cv_folds)
  # Perform cross-validation
@@ -982,6 +973,8 @@ def validate_features(

  """
  from tqdm import tqdm
+ from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
+ from sklearn.calibration import CalibratedClassifierCV

  # Ensure common features are selected
  common_features = ips.shared(
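The two imports moved into validate_features above are typically combined so that LDA scores become calibrated probabilities. A minimal standalone sketch, not the function's actual body; x_train, y_train and x_true are assumed to be a prepared feature matrix, labels and test features:

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.calibration import CalibratedClassifierCV

# wrap LDA so that predict_proba returns calibrated probabilities
calibrated_lda = CalibratedClassifierCV(LinearDiscriminantAnalysis(), method="sigmoid", cv=5)
# calibrated_lda.fit(x_train, y_train)
# proba = calibrated_lda.predict_proba(x_true)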
@@ -1001,6 +994,7 @@ def validate_features(

  # Handle class imbalance using SMOTE
  if smote:
+ from imblearn.over_sampling import SMOTE
  if (
  y_train.value_counts(normalize=True).max() < 0.8
  ): # Threshold to decide if data is imbalanced
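For reference, the lazily imported SMOTE oversamples the minority class of the training split when the class distribution is skewed. A minimal sketch with the standard imblearn API; the variable names are illustrative:

from imblearn.over_sampling import SMOTE

smote_sampler = SMOTE(random_state=1)
# resample only the training data, never the held-out set
# x_train_res, y_train_res = smote_sampler.fit_resample(x_train, y_train)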
@@ -2096,19 +2090,136 @@ def rank_models(
  # )

  # figsave("classifier_performance.pdf")
+ def rank_models_reg(df, ascending=False):
+ """
+ Sorts models based on MSE, RMSE, MAE, and R² with custom priority logic.

+ Parameters:
+ df (pd.DataFrame): DataFrame containing the regression metrics.
+ ascending (bool): Whether to sort in ascending order of ranking score.
+
+ Returns:
+ pd.DataFrame: Sorted DataFrame with an added "Ranking_Score" column.
+ """
+ # Define weights for the 4 metrics
+ weights = {
+ "mse": -1, # Lower is better
+ "rmse": -1, # Lower is better
+ "mae": -1, # Lower is better
+ "r2": 1, # Higher is better
+ }
+
+ # Normalize the selected metrics
+ df = df.copy() # Work on a copy of the DataFrame
+ for metric, weight in weights.items():
+ if metric in df.columns:
+ if weight > 0: # Higher is better; normalize 0-1
+ df[metric + "_normalized"] = (df[metric] - df[metric].min()) / (
+ df[metric].max() - df[metric].min()
+ )
+ else: # Lower is better; reverse normalize 0-1
+ df[metric + "_normalized"] = (df[metric].max() - df[metric]) / (
+ df[metric].max() - df[metric].min()
+ )
+
+ # Calculate ranking score as a weighted sum
+ df["Ranking_Score"] = sum(
+ df[metric + "_normalized"] * abs(weights[metric])
+ for metric in weights.keys()
+ if metric + "_normalized" in df.columns
+ )
+
+ # Sort models based on the ranking score
+ sorted_df = df.sort_values(by="Ranking_Score", ascending=ascending)
+ return sorted_df
+
+ models_support = {
+ "classification": {
+ "Random Forest": "Tree-Based",
+ "SVM": "Kernel-Based",
+ "Logistic Regression": "Linear",
+ "Lasso Logistic Regression": "Linear",
+ "Gradient Boosting": "Tree-Based",
+ "XGBoost": "Tree-Based",
+ "KNN": "Instance-Based",
+ "Naive Bayes": "Probabilistic",
+ "Linear Discriminant Analysis": "Linear",
+ "AdaBoost": "Tree-Based",
+ "CatBoost": "Tree-Based",
+ "Extra Trees": "Tree-Based",
+ "Bagging": "Tree-Based",
+ "Neural Network": "Neural Network",
+ "DecisionTree": "Tree-Based",
+ "Quadratic Discriminant Analysis": "Probabilistic",
+ "Ridge": "Linear",
+ "Perceptron": "Linear",
+ "Bernoulli Naive Bayes": "Probabilistic",
+ "SGDClassifier": "Linear",
+ },
+ "regression": {
+ "Linear Regression": "Linear",
+ "Ridge": "Linear",
+ "RidgeCV": "Linear",
+ "TheilSenRegressor": "Linear",
+ "HuberRegressor": "Linear",
+ "PoissonRegressor": "Linear",
+ "LassoCV": "Linear",
+ "Bagging": "Tree-Based",
+ "ElasticNet": "Linear",
+ "Random Forest": "Tree-Based",
+ "Gradient Boosting": "Tree-Based",
+ "XGBoost": "Tree-Based",
+ "CatBoost": "Tree-Based",
+ "Extra Trees": "Tree-Based",
+ "SVM": "Kernel-Based",
+ "KNN": "Instance-Based",
+ "Neural Network": "Neural Network",
+ "AdaBoost": "Linear",
+ },
+ }
+ def select_top_models(models, categories, n_top_models, n_models_per_category=1):
+ """
+ models = list_sort
+ purpose = "regression"
+ categories = models_support[purpose]
+ n_top_models = 3
+ select_top_models(models, categories, n_top_models)
+ """
+ selected = {}
+ result = []
+ for model in models:
+ category = categories.get(model, "Unknown")
+ if category not in selected:
+ selected[category] = 0 # Initialize counter for the category
+
+ if selected[category] < n_models_per_category: # Allow additional models up to the limit
+ selected[category] += 1
+ result.append(model)
+
+ if len(result) == n_top_models: # Stop when the desired number of models is reached
+ break
+
+ return result

  def predict(
  x_train: pd.DataFrame,
  y_train: pd.Series,
  x_true: pd.DataFrame = None,
  y_true: Optional[pd.Series] = None,
+ fill_missing:bool = True,
+ scaler:str='standard',# ["standard", "minmax", "robust","maxabs"]
  backward: bool = False, # backward_regression
+ backward_thr:float = 0.05,# pval thr, only works when backward is True
  common_features: set = None,
  purpose: str = "classification", # 'classification' or 'regression'
  cls: Optional[Dict[str, Any]] = None,
  metrics: Optional[List[str]] = None,
- random_state: int = 1,
+ stack:bool=True,# run stacking
+ stacking_cv:bool=False,# stacking cross_validate, default(False), keep it simple
+ vote:bool=True,# run voting
+ voting:str="hard", # only for the classification purpose of voting
+ n_top_models:int=5, # for stacking models
+ n_models_per_category:int=1, # for stacking models; allows up to 2 models from the same category
  smote: bool = False,
  n_jobs: int = -1,
  plot_: bool = True,
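A short sketch of how the new helpers in this hunk are meant to compose, assuming rank_models_reg, select_top_models, and models_support are in scope; the metric values are made up for illustration:

import pandas as pd

scores = pd.DataFrame(
    {"mse": [4.1, 3.2, 5.0], "rmse": [2.0, 1.8, 2.2],
     "mae": [1.5, 1.3, 1.7], "r2": [0.71, 0.78, 0.65]},
    index=["Random Forest", "XGBoost", "Linear Regression"],
)
ranked = rank_models_reg(scores, ascending=False)  # best Ranking_Score first
top = select_top_models(
    models=list(ranked.index),
    categories=models_support["regression"],
    n_top_models=2,
    n_models_per_category=1,
)
print(top)  # e.g. ['XGBoost', 'Linear Regression']: at most one model per category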
@@ -2117,6 +2228,7 @@ def predict(
2117
2228
  cv_folds: int = 5, # more cv_folds gives more stable results, but the AUC may be lower
2118
2229
  cv_level: str = "l", # "s":'low',"m":'medium',"l":"high"
2119
2230
  class_weight: str = "balanced",
2231
+ random_state: int = 1,
2120
2232
  verbose: bool = False,
2121
2233
  ) -> pd.DataFrame:
2122
2234
  """
@@ -2184,10 +2296,17 @@ def predict(
2184
2296
  RidgeClassifierCV,
2185
2297
  Perceptron,
2186
2298
  SGDClassifier,
2299
+ RidgeCV,
2300
+ Ridge,
2301
+ TheilSenRegressor,
2302
+ HuberRegressor,
2303
+ PoissonRegressor,
2304
+
2187
2305
  )
2306
+ from sklearn.compose import TransformedTargetRegressor
2188
2307
  from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
2189
2308
  from sklearn.naive_bayes import GaussianNB, BernoulliNB
2190
- from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
2309
+ from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor,StackingClassifier,StackingRegressor
2191
2310
  import xgboost as xgb
2192
2311
  import lightgbm as lgb
2193
2312
  import catboost as cb
@@ -2198,6 +2317,7 @@ def predict(
2198
2317
  QuadraticDiscriminantAnalysis,
2199
2318
  )
2200
2319
  from sklearn.preprocessing import PolynomialFeatures
2320
+ from sklearn.model_selection import train_test_split
2201
2321
 
2202
2322
  # spelling check
2203
2323
  purpose = ips.strcmp(purpose, ["classification", "regression"])[0]
@@ -2206,7 +2326,7 @@ def predict(
2206
2326
  if purpose == "classification":
2207
2327
  model_ = {
2208
2328
  "Random Forest": RandomForestClassifier(
2209
- random_state=random_state, class_weight=class_weight
2329
+ random_state=random_state, class_weight=class_weight,n_jobs=n_jobs
2210
2330
  ),
2211
2331
  # SVC (Support Vector Classification)
2212
2332
  "SVM": SVC(
@@ -2217,7 +2337,7 @@ def predict(
2217
2337
  ),
2218
2338
  # fit the best model without enforcing sparsity, which means it does not directly perform feature selection.
2219
2339
  "Logistic Regression": LogisticRegression(
2220
- class_weight=class_weight, random_state=random_state
2340
+ class_weight=class_weight, random_state=random_state,n_jobs=n_jobs
2221
2341
  ),
2222
2342
  # Logistic Regression with L1 Regularization (Lasso)
2223
2343
  "Lasso Logistic Regression": LogisticRegression(
@@ -2228,53 +2348,54 @@ def predict(
2228
2348
  eval_metric="logloss",
2229
2349
  random_state=random_state,
2230
2350
  ),
2231
- "KNN": KNeighborsClassifier(n_neighbors=5),
2351
+ "KNN": KNeighborsClassifier(n_neighbors=5,n_jobs=n_jobs),
2232
2352
  "Naive Bayes": GaussianNB(),
2233
2353
  "Linear Discriminant Analysis": LinearDiscriminantAnalysis(),
2234
2354
  "AdaBoost": AdaBoostClassifier(
2235
2355
  algorithm="SAMME", random_state=random_state
2236
2356
  ),
2237
- # "LightGBM": lgb.LGBMClassifier(random_state=random_state, class_weight=class_weight),
2357
+ "LightGBM": lgb.LGBMClassifier(random_state=random_state, class_weight=class_weight,n_jobs=n_jobs),
2238
2358
  "CatBoost": cb.CatBoostClassifier(verbose=0, random_state=random_state),
2239
2359
  "Extra Trees": ExtraTreesClassifier(
2240
- random_state=random_state, class_weight=class_weight
2360
+ random_state=random_state, class_weight=class_weight,n_jobs=n_jobs
2241
2361
  ),
2242
- "Bagging": BaggingClassifier(random_state=random_state),
2362
+ "Bagging": BaggingClassifier(random_state=random_state,n_jobs=n_jobs),
2243
2363
  "Neural Network": MLPClassifier(max_iter=500, random_state=random_state),
2244
2364
  "DecisionTree": DecisionTreeClassifier(),
2245
2365
  "Quadratic Discriminant Analysis": QuadraticDiscriminantAnalysis(),
2246
2366
  "Ridge": RidgeClassifierCV(
2247
2367
  class_weight=class_weight, store_cv_results=True
2248
2368
  ),
2249
- "Perceptron": Perceptron(random_state=random_state),
2369
+ "Perceptron": Perceptron(random_state=random_state,n_jobs=n_jobs),
2250
2370
  "Bernoulli Naive Bayes": BernoulliNB(),
2251
- "SGDClassifier": SGDClassifier(random_state=random_state),
2371
+ "SGDClassifier": SGDClassifier(random_state=random_state,n_jobs=n_jobs),
2252
2372
  }
2253
2373
  elif purpose == "regression":
2254
2374
  model_ = {
2255
- "Random Forest": RandomForestRegressor(random_state=random_state),
2375
+ "Random Forest": RandomForestRegressor(random_state=random_state,n_jobs=n_jobs),
2256
2376
  "SVM": SVR(), # SVR (Support Vector Regression)
2257
2377
  # "Lasso": Lasso(random_state=random_state), # 它和LassoCV相同(必须要提供alpha参数),
2258
2378
  "LassoCV": LassoCV(
2259
- cv=cv_folds, random_state=random_state
2379
+ cv=cv_folds, random_state=random_state,n_jobs=n_jobs
2260
2380
  ), # LassoCV finds the best alpha automatically and is preferable to Lasso
2261
2381
  "Gradient Boosting": GradientBoostingRegressor(random_state=random_state),
2262
- "XGBoost": xgb.XGBRegressor(eval_metric="rmse", random_state=random_state),
2263
- "Linear Regression": LinearRegression(),
2264
- "Lasso": Lasso(random_state=random_state),
2382
+ "XGBoost": xgb.XGBRegressor(eval_metric="rmse", random_state=random_state,n_jobs=n_jobs),
2383
+ "Linear Regression": LinearRegression(n_jobs=n_jobs),
2265
2384
  "AdaBoost": AdaBoostRegressor(random_state=random_state),
2266
- # "LightGBM": lgb.LGBMRegressor(random_state=random_state),
2385
+ "LightGBM": lgb.LGBMRegressor(random_state=random_state,n_jobs=n_jobs,
2386
+ force_row_wise=True # Or use force_col_wise=True if memory is a concern
2387
+ ),
2267
2388
  "CatBoost": cb.CatBoostRegressor(verbose=0, random_state=random_state),
2268
- "Extra Trees": ExtraTreesRegressor(random_state=random_state),
2269
- "Bagging": BaggingRegressor(random_state=random_state),
2389
+ "Extra Trees": ExtraTreesRegressor(random_state=random_state,n_jobs=n_jobs),
2390
+ "Bagging": BaggingRegressor(random_state=random_state,n_jobs=n_jobs),
2270
2391
  "Neural Network": MLPRegressor(max_iter=500, random_state=random_state),
2271
2392
  "ElasticNet": ElasticNet(random_state=random_state),
2272
2393
  "Ridge": Ridge(),
2273
- "KNN": KNeighborsRegressor(),
2394
+ "KNN": KNeighborsRegressor(n_jobs=n_jobs),
2395
+ "TheilSen":TheilSenRegressor(n_jobs=n_jobs),
2396
+ "Huber":HuberRegressor(),
2397
+ "Poisson":PoissonRegressor()
2274
2398
  }
2275
- # indicate cls:
2276
- if ips.run_once_within(30): # 10 min
2277
- print(f"supported models: {list(model_.keys())}")
2278
2399
  if cls is None:
2279
2400
  models = model_
2280
2401
  else:
@@ -2290,6 +2411,10 @@ def predict(
2290
2411
  ips.df_special_characters_cleaner(x_true) if x_true is not None else None
2291
2412
  )
2292
2413
 
2414
+ # indicate cls:
2415
+ if ips.run_once_within(30): # 10 min
2416
+ print(f"processing: {list(models.keys())}")
2417
+ print(isinstance(y_train, str) and y_train in x_train.columns)
2293
2418
  if isinstance(y_train, str) and y_train in x_train.columns:
2294
2419
  y_train_col_name = y_train
2295
2420
  y_train = x_train[y_train]
@@ -2297,6 +2422,7 @@ def predict(
2297
2422
  x_train = x_train.drop(y_train_col_name, axis=1)
2298
2423
  # else:
2299
2424
  # y_train = ips.df_encoder(pd.DataFrame(y_train), method="dummy").values.ravel()
2425
+
2300
2426
  y_train = pd.DataFrame(y_train)
2301
2427
  if y_train.select_dtypes(include=np.number).empty:
2302
2428
  y_train_ = ips.df_encoder(y_train, method="dummy", drop=None)
@@ -2309,9 +2435,12 @@ def predict(
2309
2435
  y_train = ips.df_encoder(pd.DataFrame(y_train), method="label")
2310
2436
  print("is_binary:", is_binary)
2311
2437
 
2438
+ if fill_missing:
2439
+ ips.df_fillna(data=x_train, method="knn", inplace=True, axis=0)
2440
+ ips.df_fillna(data=y_train, method="knn", inplace=True, axis=0)
2312
2441
  # Perform backward feature selection
2313
2442
  if backward:
2314
- selected_features = backward_regression(x_train, y_train, threshold_out=0.05)
2443
+ selected_features = backward_regression(x_train, y_train, thr=backward_thr)
2315
2444
  x_train = x_train[selected_features]
2316
2445
 
2317
2446
  if x_true is None:
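fill_missing=True routes through ips.df_fillna(method="knn"); roughly what a KNN-based fill does, shown with scikit-learn rather than the package's own wrapper (an illustration, not its implementation):

from sklearn.impute import KNNImputer
import pandas as pd

imputer = KNNImputer(n_neighbors=5)
# x_train_filled = pd.DataFrame(imputer.fit_transform(x_train), columns=x_train.columns)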
@@ -2337,6 +2466,8 @@ def predict(
2337
2466
  pd.DataFrame(y_train), method="label"
2338
2467
  ).values.ravel()
2339
2468
 
2469
+ if fill_missing:
2470
+ ips.df_fillna(data=x_true, method="knn", inplace=True, axis=0)
2340
2471
  if y_true is not None:
2341
2472
  if isinstance(y_true, str) and y_true in x_true.columns:
2342
2473
  y_true_col_name = y_true
@@ -2369,11 +2500,16 @@ def predict(
2369
2500
  # Ensure common features are selected
2370
2501
  if common_features is not None:
2371
2502
  x_train, x_true = x_train[common_features], x_true[common_features]
2503
+ share_col_names=common_features
2372
2504
  else:
2373
2505
  share_col_names = ips.shared(x_train.columns, x_true.columns, verbose=verbose)
2374
2506
  x_train, x_true = x_train[share_col_names], x_true[share_col_names]
2375
2507
 
2376
- x_train, x_true = ips.df_scaler(x_train), ips.df_scaler(x_true)
2508
+ #! scaler
2509
+ # scaler and fit x_train and export scaler to fit the x_true
2510
+ x_train,scaler_=ips.df_scaler(x_train,method=scaler,return_scaler=True)
2511
+ #
2512
+ x_true=ips.df_scaler(x_true,scaler=scaler_)# make sure the same scaler is applied
2377
2513
  x_train, x_true = ips.df_encoder(x_train, method="dummy"), ips.df_encoder(
2378
2514
  x_true, method="dummy"
2379
2515
  )
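The hunk above switches from scaling x_train and x_true independently to fitting one scaler on x_train and reusing it on x_true. With plain scikit-learn, instead of the package's ips.df_scaler helper, that pattern looks like this (stand-in data for the sketch):

from sklearn.preprocessing import StandardScaler
import pandas as pd

x_train = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [10.0, 20.0, 30.0]})  # stand-in data
x_true = pd.DataFrame({"a": [1.5, 2.5], "b": [15.0, 25.0]})
scaler_ = StandardScaler().fit(x_train)  # fit on the training data only
x_train_scaled = pd.DataFrame(scaler_.transform(x_train), columns=x_train.columns)
x_true_scaled = pd.DataFrame(scaler_.transform(x_true), columns=x_true.columns)  # reuse the same fitted scaler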
@@ -2395,7 +2531,19 @@ def predict(
2395
2531
  if isinstance(y_train, np.ndarray):
2396
2532
  y_true = ips.df_encoder(data=pd.DataFrame(y_true), method="label")
2397
2533
  y_true = np.asarray(y_true)
2534
+
2398
2535
  # Hyperparameter grids for tuning
2536
+ param_grid_common_xgb = {
2537
+ 'learning_rate': [0.01, 0.1, 0.2, 0.3],
2538
+ 'max_depth': [3, 5, 7, 10],
2539
+ 'n_estimators': [50, 100, 200, 300],
2540
+ 'subsample': [0.6, 0.8, 1.0],
2541
+ 'colsample_bytree': [0.6, 0.8, 1.0],
2542
+ 'gamma': [0, 0.1, 0.2, 0.5],
2543
+ 'min_child_weight': [1, 5, 10],
2544
+ 'reg_alpha': [0, 0.1, 0.5, 1], # L1 regularization term
2545
+ 'reg_lambda': [1, 1.5, 2], # L2 regularization term
2546
+ }
2399
2547
  if cv_level in ["low", "simple", "s", "l"]:
2400
2548
  param_grids = {
2401
2549
  "Random Forest": (
@@ -2440,12 +2588,17 @@ def predict(
2440
2588
  "min_samples_split": [2],
2441
2589
  "subsample": [0.8],
2442
2590
  },
2443
- "XGBoost": {
2444
- "n_estimators": [100],
2445
- "max_depth": [3],
2446
- "learning_rate": [0.1],
2447
- "subsample": [0.8],
2448
- "colsample_bytree": [0.8],
2591
+ "XGBoost":{
2592
+ 'learning_rate': [0.01],
2593
+ 'max_depth': [3],
2594
+ 'n_estimators': [50],
2595
+ 'subsample': [0.6],
2596
+ 'colsample_bytree': [0.6],
2597
+ 'gamma': [0, 0.1],
2598
+ 'min_child_weight': [1],
2599
+ 'reg_alpha': [0, 0.1],
2600
+ 'reg_lambda': [1],
2601
+ 'objective': ['binary:logistic'] if purpose == "classification" else ['reg:squarederror']
2449
2602
  },
2450
2603
  "KNN": (
2451
2604
  {
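These grids are consumed by GridSearchCV later in predict(); a standalone sketch of how such an XGBoost grid would be used (the estimator and scoring shown here are illustrative, not the function's exact call):

import xgboost as xgb
from sklearn.model_selection import GridSearchCV

xgb_grid = {
    "learning_rate": [0.01],
    "max_depth": [3],
    "n_estimators": [50],
    "subsample": [0.6],
    "colsample_bytree": [0.6],
}
gs = GridSearchCV(xgb.XGBClassifier(eval_metric="logloss"),
                  param_grid=xgb_grid, scoring="roc_auc", cv=5, n_jobs=-1)
# gs.fit(x_train, y_train); best = gs.best_estimator_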
@@ -2552,6 +2705,14 @@ def predict(
2552
2705
  "random_state": [random_state],
2553
2706
  "learning_rate": ["constant"],
2554
2707
  },
2708
+ "TheilSen":{'max_iter': [100],
2709
+ 'tol': [1e-4],
2710
+ 'n_subsamples': [100+x_train.shape[1]]},
2711
+ "Huber":{'epsilon': [1.35],
2712
+ 'alpha': [0.1],
2713
+ 'max_iter': [100],},
2714
+ "Poisson":{'alpha': [0.1],
2715
+ 'max_iter': [100],}
2555
2716
  }
2556
2717
  elif cv_level in ["high", "advanced", "h"]:
2557
2718
  param_grids = {
@@ -2613,12 +2774,30 @@ def predict(
2613
2774
  "subsample": [0.8, 1.0],
2614
2775
  },
2615
2776
  "XGBoost": {
2616
- "n_estimators": [100, 200, 500, 700],
2617
- "max_depth": [3, 5, 7, 10],
2618
- "learning_rate": [0.01, 0.1, 0.2, 0.3],
2619
- "subsample": [0.8, 1.0],
2620
- "colsample_bytree": [0.8, 0.9, 1.0],
2621
- },
2777
+ 'learning_rate': [0.01, 0.1, 0.2, 0.3],
2778
+ 'max_depth': [3, 5, 7, 10],
2779
+ 'n_estimators': [50, 100, 200, 300],
2780
+ 'subsample': [0.6, 0.8, 1.0],
2781
+ 'gamma': [0, 0.1, 0.2, 0.5],
2782
+ 'min_child_weight': [1, 5, 10],
2783
+ 'reg_alpha': [0, 0.1, 0.5, 1],
2784
+ 'reg_lambda': [1, 1.5, 2],
2785
+ **{
2786
+ 'objective': ['binary:logistic', 'multi:softmax', 'multi:softprob'],
2787
+ }} if purpose== "classification"
2788
+ else{
2789
+ 'learning_rate': [0.01, 0.1, 0.2, 0.3],
2790
+ 'max_depth': [3, 5, 7, 10],
2791
+ 'n_estimators': [50, 100, 200, 300],
2792
+ 'subsample': [0.6, 0.8, 1.0],
2793
+ 'colsample_bytree': [0.6, 0.8, 1.0],
2794
+ 'gamma': [0, 0.1, 0.2, 0.5],
2795
+ 'min_child_weight': [1, 5, 10],
2796
+ 'reg_alpha': [0, 0.1, 0.5, 1],
2797
+ 'reg_lambda': [1, 1.5, 2],
2798
+ **{
2799
+ 'objective': ['reg:squarederror', 'reg:squaredlogerror', 'reg:gamma'],
2800
+ }},
2622
2801
  "KNN": (
2623
2802
  {
2624
2803
  "n_neighbors": [1, 3, 5, 10, 15, 20],
@@ -2731,6 +2910,14 @@ def predict(
2731
2910
  ], # If True, the regressors X will be normalized
2732
2911
  }
2733
2912
  ),
2913
+ "TheilSen":{'max_iter': [100, 200, 300],
2914
+ 'tol': [1e-4, 1e-3, 1e-2],
2915
+ 'n_subsamples': [100+x_train.shape[1], 200+x_train.shape[1], 300+x_train.shape[1]]},
2916
+ "Huber":{'epsilon': [1.35, 1.5, 2.0],
2917
+ 'alpha': [0.1, 1.0, 10.0],
2918
+ 'max_iter': [100, 200, 300],},
2919
+ "Poisson":{'alpha': [0.1, 1.0, 10.0],
2920
+ 'max_iter': [100, 200, 300],}
2734
2921
  }
2735
2922
  else: # median level
2736
2923
  param_grids = {
@@ -2790,12 +2977,30 @@ def predict(
2790
2977
  "subsample": [0.8, 1.0],
2791
2978
  },
2792
2979
  "XGBoost": {
2793
- "n_estimators": [100, 200, 500],
2794
- "max_depth": [3, 5, 7],
2795
- "learning_rate": [0.01, 0.1, 0.2],
2796
- "subsample": [0.8, 1.0],
2797
- "colsample_bytree": [0.8, 1.0],
2798
- },
2980
+ 'learning_rate': [0.01, 0.1],
2981
+ 'max_depth': [3, 5],
2982
+ 'n_estimators': [50, 100],
2983
+ 'subsample': [0.6, 0.8],
2984
+ 'gamma': [0, 0.1],
2985
+ 'min_child_weight': [1, 5],
2986
+ 'reg_alpha': [0, 0.1],
2987
+ 'reg_lambda': [1,],
2988
+ **{
2989
+ 'objective': ['binary:logistic', 'multi:softmax'],
2990
+ }} if purpose== "classification"
2991
+ else{
2992
+ 'learning_rate': [0.01, 0.1],
2993
+ 'max_depth': [3, 5,],
2994
+ 'n_estimators': [50, 100],
2995
+ 'subsample': [0.6, 0.8],
2996
+ 'colsample_bytree': [0.6, 0.8],
2997
+ 'gamma': [0, 0.1],
2998
+ 'min_child_weight': [1, 5],
2999
+ 'reg_alpha': [0, 0.1],
3000
+ 'reg_lambda': [1, 1.5],
3001
+ **{
3002
+ 'objective': ['reg:squarederror', 'reg:squaredlogerror'],
3003
+ }},
2799
3004
  "KNN": (
2800
3005
  {
2801
3006
  "n_neighbors": [3, 5, 7, 10],
@@ -2952,6 +3157,14 @@ def predict(
2952
3157
  ], # Solver for optimization
2953
3158
  }
2954
3159
  ),
3160
+ "TheilSen":{'max_iter': [100, 200],
3161
+ 'tol': [1e-4, 1e-3],
3162
+ 'n_subsamples': [100+x_train.shape[1], 200+x_train.shape[1]]},
3163
+ "Huber":{'epsilon': [1.35, 1.5],
3164
+ 'alpha': [0.1, 1.0],
3165
+ 'max_iter': [100, 200],},
3166
+ "Poisson":{'alpha': [0.1, 1.0],
3167
+ 'max_iter': [100, 200],}
2955
3168
  }
2956
3169
 
2957
3170
  results = {}
@@ -2971,83 +3184,124 @@ def predict(
2971
3184
  ):
2972
3185
  if verbose:
2973
3186
  print(f"\nTraining and validating {name}:")
2974
-
2975
- # Grid search with KFold or StratifiedKFold
2976
- if is_binary:
2977
- gs = GridSearchCV(
2978
- clf,
2979
- param_grid=param_grids.get(name, {}),
2980
- scoring=(
2981
- "roc_auc"
2982
- if purpose == "classification"
2983
- else "neg_mean_squared_error"
2984
- ),
2985
- cv=cv,
2986
- n_jobs=n_jobs,
2987
- verbose=verbose,
2988
- )
2989
-
2990
- gs.fit(x_train, y_train)
2991
- best_clf = gs.best_estimator_
2992
- # make sure x_train and x_test has the same name
2993
- x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
2994
- y_pred = best_clf.predict(x_true)
2995
- if hasattr(best_clf, "predict_proba"):
2996
- y_pred_proba = best_clf.predict_proba(x_true)
2997
- print("Shape of predicted probabilities:", y_pred_proba.shape)
2998
- if y_pred_proba.shape[1] == 1:
2999
- y_pred_proba = np.hstack(
3000
- [1 - y_pred_proba, y_pred_proba]
3001
- ) # Add missing class probabilities
3002
- y_pred_proba = y_pred_proba[:, 1]
3003
- elif hasattr(best_clf, "decision_function"):
3004
- # If predict_proba is not available, use decision_function (e.g., for SVM)
3005
- y_pred_proba = best_clf.decision_function(x_true)
3006
- # Ensure y_pred_proba is within 0 and 1 bounds
3007
- y_pred_proba = (y_pred_proba - y_pred_proba.min()) / (
3008
- y_pred_proba.max() - y_pred_proba.min()
3187
+ try:
3188
+ # Grid search with KFold or StratifiedKFold
3189
+ if is_binary:
3190
+ gs = GridSearchCV(
3191
+ clf,
3192
+ param_grid=param_grids.get(name, {}),
3193
+ scoring=(
3194
+ "roc_auc"
3195
+ if purpose == "classification"
3196
+ else "neg_mean_squared_error"
3197
+ ),
3198
+ cv=cv,
3199
+ n_jobs=n_jobs,
3200
+ verbose=verbose,
3009
3201
  )
3010
- else:
3011
- y_pred_proba = None # No probability output for certain models
3012
- else:
3013
- gs = GridSearchCV(
3014
- clf,
3015
- param_grid=param_grids.get(name, {}),
3016
- scoring=(
3017
- "roc_auc_ovr"
3018
- if purpose == "classification"
3019
- else "neg_mean_squared_error"
3020
- ),
3021
- cv=cv,
3022
- n_jobs=n_jobs,
3023
- verbose=verbose,
3024
- )
3025
-
3026
- # Fit GridSearchCV
3027
- gs.fit(x_train, y_train)
3028
- best_clf = gs.best_estimator_
3029
-
3030
- # Ensure x_true aligns with x_train columns
3031
- x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
3032
- y_pred = best_clf.predict(x_true)
3033
-
3034
- # Handle prediction probabilities for multiclass
3035
- if hasattr(best_clf, "predict_proba"):
3036
- y_pred_proba = best_clf.predict_proba(x_true)
3037
- elif hasattr(best_clf, "decision_function"):
3038
- y_pred_proba = best_clf.decision_function(x_true)
3039
3202
 
3040
- # Normalize for multiclass if necessary
3041
- if y_pred_proba.ndim == 2:
3042
- y_pred_proba = (
3043
- y_pred_proba - y_pred_proba.min(axis=1, keepdims=True)
3044
- ) / (
3045
- y_pred_proba.max(axis=1, keepdims=True)
3046
- - y_pred_proba.min(axis=1, keepdims=True)
3203
+ gs.fit(x_train, y_train)
3204
+ best_clf = gs.best_estimator_
3205
+ # make sure x_train and x_test has the same name
3206
+ x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
3207
+ y_pred = best_clf.predict(x_true)
3208
+ if hasattr(best_clf, "predict_proba"):
3209
+ y_pred_proba = best_clf.predict_proba(x_true)
3210
+ print("Shape of predicted probabilities:", y_pred_proba.shape)
3211
+ if y_pred_proba.shape[1] == 1:
3212
+ y_pred_proba = np.hstack(
3213
+ [1 - y_pred_proba, y_pred_proba]
3214
+ ) # Add missing class probabilities
3215
+ y_pred_proba = y_pred_proba[:, 1]
3216
+ elif hasattr(best_clf, "decision_function"):
3217
+ # If predict_proba is not available, use decision_function (e.g., for SVM)
3218
+ y_pred_proba = best_clf.decision_function(x_true)
3219
+ # Ensure y_pred_proba is within 0 and 1 bounds
3220
+ y_pred_proba = (y_pred_proba - y_pred_proba.min()) / (
3221
+ y_pred_proba.max() - y_pred_proba.min()
3047
3222
  )
3223
+ else:
3224
+ y_pred_proba = None # No probability output for certain models
3225
+ # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
3226
+ if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
3227
+ if hasattr(best_clf, "alphas_"):
3228
+ alphas_ = best_clf.alphas_
3229
+ elif hasattr(best_clf, "alpha_"):
3230
+ alphas_ = best_clf.alpha_
3231
+ elif hasattr(best_clf, "Cs_"):
3232
+ alphas_ = best_clf.Cs_
3233
+ else:
3234
+ alphas_= None
3235
+ coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
3048
3236
  else:
3049
- y_pred_proba = None # No probability output for certain models
3237
+ gs = GridSearchCV(
3238
+ clf,
3239
+ param_grid=param_grids.get(name, {}),
3240
+ scoring=(
3241
+ "roc_auc_ovr"
3242
+ if purpose == "classification"
3243
+ else "neg_mean_squared_error"
3244
+ ),
3245
+ cv=cv,
3246
+ n_jobs=n_jobs,
3247
+ verbose=verbose,
3248
+ )
3050
3249
 
3250
+ # Fit GridSearchCV
3251
+ gs.fit(x_train, y_train)
3252
+ best_clf = gs.best_estimator_
3253
+
3254
+ # Ensure x_true aligns with x_train columns
3255
+ x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
3256
+
3257
+ # do i need to fit the x_train, y_train again?
3258
+ best_clf=best_clf.fit(x_train, y_train)
3259
+ y_pred = best_clf.predict(x_true)
3260
+
3261
+ # Handle prediction probabilities for multiclass
3262
+ if hasattr(best_clf, "predict_proba"):
3263
+ y_pred_proba = best_clf.predict_proba(x_true)
3264
+ elif hasattr(best_clf, "decision_function"):
3265
+ y_pred_proba = best_clf.decision_function(x_true)
3266
+
3267
+ # Normalize for multiclass if necessary
3268
+ if y_pred_proba.ndim == 2:
3269
+ y_pred_proba = (
3270
+ y_pred_proba - y_pred_proba.min(axis=1, keepdims=True)
3271
+ ) / (
3272
+ y_pred_proba.max(axis=1, keepdims=True)
3273
+ - y_pred_proba.min(axis=1, keepdims=True)
3274
+ )
3275
+ else:
3276
+ y_pred_proba = None # No probability output for certain models
3277
+ # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
3278
+ if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
3279
+ if hasattr(best_clf, "alphas_"):
3280
+ alphas_ = best_clf.alphas_
3281
+ elif hasattr(best_clf, "alpha_"):
3282
+ alphas_ = best_clf.alpha_
3283
+ elif hasattr(best_clf, "Cs_"):
3284
+ alphas_ = best_clf.Cs_
3285
+ else:
3286
+ alphas_= None
3287
+ coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
3288
+ except Exception as e:
3289
+ alphas_,coef_ = None,None
3290
+ print(f"skiped {clf}: {e}")
3291
+ continue
3292
+ # try to make the predict format consistent
3293
+ try:
3294
+ y_pred= [i[0] for i in y_pred]
3295
+ except:
3296
+ pass
3297
+ try:
3298
+ y_true= [i[0] for i in y_true]
3299
+ except:
3300
+ pass
3301
+ try:
3302
+ y_train= [i[0] for i in y_train]
3303
+ except:
3304
+ pass
3051
3305
  validation_scores = {}
3052
3306
 
3053
3307
  if y_true is not None and y_pred_proba is not None:
@@ -3097,20 +3351,26 @@ def predict(
3097
3351
  "roc_curve": roc_info,
3098
3352
  "pr_curve": pr_info,
3099
3353
  "confusion_matrix": confusion_matrix(y_true, y_pred),
3100
- "predictions": y_pred.tolist(),
3354
+ "predictions": y_pred,#.tolist(),
3101
3355
  "predictions_proba": (
3102
3356
  y_pred_proba.tolist() if y_pred_proba is not None else None
3103
3357
  ),
3358
+ "features":share_col_names,
3359
+ "coef":coef_,
3360
+ "alphas":alphas_
3104
3361
  }
3105
3362
  else: # "regression"
3106
3363
  results[name] = {
3107
3364
  "best_clf": gs.best_estimator_,
3108
3365
  "best_params": gs.best_params_,
3109
3366
  "scores": validation_scores, # e.g., neg_MSE, R², etc.
3110
- "predictions": y_pred.tolist(),
3367
+ "predictions": y_pred,#.tolist(),
3111
3368
  "predictions_proba": (
3112
3369
  y_pred_proba.tolist() if y_pred_proba is not None else None
3113
3370
  ),
3371
+ "features":share_col_names,
3372
+ "coef":coef_,
3373
+ "alphas":alphas_
3114
3374
  }
3115
3375
  else: # multi-classes
3116
3376
  if y_pred_proba is not None:
@@ -3149,20 +3409,26 @@ def predict(
3149
3409
  "roc_curve": roc_info,
3150
3410
  "pr_curve": pr_info,
3151
3411
  "confusion_matrix": confusion_matrix(y_true, y_pred),
3152
- "predictions": y_pred.tolist(),
3412
+ "predictions": y_pred,#.tolist(),
3153
3413
  "predictions_proba": (
3154
3414
  y_pred_proba.tolist() if y_pred_proba is not None else None
3155
3415
  ),
3416
+ "features":share_col_names,
3417
+ "coef":coef_,
3418
+ "alphas":alphas_
3156
3419
  }
3157
3420
  else: # "regression"
3158
3421
  results[name] = {
3159
3422
  "best_clf": gs.best_estimator_,
3160
3423
  "best_params": gs.best_params_,
3161
3424
  "scores": validation_scores, # e.g., neg_MSE, R², etc.
3162
- "predictions": y_pred.tolist(),
3425
+ "predictions": y_pred,#.tolist(),
3163
3426
  "predictions_proba": (
3164
3427
  y_pred_proba.tolist() if y_pred_proba is not None else None
3165
3428
  ),
3429
+ "features":share_col_names,
3430
+ "coef":coef_,
3431
+ "alphas":alphas_
3166
3432
  }
3167
3433
 
3168
3434
  else:
@@ -3181,23 +3447,32 @@ def predict(
3181
3447
  "best_clf": gs.best_estimator_,
3182
3448
  "best_params": gs.best_params_,
3183
3449
  "scores": validation_scores,
3184
- "predictions": y_pred.tolist(),
3450
+ "predictions": y_pred,#.tolist(),
3185
3451
  "predictions_proba": (
3186
3452
  y_pred_proba.tolist() if y_pred_proba is not None else None
3187
3453
  ),
3454
+ "features":share_col_names,
3188
3455
  "y_train": y_train if y_train is not None else [],
3189
3456
  "y_true": y_true if y_true is not None else [],
3457
+ "coef":coef_,
3458
+ "alphas":alphas_
3190
3459
  }
3191
3460
 
3192
3461
  # Convert results to DataFrame
3193
3462
  df_results = pd.DataFrame.from_dict(results, orient="index")
3194
3463
  # sort
3195
- if y_true is not None and purpose == "classification":
3196
- df_scores = pd.DataFrame(
3197
- df_results["scores"].tolist(), index=df_results["scores"].index
3198
- ).sort_values(by="roc_auc", ascending=False)
3464
+ if y_true is not None:
3465
+ if purpose == "classification":
3466
+ df_scores = pd.DataFrame(
3467
+ df_results["scores"].tolist(), index=df_results["scores"].index
3468
+ ).sort_values(by="roc_auc", ascending=False)
3469
+ elif purpose=='regression':
3470
+ df_scores = rank_models_reg(
3471
+ pd.DataFrame(df_results["scores"].tolist(), index=df_results["scores"].index),
3472
+ ascending=False)
3199
3473
  df_results = df_results.loc[df_scores.index]
3200
3474
 
3475
+ if y_true is not None and purpose == "classification":
3201
3476
  if plot_:
3202
3477
  from datetime import datetime
3203
3478
 
@@ -3215,18 +3490,565 @@ def predict(
3215
3490
  plot.figsets(xangle=30)
3216
3491
  if dir_save:
3217
3492
  ips.figsave(dir_save + f"scores_clus{now_}.pdf")
3493
+ # if all([plot_, y_true is not None, purpose == "classification"]):
3494
+ # # try:
3495
+ # if len(models) > 3:
3496
+ # plot_validate_features(df_results, is_binary=is_binary)
3497
+ # else:
3498
+ # plot_validate_features_single(df_results, is_binary=is_binary)
3499
+ # if dir_save:
3500
+ # ips.figsave(dir_save + f"validate_features{now_}.pdf")
3501
+ # # except Exception as e:
3502
+ # # print(f"Error: 在画图的过程中出现了问题:{e}")
3503
+ if stack:
3504
+ #! stacking classifier/regressor
3505
+ from sklearn.metrics import make_scorer, accuracy_score
3506
+ from sklearn.model_selection import cross_val_score
3507
+
3508
+ #* cap n_top_models so it does not exceed the number of available models
3509
+ n_top_models = min(n_top_models, df_results.shape[0])
3510
+
3511
+ #* select the top-n ranked estimators
3512
+ models_selecte = select_top_models(models=list(df_results.index),
3513
+ categories=models_support[purpose],
3514
+ n_top_models=n_top_models,
3515
+ n_models_per_category=n_models_per_category)
3516
+ top_models = df_results.loc[models_selecte]["best_clf"]
3517
+ base_estimators = []
3518
+ for i, j in top_models.to_dict().items():
3519
+ base_estimators.append((i, j))
3520
+ if stacking_cv:
3521
+ print(f"⤵ stacking_cv is processing...")
3522
+ #* define a few candidate final_estimators
3523
+ # a few candidate options
3524
+ if purpose == "classification":
3525
+ kadt_estimators=["XGBoost","SVM","Logistic Regression","Neural Network"]
3526
+ else:
3527
+ kadt_estimators=["XGBoost","LassoCV"]
3528
+ final_estimators={}
3529
+ for name in kadt_estimators:
3530
+ param_grid=param_grids.get(name, {})
3531
+ print(param_grid)
3532
+ if is_binary:
3533
+ gs = GridSearchCV(
3534
+ model_[name],
3535
+ param_grid=param_grid,
3536
+ scoring=(
3537
+ "roc_auc"
3538
+ if purpose == "classification"
3539
+ else "neg_mean_squared_error"
3540
+ ),
3541
+ cv=cv,
3542
+ n_jobs=n_jobs,
3543
+ verbose=verbose,
3544
+ )
3545
+ else:
3546
+ gs = GridSearchCV(
3547
+ model_[name],
3548
+ param_grid=param_grid,
3549
+ scoring=(
3550
+ "roc_auc_ovr"
3551
+ if purpose == "classification"
3552
+ else "neg_mean_squared_error"
3553
+ ),
3554
+ cv=cv,
3555
+ n_jobs=n_jobs,
3556
+ verbose=verbose,
3557
+ )
3558
+ # Fit GridSearchCV
3559
+ gs.fit(x_train, y_train)
3560
+ final_estimators[name]=gs.best_estimator_
3561
+
3562
+ #* Set up cross-validation and performance evaluation
3563
+ scorer = make_scorer(accuracy_score)
3564
+ cv_results = []
3565
+
3566
+ #*Cross-validate stacking models with different final estimators
3567
+ for final_name, final_estimator in final_estimators.items():
3568
+ print(f"Evaluating Stacking Classifier with {final_name} as final estimator...")
3569
+ if purpose == "classification":
3570
+ stacking_model = StackingClassifier(estimators=base_estimators, final_estimator=final_estimator,cv=cv)
3571
+ else:
3572
+ stacking_model = StackingRegressor(estimators=base_estimators, final_estimator=final_estimator, cv=cv)
3573
+
3574
+ scores = cross_val_score(stacking_model, x_train, y_train, cv=StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_state), scoring=scorer)
3575
+
3576
+ # Store the result
3577
+ cv_results.append({
3578
+ 'final_estimator':final_estimator,
3579
+ 'Final Estimator': final_name,
3580
+ 'Mean Accuracy': np.mean(scores),
3581
+ 'Standard Deviation': np.std(scores)
3582
+ })
3583
+
3584
+ #* Convert the results into a DataFrame for easy comparison
3585
+ cv_results_df = pd.DataFrame(cv_results)
3586
+
3587
+ #* Sort and display the best model
3588
+ cv_results_df = cv_results_df.sort_values(by='Mean Accuracy', ascending=False)
3589
+
3590
+
3591
+ # Optionally: Select the final estimator that gives the best performance
3592
+ best_final_estimator = cv_results_df.iloc[0]['final_estimator']
3593
+ print(f"Best final estimator based on cross-validation: {best_final_estimator}")
3594
+ else:
3595
+ print(f"⤵ trying to find the best_final_estimator for stacking...")
3596
+ if purpose=="classification":
3597
+ best_final_estimator = LogisticRegression(class_weight=class_weight,
3598
+ random_state=random_state,
3599
+ max_iter=1000)
3600
+ else:
3601
+ best_final_estimator = RidgeCV(cv=5)
3602
+ print(f"⤵ the best best_final_estimator: {best_final_estimator}")
3603
+ #! apply stacking
3604
+ if purpose == "classification":
3605
+ print(f"⤵ StackingClassifier...")
3606
+ stacking_model = StackingClassifier(estimators=base_estimators,
3607
+ final_estimator=best_final_estimator,
3608
+ cv=cv)
3609
+ else:
3610
+ print(f"⤵ StackingRegressor...")
3611
+ stacking_model = StackingRegressor(estimators=base_estimators,
3612
+ final_estimator=best_final_estimator,
3613
+ cv=cv)
3614
+
3615
+ # Train the Stacking Classifier
3616
+ print(f"⤵ fit & predict...")
3617
+ stacking_model.fit(x_train, y_train)
3618
+ y_pred_final = stacking_model.predict(x_true)
3619
+ print(f"⤵ collecting results...")
3620
+ # pred_proba
3621
+ if is_binary:
3622
+ if hasattr(stacking_model, "predict_proba"):
3623
+ y_pred_proba_final = stacking_model.predict_proba(x_true)
3624
+ if y_pred_proba_final.shape[1] == 1:
3625
+ y_pred_proba_final = np.hstack(
3626
+ [1 - y_pred_proba_final, y_pred_proba_final]
3627
+ ) # Add missing class probabilities
3628
+ y_pred_proba_final = y_pred_proba_final[:, 1]
3629
+ elif hasattr(stacking_model, "decision_function"):
3630
+ # If predict_proba is not available, use decision_function (e.g., for SVM)
3631
+ y_pred_proba_final = stacking_model.decision_function(x_true)
3632
+ # Ensure y_pred_proba_final is within 0 and 1 bounds
3633
+ y_pred_proba_final = (y_pred_proba_final - y_pred_proba_final.min()) / (
3634
+ y_pred_proba_final.max() - y_pred_proba_final.min()
3635
+ )
3636
+ else:
3637
+ y_pred_proba_final = None # No probability output for certain models
3638
+ # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
3639
+ if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
3640
+ if hasattr(best_clf, "alphas_"):
3641
+ alphas_ = best_clf.alphas_
3642
+ elif hasattr(best_clf, "alpha_"):
3643
+ alphas_ = best_clf.alpha_
3644
+ elif hasattr(best_clf, "Cs_"):
3645
+ alphas_ = best_clf.Cs_
3646
+ else:
3647
+ alphas_= None
3648
+ coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
3649
+ if not is_binary:
3650
+ # Handle prediction probabilities for multiclass
3651
+ if hasattr(stacking_model, "predict_proba"):
3652
+ y_pred_proba_final = stacking_model.predict_proba(x_true)
3653
+ elif hasattr(stacking_model, "decision_function"):
3654
+ y_pred_proba_final = stacking_model.decision_function(x_true)
3655
+
3656
+ # Normalize for multiclass if necessary
3657
+ if y_pred_proba_final.ndim == 2:
3658
+ y_pred_proba_final = (
3659
+ y_pred_proba_final - y_pred_proba_final.min(axis=1, keepdims=True)
3660
+ ) / (
3661
+ y_pred_proba_final.max(axis=1, keepdims=True)
3662
+ - y_pred_proba_final.min(axis=1, keepdims=True)
3663
+ )
3664
+ else:
3665
+ y_pred_proba_final = None # No probability output for certain models
3666
+ # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
3667
+ if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
3668
+ if hasattr(best_clf, "alphas_"):
3669
+ alphas_ = best_clf.alphas_
3670
+ elif hasattr(best_clf, "alpha_"):
3671
+ alphas_ = best_clf.alpha_
3672
+ elif hasattr(best_clf, "Cs_"):
3673
+ alphas_ = best_clf.Cs_
3674
+ else:
3675
+ alphas_= None
3676
+ coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
3677
+ #! dict_pred_stack
3678
+ dict_pred_stack={}
3679
+ validation_scores_final = {}
3680
+ if y_true is not None and y_pred_proba_final is not None:
3681
+ validation_scores_final = cal_metrics(
3682
+ y_true,
3683
+ y_pred_final,
3684
+ y_pred_proba=y_pred_proba_final,
3685
+ is_binary=is_binary,
3686
+ purpose=purpose,
3687
+ average="weighted",
3688
+ )
3689
+ if is_binary:
3690
+ # Calculate ROC curve
3691
+ # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
3692
+ if y_pred_proba_final is not None:
3693
+ fpr, tpr, _ = roc_curve(y_true, y_pred_proba_final)
3694
+ lower_ci, upper_ci = cal_auc_ci(
3695
+ y_true, y_pred_proba_final, verbose=False, is_binary=is_binary
3696
+ )
3697
+ roc_auc = auc(fpr, tpr)
3698
+ roc_info = {
3699
+ "fpr": fpr.tolist(),
3700
+ "tpr": tpr.tolist(),
3701
+ "auc": roc_auc,
3702
+ "ci95": (lower_ci, upper_ci),
3703
+ }
3704
+ # precision-recall curve
3705
+ precision_, recall_, _ = cal_precision_recall(y_true, y_pred_proba_final)
3706
+ avg_precision_ = average_precision_score(y_true, y_pred_proba_final)
3707
+ pr_info = {
3708
+ "precision": precision_,
3709
+ "recall": recall_,
3710
+ "avg_precision": avg_precision_,
3711
+ }
3712
+ else:
3713
+ roc_info, pr_info = None, None
3714
+ if purpose == "classification":
3715
+ dict_pred_stack = {
3716
+ "best_clf": stacking_model,
3717
+ "best_params": None,
3718
+ "auc_indiv": None,
3719
+ "scores": validation_scores_final,
3720
+ "roc_curve": roc_info,
3721
+ "pr_curve": pr_info,
3722
+ "confusion_matrix": confusion_matrix(y_true, y_pred_final),
3723
+ "predictions": y_pred_final.tolist(),
3724
+ "predictions_proba": (
3725
+ y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
3726
+ ),
3727
+ "features":share_col_names,
3728
+ "coef":coef_,
3729
+ "alphas":alphas_
3730
+ }
3731
+ else: # "regression"
3732
+ dict_pred_stack = {
3733
+ "best_clf": stacking_model,
3734
+ "best_params": None,
3735
+ "scores": validation_scores_final, # e.g., neg_MSE, R², etc.
3736
+ "predictions": y_pred_final.tolist(),
3737
+ "predictions_proba": (
3738
+ y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
3739
+ ),
3740
+ "features":share_col_names,
3741
+ "coef":coef_,
3742
+ "alphas":alphas_
3743
+ }
3744
+ else: # multi-classes
3745
+ if y_pred_proba_final is not None:
3746
+ # fpr, tpr, roc_auc = dict(), dict(), dict()
3747
+ # fpr, tpr, _ = roc_curve(y_true, y_pred_proba_final)
3748
+ confidence_intervals = cal_auc_ci(
3749
+ y_true, y_pred_proba_final, verbose=False, is_binary=is_binary
3750
+ )
3751
+ roc_info = {
3752
+ "fpr": validation_scores_final["fpr"],
3753
+ "tpr": validation_scores_final["tpr"],
3754
+ "auc": validation_scores_final["roc_auc_by_class"],
3755
+ "ci95": confidence_intervals,
3756
+ }
3757
+ # precision-recall curve
3758
+ precision_, recall_, avg_precision_ = cal_precision_recall(
3759
+ y_true, y_pred_proba_final, is_binary=is_binary
3760
+ )
3761
+ pr_info = {
3762
+ "precision": precision_,
3763
+ "recall": recall_,
3764
+ "avg_precision": avg_precision_,
3765
+ }
3766
+ else:
3767
+ roc_info, pr_info = None, None
3768
+
3769
+ if purpose == "classification":
3770
+ dict_pred_stack = {
3771
+ "best_clf": stacking_model,
3772
+ "best_params": None,
3773
+ "auc_indiv": None,
3774
+ "scores": validation_scores_final,
3775
+ "roc_curve": roc_info,
3776
+ "pr_curve": pr_info,
3777
+ "confusion_matrix": confusion_matrix(y_true, y_pred_final),
3778
+ "predictions": y_pred_final.tolist(),
3779
+ "predictions_proba": (
3780
+ y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
3781
+ ),
3782
+ "features":share_col_names,
3783
+ "coef":coef_,
3784
+ "alphas":alphas_
3785
+ }
3786
+ else: # "regression"
3787
+ dict_pred_stack = {
3788
+ "best_clf": stacking_model,
3789
+ "best_params": None,
3790
+ "scores": validation_scores_final, # e.g., neg_MSE, R², etc.
3791
+ "predictions": y_pred_final.tolist(),
3792
+ "predictions_proba": (
3793
+ y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
3794
+ ),
3795
+ "features":share_col_names,
3796
+ "coef":coef_,
3797
+ "alphas":alphas_
3798
+ }
3799
+
3800
+ else:
3801
+ if y_true is None:
3802
+ validation_scores_final = []
3803
+ else:
3804
+ validation_scores_final = cal_metrics(
3805
+ y_true,
3806
+ y_pred,
3807
+ y_pred_proba=y_pred_proba_final,
3808
+ is_binary=is_binary,
3809
+ purpose=purpose,
3810
+ average="weighted",
3811
+ )
3812
+ dict_pred_stack = {
3813
+ "best_clf": stacking_model,
3814
+ "best_params": None,
3815
+ "scores": validation_scores_final,
3816
+ "predictions": y_pred_final.tolist(),
3817
+ "predictions_proba": (
3818
+ y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
3819
+ ),
3820
+ "features":share_col_names,
3821
+ "y_train": y_train if y_train is not None else [],
3822
+ "y_true": y_true if y_true is not None else [],
3823
+ "coef":coef_,
3824
+ "alphas":alphas_
3825
+ }
3826
+ # merge together
3827
+ df_pred = pd.DataFrame(
3828
+ [None] * len(df_results.columns), index=df_results.columns, columns=["stack"]).T
3829
+ for k, v in dict_pred_stack.items():
3830
+ if k in df_pred.columns:
3831
+ df_pred[k] = [v]
3832
+
3833
+ # # plot the stacking
3834
+ # if all([plot_, y_true is not None, purpose == "classification"]):
3835
+ # plot_validate_features_single(df_pred, is_binary=is_binary)
3836
+ # if dir_save:
3837
+ # ips.figsave(dir_save + f"validate_features_stacking_{now_}.pdf")
3838
+ if vote:
3839
+ print(f"⤵ voting...")
3840
+ from sklearn.ensemble import VotingClassifier, VotingRegressor
3841
+ #! voting
3842
+ n_top_models = min(n_top_models, df_results.shape[0])
3843
+ base_estimators=[]
3844
+ for name, cls in zip(list(df_results.iloc[:n_top_models, :].index),df_results.iloc[:n_top_models, :]["best_clf"].tolist()):
3845
+ base_estimators.append((name,cls))
3846
+ # Apply Voting Classifier/Regressor
3847
+ if purpose == "classification":
3848
+ print(f"⤵ VotingClassifier...via{voting}")
3849
+ if voting=='hard':
3850
+ # Hard voting does not support `predict_proba`
3851
+ voting_model = VotingClassifier(estimators=base_estimators)
3852
+ else:
3853
+ # Soft voting supports `predict_proba`
3854
+ voting_model = VotingClassifier(estimators=base_estimators, voting="soft")
3855
+ else:
3856
+ print(f"⤵ VotingRegressor...")
3857
+ voting_model = VotingRegressor(estimators=base_estimators)
3858
+
3859
+ # Train the Voting Classifier/Regressor
3860
+ try:
3861
+ voting_model.fit(x_train, y_train)
3862
+ y_pred_vote = voting_model.predict(x_true)
3863
+ except Exception as e:
3864
+ if purpose == "classification" and not voting=='hard':
3865
+ voting_model = VotingClassifier(estimators=base_estimators)
3866
+ voting_model.fit(x_train, y_train)
3867
+ y_pred_vote = voting_model.predict(x_true)
3868
+
3869
+ # Calculate predicted probabilities if applicable
3870
+ if purpose == "classification":
3871
+ if hasattr(voting_model, "predict_proba"):
3872
+ y_pred_proba_vote = voting_model.predict_proba(x_true)
3873
+ print("Shape of predicted probabilities:", y_pred_proba_vote.shape)
3874
+ if y_pred_proba_vote.shape[1] == 1:
3875
+ y_pred_proba_vote = np.hstack(
3876
+ [1 - y_pred_proba_vote, y_pred_proba_vote]
3877
+ ) # Add missing class probabilities
3878
+ y_pred_proba_vote = y_pred_proba_vote[:, 1]
3879
+ else:
3880
+ y_pred_proba_vote = None
3881
+
3882
+ # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
3883
+ if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
3884
+ if hasattr(best_clf, "alphas_"):
3885
+ alphas_ = best_clf.alphas_
3886
+ elif hasattr(best_clf, "alpha_"):
3887
+ alphas_ = best_clf.alpha_
3888
+ elif hasattr(best_clf, "Cs_"):
3889
+ alphas_ = best_clf.Cs_
3890
+ else:
3891
+ alphas_= None
3892
+ coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
3893
+ else: # Regression
3894
+ y_pred_proba_vote = None
3895
+ coef_,alphas_=None,None
3896
+
3897
+ print(f"⤵ collecting voting results...")
3898
+ #! dict_pred_vote
3899
+ dict_pred_vote = {}
3900
+ validation_scores_vote = {}
3901
+ if y_true is not None and y_pred_proba_vote is not None:
3902
+ validation_scores_vote = cal_metrics(
3903
+ y_true,
3904
+ y_pred_vote,
3905
+ y_pred_proba=y_pred_proba_vote,
3906
+ is_binary=is_binary,
3907
+ purpose=purpose,
3908
+ average="weighted",
3909
+ )
3910
+
3911
+ if is_binary:
3912
+ if y_pred_proba_vote is not None:
3913
+ fpr, tpr, _ = roc_curve(y_true, y_pred_proba_vote)
3914
+ lower_ci, upper_ci = cal_auc_ci(
3915
+ y_true, y_pred_proba_vote, verbose=False, is_binary=is_binary
3916
+ )
3917
+ roc_auc = auc(fpr, tpr)
3918
+ roc_info = {
3919
+ "fpr": fpr.tolist(),
3920
+ "tpr": tpr.tolist(),
3921
+ "auc": roc_auc,
3922
+ "ci95": (lower_ci, upper_ci),
3923
+ }
3924
+ precision_, recall_, _ = cal_precision_recall(y_true, y_pred_proba_vote)
3925
+ avg_precision_ = average_precision_score(y_true, y_pred_proba_vote)
3926
+ pr_info = {
3927
+ "precision": precision_,
3928
+ "recall": recall_,
3929
+ "avg_precision": avg_precision_,
3930
+ }
3931
+ else:
3932
+ roc_info, pr_info = None, None
3933
+
3934
+ dict_pred_vote = {
3935
+ "best_clf": voting_model,
3936
+ "best_params": None,
3937
+ "auc_indiv": None,
3938
+ "scores": validation_scores_vote,
3939
+ "roc_curve": roc_info,
3940
+ "pr_curve": pr_info,
3941
+ "confusion_matrix": confusion_matrix(y_true, y_pred_vote),
3942
+ "predictions": y_pred_vote.tolist(),
3943
+ "predictions_proba": (
3944
+ y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
3945
+ ),
3946
+ "features":share_col_names,
3947
+ "coef":coef_,
3948
+ "alphas":alphas_
3949
+ }
3950
+ else: # Multi-class
3951
+ if y_pred_proba_vote is not None:
3952
+ confidence_intervals = cal_auc_ci(
3953
+ y_true, y_pred_proba_vote, verbose=False, is_binary=is_binary
3954
+ )
3955
+ roc_info = {
3956
+ "fpr": validation_scores_vote["fpr"],
3957
+ "tpr": validation_scores_vote["tpr"],
3958
+ "auc": validation_scores_vote["roc_auc_by_class"],
3959
+ "ci95": confidence_intervals,
3960
+ }
3961
+ precision_, recall_, avg_precision_ = cal_precision_recall(
3962
+ y_true, y_pred_proba_vote, is_binary=is_binary
3963
+ )
3964
+ pr_info = {
3965
+ "precision": precision_,
3966
+ "recall": recall_,
3967
+ "avg_precision": avg_precision_,
3968
+ }
3969
+ else:
3970
+ roc_info, pr_info = None, None
3971
+
3972
+ dict_pred_vote = {
3973
+ "best_clf": voting_model,
3974
+ "best_params": None,
3975
+ "scores": validation_scores_vote,
3976
+ "roc_curve": roc_info,
3977
+ "pr_curve": pr_info,
3978
+ "confusion_matrix": confusion_matrix(y_true, y_pred_vote),
3979
+ "predictions": y_pred_vote.tolist(),
3980
+ "predictions_proba": (
3981
+ y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
3982
+ ),
3983
+ "features":share_col_names,
3984
+ "coef":coef_,
3985
+ "alphas":alphas_
3986
+ }
3987
+ else:
3988
+ if y_true is None:
3989
+ validation_scores_vote = []
3990
+ else:
3991
+ validation_scores_vote = cal_metrics(
3992
+ y_true,
3993
+ y_pred,
3994
+ y_pred_proba=y_pred_proba_vote,
3995
+ is_binary=is_binary,
3996
+ purpose=purpose,
3997
+ average="weighted",
3998
+ )
3999
+ dict_pred_vote = {
4000
+ "best_clf": voting_model,
4001
+ "best_params": None,
4002
+ "scores": validation_scores_vote,
4003
+ "predictions": y_pred_vote.tolist(),
4004
+ "predictions_proba": (
4005
+ y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
4006
+ ),
4007
+ "features":share_col_names,
4008
+ "y_train": y_train if y_train is not None else [],
4009
+ "y_true": y_true if y_true is not None else [],
4010
+ }
4011
+ df_vote = pd.DataFrame(
4012
+ [None] * len(df_results.columns), index=df_results.columns, columns=["vote"]).T
4013
+ for k, v in dict_pred_vote.items():
4014
+ if k in df_vote.columns:
4015
+ df_vote[k] = [v]
4016
+
4017
+ # if all([plot_, y_true is not None, purpose == "classification"]):
4018
+ # try:
4019
+ # plot_validate_features_single(df_vote, is_binary=is_binary)
4020
+ # if dir_save:
4021
+ # ips.figsave(dir_save + f"validate_features_vote_{now_}.pdf")
4022
+ # except Exception as e:
4023
+ # print(e)
4024
+ print("Done")
4025
+ if vote and stack:
4026
+ df_res=pd.concat([df_pred,df_vote, df_results],ignore_index=False,axis=0)
4027
+ elif vote:
4028
+ df_res=pd.concat([df_vote, df_results],ignore_index=False,axis=0)
4029
+ elif stack:
4030
+ df_res=pd.concat([df_pred,df_results],ignore_index=False,axis=0)
4031
+
3218
4032
  if all([plot_, y_true is not None, purpose == "classification"]):
4033
+ from datetime import datetime
4034
+
4035
+ now_ = datetime.now().strftime("%y%m%d_%H%M%S")
3219
4036
  # try:
3220
- if len(models) > 3:
3221
- plot_validate_features(df_results, is_binary=is_binary)
4037
+ if df_res.shape[0] > 3:
4038
+ try:
4039
+ plot_validate_features(df_res, is_binary=is_binary)
4040
+ except Exception as e:
4041
+ print(e)
3222
4042
  else:
3223
- plot_validate_features_single(df_results, is_binary=is_binary)
4043
+ try:
4044
+ plot_validate_features_single(df_res, is_binary=is_binary)
4045
+ except Exception as e:
4046
+ print(e)
3224
4047
  if dir_save:
3225
4048
  ips.figsave(dir_save + f"validate_features{now_}.pdf")
3226
- # except Exception as e:
3227
- # print(f"Error: 在画图的过程中出现了问题:{e}")
3228
- return df_results
3229
-
4049
+ # except Exception as e:
4050
+ # print(f"Error: 在画图的过程中出现了问题:{e}")
4051
+ return df_res
3230
4052
 
3231
4053
  def cal_metrics(
3232
4054
  y_true,
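Condensed, the stacking and voting paths added in this hunk reduce to the standard scikit-learn ensembles built from the top-ranked fitted estimators. A minimal sketch with placeholder base learners, not the function's exact wiring:

from sklearn.ensemble import RandomForestClassifier, StackingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

base_estimators = [("rf", RandomForestClassifier()), ("svm", SVC(probability=True))]
stacker = StackingClassifier(estimators=base_estimators,
                             final_estimator=LogisticRegression(max_iter=1000), cv=5)
voter = VotingClassifier(estimators=base_estimators, voting="soft")
# stacker.fit(x_train, y_train); y_stack = stacker.predict(x_true)
# voter.fit(x_train, y_train);   y_vote = voter.predict(x_true)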
@@ -3368,7 +4190,7 @@ def cal_metrics(


  def plot_trees(
- X, y, cls, max_trees=500, test_size=0.2, random_state=42, early_stopping_rounds=None
+ X, y, cls:str='random', max_trees=500, test_size=0.2, random_state=42, early_stopping_rounds=None
  ):
  """
  # # Example usage:
@@ -3414,10 +4236,14 @@ def plot_trees(
  train_error_rate = []
  test_error_rate = []
  validation_error = None
-
+ if isinstance(cls, str):
+ cls=ips.strcmp(cls, ["RandomForestClassifier","ExtraTreesClassifier","AdaBoostClassifier","GradientBoostingClassifier"])
  # Configure classifier based on type
  oob_enabled = False # Default to no OOB error unless explicitly set
-
+ clf_support = {"RandomForestClassifier":RandomForestClassifier(),
+ "ExtraTreesClassifier":ExtraTreesClassifier(),
+ "AdaBoostClassifier":AdaBoostClassifier(),
+ "GradientBoostingClassifier":GradientBoostingClassifier()}
  if isinstance(cls, (RandomForestClassifier, ExtraTreesClassifier)):
  # Enable OOB if cls supports it and is using bootstrapping
  cls.set_params(warm_start=True, n_estimators=1)
@@ -3679,7 +4505,7 @@ def img_datasets_preprocessing(


  def backward_regression(
- X: pd.DataFrame, y: pd.Series, initial_list=[], threshold_out=0.05, verbose=True
+ X: pd.DataFrame, y: pd.Series, initial_list=[], thr=0.05, verbose=True
  ):
  """
  # awesome bit of code from https://www.kaggle.com/code/adibouayjan/house-price-step-by-step-modeling
@@ -3691,31 +4517,46 @@ def backward_regression(
  X -- features values
  y -- target variable
  initial_list -- features header
- threshold_out -- pvalue threshold of features to drop
+ thr -- pvalue threshold of features to drop
  verbose -- true to produce lots of logging output

  Returns:
  list of selected features for modeling
  """
  import statsmodels.api as sm
-
- if isinstance(y, str) and y in X.columns:
- y_col_name = y
- y = X[y]
- X = X.drop(y_col_name, axis=1)
+ if isinstance(y, str):
+ if y in X.columns:
+ y_col_name = y
+ y = X[y]
+ X = X.drop(y_col_name, axis=1)
+ else:
+ raise ValueError(f"cannot find {y}; the y argument is set incorrectly")
+ X = X.select_dtypes(include=[np.number])
+
  included = list(X.columns)
+ try:
+ X=X.astype(float)
+ y=y.astype(float)
+ except Exception as e:
+ raise ValueError(f"could not convert the data to float, so the statistical analysis cannot proceed: {e}")
+
+
  while True:
  changed = False
+ if not included:
+ print("No features remain in the model.")
+ break
+
  model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
  # exclude the intercept for p-value checking
  pvalues = model.pvalues.iloc[1:]
  worst_pval = pvalues.max()
- if worst_pval > threshold_out:
+ if worst_pval > thr:
  changed = True
  worst_feature = pvalues.idxmax()
  included.remove(worst_feature)
  if verbose:
- print(f"Removing feature '{worst_feature}' with p-value {worst_pval}")
+ print(f"Removing '{worst_feature}' with p-value={round(worst_pval,2)}")
  if not changed:
  break
  print(f"\nSelected Features:\n{included}")