py2ls 0.2.4.23__py3-none-any.whl → 0.2.4.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/ml2ls.py CHANGED
@@ -5,7 +5,6 @@ from sklearn.ensemble import (
5
5
  BaggingClassifier,
6
6
  )
7
7
  from sklearn.svm import SVC, SVR
8
- from sklearn.calibration import CalibratedClassifierCV
9
8
  from sklearn.model_selection import GridSearchCV, StratifiedKFold
10
9
  from sklearn.linear_model import (
11
10
  LassoCV,
@@ -16,12 +15,7 @@ from sklearn.linear_model import (
16
15
  RidgeClassifierCV,
17
16
  ElasticNet,
18
17
  )
19
- from sklearn.feature_selection import RFE
20
- from sklearn.naive_bayes import GaussianNB
21
- from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
22
- import xgboost as xgb # Make sure you have xgboost installed
23
-
24
- from sklearn.model_selection import train_test_split, cross_val_score
18
+
25
19
  from sklearn.metrics import (
26
20
  accuracy_score,
27
21
  precision_score,
@@ -36,18 +30,12 @@ from sklearn.metrics import (
36
30
  precision_recall_curve,
37
31
  average_precision_score,
38
32
  )
39
- from imblearn.over_sampling import SMOTE
40
- from sklearn.pipeline import Pipeline
41
- from collections import defaultdict
42
- from sklearn.preprocessing import StandardScaler, OneHotEncoder
43
33
  from typing import Dict, Any, Optional, List, Union
44
34
  import numpy as np
45
35
  import pandas as pd
46
36
  from . import ips
47
37
  from . import plot
48
38
  import matplotlib.pyplot as plt
49
- import seaborn as sns
50
-
51
39
  plt.style.use(str(ips.get_cwd()) + "/data/styles/stylelib/paper.mplstyle")
52
40
  import logging
53
41
  import warnings
@@ -314,6 +302,8 @@ def features_svm(
314
302
  - Use case: It’s not as widely used as the RBF or linear kernel but can be explored when there is some evidence of non-linear
315
303
  S-shaped relationships.
316
304
  """
305
+ from sklearn.feature_selection import RFE
306
+ from sklearn.svm import SVC
317
307
  # SVM (Support Vector Machines)
318
308
  svc = SVC(kernel=rfe_params["kernel"]) # ["linear", "rbf", "poly", "sigmoid"]
319
309
  # RFE(Recursive Feature Elimination)
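The hunk above makes the RFE and SVC imports local to features_svm, which builds an SVC and then runs recursive feature elimination on it. As a hedged, standalone sketch of that pattern (synthetic data and plain scikit-learn only, not the package's own helpers), note that RFE needs an estimator exposing coef_ or feature_importances_, which is why a linear kernel is used here:

```python
# Illustrative sketch only: synthetic data, plain scikit-learn, not py2ls internals.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.svm import SVC

X, y = make_classification(n_samples=200, n_features=20, n_informative=5, random_state=0)

# RFE requires coef_ or feature_importances_, so use a linear-kernel SVC.
svc = SVC(kernel="linear")
rfe = RFE(estimator=svc, n_features_to_select=5)
rfe.fit(X, y)

selected = np.where(rfe.support_)[0]
print("selected feature indices:", selected)
```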
@@ -450,6 +440,7 @@ def validate_classifier(
450
440
  Returns:
451
441
  - results: Dictionary containing average cv_train_scores and cv_test_scores.
452
442
  """
443
+ from sklearn.model_selection import cross_val_score
453
444
  cv_train_scores = {metric: [] for metric in metrics}
454
445
  skf = StratifiedKFold(n_splits=cv_folds)
455
446
  # Perform cross-validation
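validate_classifier now imports cross_val_score locally and then loops over metrics with a StratifiedKFold splitter. A minimal sketch of the underlying pattern, independent of the package's wrapper (data and estimator here are placeholders):

```python
# Minimal sketch of the cross-validation loop that validate_classifier wraps.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

X, y = make_classification(n_samples=300, random_state=0)
clf = RandomForestClassifier(random_state=0)
skf = StratifiedKFold(n_splits=5)

for metric in ["accuracy", "precision", "recall", "f1", "roc_auc"]:
    scores = cross_val_score(clf, X, y, cv=skf, scoring=metric)
    print(f"{metric}: {scores.mean():.3f} +/- {scores.std():.3f}")
```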
@@ -982,6 +973,8 @@ def validate_features(
982
973
 
983
974
  """
984
975
  from tqdm import tqdm
976
+ from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
977
+ from sklearn.calibration import CalibratedClassifierCV
985
978
 
986
979
  # Ensure common features are selected
987
980
  common_features = ips.shared(
@@ -1001,6 +994,7 @@ def validate_features(
1001
994
 
1002
995
  # Handle class imbalance using SMOTE
1003
996
  if smote:
997
+ from imblearn.over_sampling import SMOTE
1004
998
  if (
1005
999
  y_train.value_counts(normalize=True).max() < 0.8
1006
1000
  ): # Threshold to decide if data is imbalanced
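The SMOTE import now happens lazily inside the smote branch shown above, and the package compares the majority-class share against 0.8 to decide whether the data counts as imbalanced. A hedged sketch of the oversampling step (the threshold direction here is the usual reading; the hunk does not show the full branch body):

```python
# Sketch of the SMOTE step: oversample when one class dominates the training labels.
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=500, weights=[0.9, 0.1], random_state=0)
y = pd.Series(y)

if y.value_counts(normalize=True).max() >= 0.8:  # data looks imbalanced
    X_res, y_res = SMOTE(random_state=0).fit_resample(X, y)
else:
    X_res, y_res = X, y
print(pd.Series(y_res).value_counts())
```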
@@ -2096,19 +2090,136 @@ def rank_models(
2096
2090
  # )
2097
2091
 
2098
2092
  # figsave("classifier_performance.pdf")
2093
+ def rank_models_reg(df, ascending=False):
2094
+ """
2095
+ Sorts models based on MSE, RMSE, MAE, and R² with custom priority logic.
2099
2096
 
2097
+ Parameters:
2098
+ df (pd.DataFrame): DataFrame containing the regression metrics.
2099
+ ascending (bool): Whether to sort in ascending order of ranking score.
2100
+
2101
+ Returns:
2102
+ pd.DataFrame: Sorted DataFrame with an added "Ranking_Score" column.
2103
+ """
2104
+ # Define weights for the 4 metrics
2105
+ weights = {
2106
+ "mse": -1, # Lower is better
2107
+ "rmse": -1, # Lower is better
2108
+ "mae": -1, # Lower is better
2109
+ "r2": 1, # Higher is better
2110
+ }
2111
+
2112
+ # Normalize the selected metrics
2113
+ df = df.copy() # Work on a copy of the DataFrame
2114
+ for metric, weight in weights.items():
2115
+ if metric in df.columns:
2116
+ if weight > 0: # Higher is better; normalize 0-1
2117
+ df[metric + "_normalized"] = (df[metric] - df[metric].min()) / (
2118
+ df[metric].max() - df[metric].min()
2119
+ )
2120
+ else: # Lower is better; reverse normalize 0-1
2121
+ df[metric + "_normalized"] = (df[metric].max() - df[metric]) / (
2122
+ df[metric].max() - df[metric].min()
2123
+ )
2124
+
2125
+ # Calculate ranking score as a weighted sum
2126
+ df["Ranking_Score"] = sum(
2127
+ df[metric + "_normalized"] * abs(weights[metric])
2128
+ for metric in weights.keys()
2129
+ if metric + "_normalized" in df.columns
2130
+ )
2131
+
2132
+ # Sort models based on the ranking score
2133
+ sorted_df = df.sort_values(by="Ranking_Score", ascending=ascending)
2134
+ return sorted_df
2135
+
2136
+ models_support = {
2137
+ "classification": {
2138
+ "Random Forest": "Tree-Based",
2139
+ "SVM": "Kernel-Based",
2140
+ "Logistic Regression": "Linear",
2141
+ "Lasso Logistic Regression": "Linear",
2142
+ "Gradient Boosting": "Tree-Based",
2143
+ "XGBoost": "Tree-Based",
2144
+ "KNN": "Instance-Based",
2145
+ "Naive Bayes": "Probabilistic",
2146
+ "Linear Discriminant Analysis": "Linear",
2147
+ "AdaBoost": "Tree-Based",
2148
+ "CatBoost": "Tree-Based",
2149
+ "Extra Trees": "Tree-Based",
2150
+ "Bagging": "Tree-Based",
2151
+ "Neural Network": "Neural Network",
2152
+ "DecisionTree": "Tree-Based",
2153
+ "Quadratic Discriminant Analysis": "Probabilistic",
2154
+ "Ridge": "Linear",
2155
+ "Perceptron": "Linear",
2156
+ "Bernoulli Naive Bayes": "Probabilistic",
2157
+ "SGDClassifier": "Linear",
2158
+ },
2159
+ "regression": {
2160
+ "Linear Regression": "Linear",
2161
+ "Ridge": "Linear",
2162
+ "RidgeCV": "Linear",
2163
+ "TheilSenRegressor": "Linear",
2164
+ "HuberRegressor": "Linear",
2165
+ "PoissonRegressor": "Linear",
2166
+ "LassoCV": "Linear",
2167
+ "Bagging": "Tree-Based",
2168
+ "ElasticNet": "Linear",
2169
+ "Random Forest": "Tree-Based",
2170
+ "Gradient Boosting": "Tree-Based",
2171
+ "XGBoost": "Tree-Based",
2172
+ "CatBoost": "Tree-Based",
2173
+ "Extra Trees": "Tree-Based",
2174
+ "SVM": "Kernel-Based",
2175
+ "KNN": "Instance-Based",
2176
+ "Neural Network": "Neural Network",
2177
+ "AdaBoost": "Linear",
2178
+ },
2179
+ }
2180
+ def select_top_models(models, categories, n_top_models, n_models_per_category=1):
2181
+ """
2182
+ models = list_sort
2183
+ purpose = "regression"
2184
+ categories = models_support[purpose]
2185
+ n_top_models = 3
2186
+ select_top_models(models, categories, n_top_models)
2187
+ """
2188
+ selected = {}
2189
+ result = []
2190
+ for model in models:
2191
+ category = categories.get(model, "Unknown")
2192
+ if category not in selected:
2193
+ selected[category] = 0 # Initialize counter for the category
2194
+
2195
+ if selected[category] < n_models_per_category: # Allow additional models up to the limit
2196
+ selected[category] += 1
2197
+ result.append(model)
2198
+
2199
+ if len(result) == n_top_models: # Stop when the desired number of models is reached
2200
+ break
2201
+
2202
+ return result
2100
2203
 
2101
2204
  def predict(
2102
2205
  x_train: pd.DataFrame,
2103
2206
  y_train: pd.Series,
2104
2207
  x_true: pd.DataFrame = None,
2105
2208
  y_true: Optional[pd.Series] = None,
2209
+ fill_missing:bool = True,
2210
+ scaler:str='standard',# ["standard", "minmax", "robust","maxabs"]
2106
2211
  backward: bool = False, # backward_regression
2212
+ backward_thr:float = 0.05,# p-value threshold; only used when backward is True
2107
2213
  common_features: set = None,
2108
2214
  purpose: str = "classification", # 'classification' or 'regression'
2109
2215
  cls: Optional[Dict[str, Any]] = None,
2110
2216
  metrics: Optional[List[str]] = None,
2111
- random_state: int = 1,
2217
+ stack:bool=True,# run stacking
2218
+ stacking_cv:bool=False,# cross-validate the stacking final estimator; default False keeps it simple
2219
+ vote:bool=True,# run voting
2220
+ voting:str="hard", # only for classification purporse of voting
2221
+ n_top_models:int=5, #for stacking models
2222
+ n_models_per_category:int=1, #for stacking models; allows up to 2 models from the same category
2112
2223
  smote: bool = False,
2113
2224
  n_jobs: int = -1,
2114
2225
  plot_: bool = True,
@@ -2117,6 +2228,7 @@ def predict(
2117
2228
  cv_folds: int = 5, # more cv_folds gives more stable results, but AUC may be lower
2118
2229
  cv_level: str = "l", # "s":'low',"m":'medium',"l":"high"
2119
2230
  class_weight: str = "balanced",
2231
+ random_state: int = 1,
2120
2232
  verbose: bool = False,
2121
2233
  ) -> pd.DataFrame:
2122
2234
  """
@@ -2184,10 +2296,17 @@ def predict(
2184
2296
  RidgeClassifierCV,
2185
2297
  Perceptron,
2186
2298
  SGDClassifier,
2299
+ RidgeCV,
2300
+ Ridge,
2301
+ TheilSenRegressor,
2302
+ HuberRegressor,
2303
+ PoissonRegressor,
2304
+
2187
2305
  )
2306
+ from sklearn.compose import TransformedTargetRegressor
2188
2307
  from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
2189
2308
  from sklearn.naive_bayes import GaussianNB, BernoulliNB
2190
- from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
2309
+ from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor,StackingClassifier,StackingRegressor
2191
2310
  import xgboost as xgb
2192
2311
  import lightgbm as lgb
2193
2312
  import catboost as cb
@@ -2198,6 +2317,7 @@ def predict(
2198
2317
  QuadraticDiscriminantAnalysis,
2199
2318
  )
2200
2319
  from sklearn.preprocessing import PolynomialFeatures
2320
+ from sklearn.model_selection import train_test_split
2201
2321
 
2202
2322
  # spelling check
2203
2323
  purpose = ips.strcmp(purpose, ["classification", "regression"])[0]
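This hunk widens the sklearn.linear_model imports and adds StackingClassifier/StackingRegressor (plus TransformedTargetRegressor and train_test_split), which feed the new stack option further down. As a rough, hedged sketch of what stacking does with a few base estimators (plain scikit-learn, illustrative names only):

```python
# Hedged sketch of the stacking idea: base estimators feed a final meta-estimator.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

X, y = make_classification(n_samples=400, random_state=0)
x_tr, x_te, y_tr, y_te = train_test_split(X, y, random_state=0)

base_estimators = [
    ("rf", RandomForestClassifier(random_state=0)),
    ("svm", SVC(probability=True, random_state=0)),
]
stack = StackingClassifier(
    estimators=base_estimators,
    final_estimator=LogisticRegression(max_iter=1000),
    cv=5,
)
stack.fit(x_tr, y_tr)
print("stacked accuracy:", stack.score(x_te, y_te))
```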
@@ -2206,7 +2326,7 @@ def predict(
2206
2326
  if purpose == "classification":
2207
2327
  model_ = {
2208
2328
  "Random Forest": RandomForestClassifier(
2209
- random_state=random_state, class_weight=class_weight
2329
+ random_state=random_state, class_weight=class_weight,n_jobs=n_jobs
2210
2330
  ),
2211
2331
  # SVC (Support Vector Classification)
2212
2332
  "SVM": SVC(
@@ -2217,7 +2337,7 @@ def predict(
2217
2337
  ),
2218
2338
  # fit the best model without enforcing sparsity, which means it does not directly perform feature selection.
2219
2339
  "Logistic Regression": LogisticRegression(
2220
- class_weight=class_weight, random_state=random_state
2340
+ class_weight=class_weight, random_state=random_state,n_jobs=n_jobs
2221
2341
  ),
2222
2342
  # Logistic Regression with L1 Regularization (Lasso)
2223
2343
  "Lasso Logistic Regression": LogisticRegression(
@@ -2228,53 +2348,54 @@ def predict(
2228
2348
  eval_metric="logloss",
2229
2349
  random_state=random_state,
2230
2350
  ),
2231
- "KNN": KNeighborsClassifier(n_neighbors=5),
2351
+ "KNN": KNeighborsClassifier(n_neighbors=5,n_jobs=n_jobs),
2232
2352
  "Naive Bayes": GaussianNB(),
2233
2353
  "Linear Discriminant Analysis": LinearDiscriminantAnalysis(),
2234
2354
  "AdaBoost": AdaBoostClassifier(
2235
2355
  algorithm="SAMME", random_state=random_state
2236
2356
  ),
2237
- # "LightGBM": lgb.LGBMClassifier(random_state=random_state, class_weight=class_weight),
2357
+ "LightGBM": lgb.LGBMClassifier(random_state=random_state, class_weight=class_weight,n_jobs=n_jobs),
2238
2358
  "CatBoost": cb.CatBoostClassifier(verbose=0, random_state=random_state),
2239
2359
  "Extra Trees": ExtraTreesClassifier(
2240
- random_state=random_state, class_weight=class_weight
2360
+ random_state=random_state, class_weight=class_weight,n_jobs=n_jobs
2241
2361
  ),
2242
- "Bagging": BaggingClassifier(random_state=random_state),
2362
+ "Bagging": BaggingClassifier(random_state=random_state,n_jobs=n_jobs),
2243
2363
  "Neural Network": MLPClassifier(max_iter=500, random_state=random_state),
2244
2364
  "DecisionTree": DecisionTreeClassifier(),
2245
2365
  "Quadratic Discriminant Analysis": QuadraticDiscriminantAnalysis(),
2246
2366
  "Ridge": RidgeClassifierCV(
2247
2367
  class_weight=class_weight, store_cv_results=True
2248
2368
  ),
2249
- "Perceptron": Perceptron(random_state=random_state),
2369
+ "Perceptron": Perceptron(random_state=random_state,n_jobs=n_jobs),
2250
2370
  "Bernoulli Naive Bayes": BernoulliNB(),
2251
- "SGDClassifier": SGDClassifier(random_state=random_state),
2371
+ "SGDClassifier": SGDClassifier(random_state=random_state,n_jobs=n_jobs),
2252
2372
  }
2253
2373
  elif purpose == "regression":
2254
2374
  model_ = {
2255
- "Random Forest": RandomForestRegressor(random_state=random_state),
2375
+ "Random Forest": RandomForestRegressor(random_state=random_state,n_jobs=n_jobs),
2256
2376
  "SVM": SVR(), # SVR (Support Vector Regression)
2257
2377
  # "Lasso": Lasso(random_state=random_state), # 它和LassoCV相同(必须要提供alpha参数),
2258
2378
  "LassoCV": LassoCV(
2259
- cv=cv_folds, random_state=random_state
2379
+ cv=cv_folds, random_state=random_state,n_jobs=n_jobs
2260
2380
  ), # LassoCV finds the best alpha automatically and is preferable to Lasso
2261
2381
  "Gradient Boosting": GradientBoostingRegressor(random_state=random_state),
2262
- "XGBoost": xgb.XGBRegressor(eval_metric="rmse", random_state=random_state),
2263
- "Linear Regression": LinearRegression(),
2264
- "Lasso": Lasso(random_state=random_state),
2382
+ "XGBoost": xgb.XGBRegressor(eval_metric="rmse", random_state=random_state,n_jobs=n_jobs),
2383
+ "Linear Regression": LinearRegression(n_jobs=n_jobs),
2265
2384
  "AdaBoost": AdaBoostRegressor(random_state=random_state),
2266
- # "LightGBM": lgb.LGBMRegressor(random_state=random_state),
2385
+ "LightGBM": lgb.LGBMRegressor(random_state=random_state,n_jobs=n_jobs,
2386
+ force_row_wise=True # Or use force_col_wise=True if memory is a concern
2387
+ ),
2267
2388
  "CatBoost": cb.CatBoostRegressor(verbose=0, random_state=random_state),
2268
- "Extra Trees": ExtraTreesRegressor(random_state=random_state),
2269
- "Bagging": BaggingRegressor(random_state=random_state),
2389
+ "Extra Trees": ExtraTreesRegressor(random_state=random_state,n_jobs=n_jobs),
2390
+ "Bagging": BaggingRegressor(random_state=random_state,n_jobs=n_jobs),
2270
2391
  "Neural Network": MLPRegressor(max_iter=500, random_state=random_state),
2271
2392
  "ElasticNet": ElasticNet(random_state=random_state),
2272
2393
  "Ridge": Ridge(),
2273
- "KNN": KNeighborsRegressor(),
2394
+ "KNN": KNeighborsRegressor(n_jobs=n_jobs),
2395
+ "TheilSen":TheilSenRegressor(n_jobs=n_jobs),
2396
+ "Huber":HuberRegressor(),
2397
+ "Poisson":PoissonRegressor()
2274
2398
  }
2275
- # indicate cls:
2276
- if ips.run_once_within(30): # 10 min
2277
- print(f"supported models: {list(model_.keys())}")
2278
2399
  if cls is None:
2279
2400
  models = model_
2280
2401
  else:
@@ -2290,6 +2411,10 @@ def predict(
2290
2411
  ips.df_special_characters_cleaner(x_true) if x_true is not None else None
2291
2412
  )
2292
2413
 
2414
+ # indicate cls:
2415
+ if ips.run_once_within(30): # 10 min
2416
+ print(f"processing: {list(models.keys())}")
2417
+ print(isinstance(y_train, str) and y_train in x_train.columns)
2293
2418
  if isinstance(y_train, str) and y_train in x_train.columns:
2294
2419
  y_train_col_name = y_train
2295
2420
  y_train = x_train[y_train]
@@ -2297,6 +2422,7 @@ def predict(
2297
2422
  x_train = x_train.drop(y_train_col_name, axis=1)
2298
2423
  # else:
2299
2424
  # y_train = ips.df_encoder(pd.DataFrame(y_train), method="dummy").values.ravel()
2425
+
2300
2426
  y_train = pd.DataFrame(y_train)
2301
2427
  if y_train.select_dtypes(include=np.number).empty:
2302
2428
  y_train_ = ips.df_encoder(y_train, method="dummy", drop=None)
@@ -2309,9 +2435,12 @@ def predict(
2309
2435
  y_train = ips.df_encoder(pd.DataFrame(y_train), method="label")
2310
2436
  print("is_binary:", is_binary)
2311
2437
 
2438
+ if fill_missing:
2439
+ ips.df_fillna(data=x_train, method="knn", inplace=True, axis=0)
2440
+ ips.df_fillna(data=y_train, method="knn", inplace=True, axis=0)
2312
2441
  # Perform backward feature selection
2313
2442
  if backward:
2314
- selected_features = backward_regression(x_train, y_train, threshold_out=0.05)
2443
+ selected_features = backward_regression(x_train, y_train, thr=backward_thr)
2315
2444
  x_train = x_train[selected_features]
2316
2445
 
2317
2446
  if x_true is None:
@@ -2337,6 +2466,8 @@ def predict(
2337
2466
  pd.DataFrame(y_train), method="label"
2338
2467
  ).values.ravel()
2339
2468
 
2469
+ if fill_missing:
2470
+ ips.df_fillna(data=x_true, method="knn", inplace=True, axis=0)
2340
2471
  if y_true is not None:
2341
2472
  if isinstance(y_true, str) and y_true in x_true.columns:
2342
2473
  y_true_col_name = y_true
@@ -2369,11 +2500,16 @@ def predict(
2369
2500
  # Ensure common features are selected
2370
2501
  if common_features is not None:
2371
2502
  x_train, x_true = x_train[common_features], x_true[common_features]
2503
+ share_col_names=common_features
2372
2504
  else:
2373
2505
  share_col_names = ips.shared(x_train.columns, x_true.columns, verbose=verbose)
2374
2506
  x_train, x_true = x_train[share_col_names], x_true[share_col_names]
2375
2507
 
2376
- x_train, x_true = ips.df_scaler(x_train), ips.df_scaler(x_true)
2508
+ #! scaler
2509
+ # fit the scaler on x_train and export it so the same scaler can transform x_true
2510
+ x_train,scaler_=ips.df_scaler(x_train,method=scaler,return_scaler=True)
2511
+ #
2512
+ x_true=ips.df_scaler(x_true,scaler=scaler_)# make sure the same scaler is used
2377
2513
  x_train, x_true = ips.df_encoder(x_train, method="dummy"), ips.df_encoder(
2378
2514
  x_true, method="dummy"
2379
2515
  )
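The scaling change above fits a scaler on x_train (ips.df_scaler with return_scaler=True) and then applies that same fitted scaler to x_true, instead of scaling each frame independently. The equivalent fit-on-train / transform-on-test pattern in plain scikit-learn (illustrative only; the df_scaler signature itself is py2ls-specific):

```python
# Same idea with scikit-learn directly: fit on training data, reuse on new data.
import pandas as pd
from sklearn.preprocessing import StandardScaler

x_train = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [10.0, 20.0, 30.0]})
x_true = pd.DataFrame({"a": [1.5, 2.5], "b": [15.0, 25.0]})

scaler_ = StandardScaler().fit(x_train)  # fit on the training data only
x_train_scaled = pd.DataFrame(scaler_.transform(x_train), columns=x_train.columns)
x_true_scaled = pd.DataFrame(scaler_.transform(x_true), columns=x_true.columns)  # reuse the same scaler
```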
@@ -2395,7 +2531,19 @@ def predict(
2395
2531
  if isinstance(y_train, np.ndarray):
2396
2532
  y_true = ips.df_encoder(data=pd.DataFrame(y_true), method="label")
2397
2533
  y_true = np.asarray(y_true)
2534
+
2398
2535
  # Hyperparameter grids for tuning
2536
+ param_grid_common_xgb = {
2537
+ 'learning_rate': [0.01, 0.1, 0.2, 0.3],
2538
+ 'max_depth': [3, 5, 7, 10],
2539
+ 'n_estimators': [50, 100, 200, 300],
2540
+ 'subsample': [0.6, 0.8, 1.0],
2541
+ 'colsample_bytree': [0.6, 0.8, 1.0],
2542
+ 'gamma': [0, 0.1, 0.2, 0.5],
2543
+ 'min_child_weight': [1, 5, 10],
2544
+ 'reg_alpha': [0, 0.1, 0.5, 1], # L1 regularization term
2545
+ 'reg_lambda': [1, 1.5, 2], # L2 regularization term
2546
+ }
2399
2547
  if cv_level in ["low", "simple", "s", "l"]:
2400
2548
  param_grids = {
2401
2549
  "Random Forest": (
@@ -2440,12 +2588,17 @@ def predict(
2440
2588
  "min_samples_split": [2],
2441
2589
  "subsample": [0.8],
2442
2590
  },
2443
- "XGBoost": {
2444
- "n_estimators": [100],
2445
- "max_depth": [3],
2446
- "learning_rate": [0.1],
2447
- "subsample": [0.8],
2448
- "colsample_bytree": [0.8],
2591
+ "XGBoost":{
2592
+ 'learning_rate': [0.01],
2593
+ 'max_depth': [3],
2594
+ 'n_estimators': [50],
2595
+ 'subsample': [0.6],
2596
+ 'colsample_bytree': [0.6],
2597
+ 'gamma': [0, 0.1],
2598
+ 'min_child_weight': [1],
2599
+ 'reg_alpha': [0, 0.1],
2600
+ 'reg_lambda': [1],
2601
+ 'objective': ['binary:logistic'] if purpose == "classification" else ['reg:squarederror']
2449
2602
  },
2450
2603
  "KNN": (
2451
2604
  {
@@ -2552,6 +2705,14 @@ def predict(
2552
2705
  "random_state": [random_state],
2553
2706
  "learning_rate": ["constant"],
2554
2707
  },
2708
+ "TheilSen":{'max_iter': [100],
2709
+ 'tol': [1e-4],
2710
+ 'n_subsamples': [100+x_train.shape[1]]},
2711
+ "Huber":{'epsilon': [1.35],
2712
+ 'alpha': [0.1],
2713
+ 'max_iter': [100],},
2714
+ "Poisson":{'alpha': [0.1],
2715
+ 'max_iter': [100],}
2555
2716
  }
2556
2717
  elif cv_level in ["high", "advanced", "h"]:
2557
2718
  param_grids = {
@@ -2613,12 +2774,30 @@ def predict(
2613
2774
  "subsample": [0.8, 1.0],
2614
2775
  },
2615
2776
  "XGBoost": {
2616
- "n_estimators": [100, 200, 500, 700],
2617
- "max_depth": [3, 5, 7, 10],
2618
- "learning_rate": [0.01, 0.1, 0.2, 0.3],
2619
- "subsample": [0.8, 1.0],
2620
- "colsample_bytree": [0.8, 0.9, 1.0],
2621
- },
2777
+ 'learning_rate': [0.01, 0.1, 0.2, 0.3],
2778
+ 'max_depth': [3, 5, 7, 10],
2779
+ 'n_estimators': [50, 100, 200, 300],
2780
+ 'subsample': [0.6, 0.8, 1.0],
2781
+ 'gamma': [0, 0.1, 0.2, 0.5],
2782
+ 'min_child_weight': [1, 5, 10],
2783
+ 'reg_alpha': [0, 0.1, 0.5, 1],
2784
+ 'reg_lambda': [1, 1.5, 2],
2785
+ **{
2786
+ 'objective': ['binary:logistic', 'multi:softmax', 'multi:softprob'],
2787
+ }} if purpose== "classification"
2788
+ else{
2789
+ 'learning_rate': [0.01, 0.1, 0.2, 0.3],
2790
+ 'max_depth': [3, 5, 7, 10],
2791
+ 'n_estimators': [50, 100, 200, 300],
2792
+ 'subsample': [0.6, 0.8, 1.0],
2793
+ 'colsample_bytree': [0.6, 0.8, 1.0],
2794
+ 'gamma': [0, 0.1, 0.2, 0.5],
2795
+ 'min_child_weight': [1, 5, 10],
2796
+ 'reg_alpha': [0, 0.1, 0.5, 1],
2797
+ 'reg_lambda': [1, 1.5, 2],
2798
+ **{
2799
+ 'objective': ['reg:squarederror', 'reg:squaredlogerror', 'reg:gamma'],
2800
+ }},
2622
2801
  "KNN": (
2623
2802
  {
2624
2803
  "n_neighbors": [1, 3, 5, 10, 15, 20],
@@ -2731,6 +2910,14 @@ def predict(
2731
2910
  ], # If True, the regressors X will be normalized
2732
2911
  }
2733
2912
  ),
2913
+ "TheilSen":{'max_iter': [100, 200, 300],
2914
+ 'tol': [1e-4, 1e-3, 1e-2],
2915
+ 'n_subsamples': [100+x_train.shape[1], 200+x_train.shape[1], 300+x_train.shape[1]]},
2916
+ "Huber":{'epsilon': [1.35, 1.5, 2.0],
2917
+ 'alpha': [0.1, 1.0, 10.0],
2918
+ 'max_iter': [100, 200, 300],},
2919
+ "Poisson":{'alpha': [0.1, 1.0, 10.0],
2920
+ 'max_iter': [100, 200, 300],}
2734
2921
  }
2735
2922
  else: # median level
2736
2923
  param_grids = {
@@ -2790,12 +2977,30 @@ def predict(
2790
2977
  "subsample": [0.8, 1.0],
2791
2978
  },
2792
2979
  "XGBoost": {
2793
- "n_estimators": [100, 200, 500],
2794
- "max_depth": [3, 5, 7],
2795
- "learning_rate": [0.01, 0.1, 0.2],
2796
- "subsample": [0.8, 1.0],
2797
- "colsample_bytree": [0.8, 1.0],
2798
- },
2980
+ 'learning_rate': [0.01, 0.1],
2981
+ 'max_depth': [3, 5],
2982
+ 'n_estimators': [50, 100],
2983
+ 'subsample': [0.6, 0.8],
2984
+ 'gamma': [0, 0.1],
2985
+ 'min_child_weight': [1, 5],
2986
+ 'reg_alpha': [0, 0.1],
2987
+ 'reg_lambda': [1,],
2988
+ **{
2989
+ 'objective': ['binary:logistic', 'multi:softmax'],
2990
+ }} if purpose== "classification"
2991
+ else{
2992
+ 'learning_rate': [0.01, 0.1],
2993
+ 'max_depth': [3, 5,],
2994
+ 'n_estimators': [50, 100],
2995
+ 'subsample': [0.6, 0.8],
2996
+ 'colsample_bytree': [0.6, 0.8],
2997
+ 'gamma': [0, 0.1],
2998
+ 'min_child_weight': [1, 5],
2999
+ 'reg_alpha': [0, 0.1],
3000
+ 'reg_lambda': [1, 1.5],
3001
+ **{
3002
+ 'objective': ['reg:squarederror', 'reg:squaredlogerror'],
3003
+ }},
2799
3004
  "KNN": (
2800
3005
  {
2801
3006
  "n_neighbors": [3, 5, 7, 10],
@@ -2952,6 +3157,14 @@ def predict(
2952
3157
  ], # Solver for optimization
2953
3158
  }
2954
3159
  ),
3160
+ "TheilSen":{'max_iter': [100, 200],
3161
+ 'tol': [1e-4, 1e-3],
3162
+ 'n_subsamples': [100+x_train.shape[1], 200+x_train.shape[1]]},
3163
+ "Huber":{'epsilon': [1.35, 1.5],
3164
+ 'alpha': [0.1, 1.0],
3165
+ 'max_iter': [100, 200],},
3166
+ "Poisson":{'alpha': [0.1, 1.0],
3167
+ 'max_iter': [100, 200],}
2955
3168
  }
2956
3169
 
2957
3170
  results = {}
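The level-specific param_grids dictionaries built above (low/medium/high cv_level) are consumed by GridSearchCV later in predict, one grid per model name. A minimal, hedged sketch of that pattern with a light grid (the values here are illustrative, not the package's exact defaults):

```python
# Sketch: a per-model parameter grid fed to GridSearchCV, as predict() does for each estimator.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

X, y = make_classification(n_samples=300, random_state=0)
param_grid = {"n_estimators": [100], "max_depth": [None, 10], "min_samples_split": [2, 5]}

gs = GridSearchCV(
    RandomForestClassifier(random_state=0),
    param_grid=param_grid,
    scoring="roc_auc",          # predict() uses roc_auc for binary classification
    cv=StratifiedKFold(n_splits=5),
    n_jobs=-1,
)
gs.fit(X, y)
print(gs.best_params_, round(gs.best_score_, 3))
```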
@@ -2971,83 +3184,124 @@ def predict(
2971
3184
  ):
2972
3185
  if verbose:
2973
3186
  print(f"\nTraining and validating {name}:")
2974
-
2975
- # Grid search with KFold or StratifiedKFold
2976
- if is_binary:
2977
- gs = GridSearchCV(
2978
- clf,
2979
- param_grid=param_grids.get(name, {}),
2980
- scoring=(
2981
- "roc_auc"
2982
- if purpose == "classification"
2983
- else "neg_mean_squared_error"
2984
- ),
2985
- cv=cv,
2986
- n_jobs=n_jobs,
2987
- verbose=verbose,
2988
- )
2989
-
2990
- gs.fit(x_train, y_train)
2991
- best_clf = gs.best_estimator_
2992
- # make sure x_train and x_test has the same name
2993
- x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
2994
- y_pred = best_clf.predict(x_true)
2995
- if hasattr(best_clf, "predict_proba"):
2996
- y_pred_proba = best_clf.predict_proba(x_true)
2997
- print("Shape of predicted probabilities:", y_pred_proba.shape)
2998
- if y_pred_proba.shape[1] == 1:
2999
- y_pred_proba = np.hstack(
3000
- [1 - y_pred_proba, y_pred_proba]
3001
- ) # Add missing class probabilities
3002
- y_pred_proba = y_pred_proba[:, 1]
3003
- elif hasattr(best_clf, "decision_function"):
3004
- # If predict_proba is not available, use decision_function (e.g., for SVM)
3005
- y_pred_proba = best_clf.decision_function(x_true)
3006
- # Ensure y_pred_proba is within 0 and 1 bounds
3007
- y_pred_proba = (y_pred_proba - y_pred_proba.min()) / (
3008
- y_pred_proba.max() - y_pred_proba.min()
3187
+ try:
3188
+ # Grid search with KFold or StratifiedKFold
3189
+ if is_binary:
3190
+ gs = GridSearchCV(
3191
+ clf,
3192
+ param_grid=param_grids.get(name, {}),
3193
+ scoring=(
3194
+ "roc_auc"
3195
+ if purpose == "classification"
3196
+ else "neg_mean_squared_error"
3197
+ ),
3198
+ cv=cv,
3199
+ n_jobs=n_jobs,
3200
+ verbose=verbose,
3009
3201
  )
3010
- else:
3011
- y_pred_proba = None # No probability output for certain models
3012
- else:
3013
- gs = GridSearchCV(
3014
- clf,
3015
- param_grid=param_grids.get(name, {}),
3016
- scoring=(
3017
- "roc_auc_ovr"
3018
- if purpose == "classification"
3019
- else "neg_mean_squared_error"
3020
- ),
3021
- cv=cv,
3022
- n_jobs=n_jobs,
3023
- verbose=verbose,
3024
- )
3025
-
3026
- # Fit GridSearchCV
3027
- gs.fit(x_train, y_train)
3028
- best_clf = gs.best_estimator_
3029
-
3030
- # Ensure x_true aligns with x_train columns
3031
- x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
3032
- y_pred = best_clf.predict(x_true)
3033
-
3034
- # Handle prediction probabilities for multiclass
3035
- if hasattr(best_clf, "predict_proba"):
3036
- y_pred_proba = best_clf.predict_proba(x_true)
3037
- elif hasattr(best_clf, "decision_function"):
3038
- y_pred_proba = best_clf.decision_function(x_true)
3039
3202
 
3040
- # Normalize for multiclass if necessary
3041
- if y_pred_proba.ndim == 2:
3042
- y_pred_proba = (
3043
- y_pred_proba - y_pred_proba.min(axis=1, keepdims=True)
3044
- ) / (
3045
- y_pred_proba.max(axis=1, keepdims=True)
3046
- - y_pred_proba.min(axis=1, keepdims=True)
3203
+ gs.fit(x_train, y_train)
3204
+ best_clf = gs.best_estimator_
3205
+ # make sure x_true has the same columns as x_train
3206
+ x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
3207
+ y_pred = best_clf.predict(x_true)
3208
+ if hasattr(best_clf, "predict_proba"):
3209
+ y_pred_proba = best_clf.predict_proba(x_true)
3210
+ print("Shape of predicted probabilities:", y_pred_proba.shape)
3211
+ if y_pred_proba.shape[1] == 1:
3212
+ y_pred_proba = np.hstack(
3213
+ [1 - y_pred_proba, y_pred_proba]
3214
+ ) # Add missing class probabilities
3215
+ y_pred_proba = y_pred_proba[:, 1]
3216
+ elif hasattr(best_clf, "decision_function"):
3217
+ # If predict_proba is not available, use decision_function (e.g., for SVM)
3218
+ y_pred_proba = best_clf.decision_function(x_true)
3219
+ # Ensure y_pred_proba is within 0 and 1 bounds
3220
+ y_pred_proba = (y_pred_proba - y_pred_proba.min()) / (
3221
+ y_pred_proba.max() - y_pred_proba.min()
3047
3222
  )
3223
+ else:
3224
+ y_pred_proba = None # No probability output for certain models
3225
+ # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
3226
+ if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
3227
+ if hasattr(best_clf, "alphas_"):
3228
+ alphas_ = best_clf.alphas_
3229
+ elif hasattr(best_clf, "alpha_"):
3230
+ alphas_ = best_clf.alpha_
3231
+ elif hasattr(best_clf, "Cs_"):
3232
+ alphas_ = best_clf.Cs_
3233
+ else:
3234
+ alphas_= None
3235
+ coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
3048
3236
  else:
3049
- y_pred_proba = None # No probability output for certain models
3237
+ gs = GridSearchCV(
3238
+ clf,
3239
+ param_grid=param_grids.get(name, {}),
3240
+ scoring=(
3241
+ "roc_auc_ovr"
3242
+ if purpose == "classification"
3243
+ else "neg_mean_squared_error"
3244
+ ),
3245
+ cv=cv,
3246
+ n_jobs=n_jobs,
3247
+ verbose=verbose,
3248
+ )
3050
3249
 
3250
+ # Fit GridSearchCV
3251
+ gs.fit(x_train, y_train)
3252
+ best_clf = gs.best_estimator_
3253
+
3254
+ # Ensure x_true aligns with x_train columns
3255
+ x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
3256
+
3257
+ # refit the best estimator on the training data (largely redundant: GridSearchCV already refits by default)
3258
+ best_clf=best_clf.fit(x_train, y_train)
3259
+ y_pred = best_clf.predict(x_true)
3260
+
3261
+ # Handle prediction probabilities for multiclass
3262
+ if hasattr(best_clf, "predict_proba"):
3263
+ y_pred_proba = best_clf.predict_proba(x_true)
3264
+ elif hasattr(best_clf, "decision_function"):
3265
+ y_pred_proba = best_clf.decision_function(x_true)
3266
+
3267
+ # Normalize for multiclass if necessary
3268
+ if y_pred_proba.ndim == 2:
3269
+ y_pred_proba = (
3270
+ y_pred_proba - y_pred_proba.min(axis=1, keepdims=True)
3271
+ ) / (
3272
+ y_pred_proba.max(axis=1, keepdims=True)
3273
+ - y_pred_proba.min(axis=1, keepdims=True)
3274
+ )
3275
+ else:
3276
+ y_pred_proba = None # No probability output for certain models
3277
+ # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
3278
+ if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
3279
+ if hasattr(best_clf, "alphas_"):
3280
+ alphas_ = best_clf.alphas_
3281
+ elif hasattr(best_clf, "alpha_"):
3282
+ alphas_ = best_clf.alpha_
3283
+ elif hasattr(best_clf, "Cs_"):
3284
+ alphas_ = best_clf.Cs_
3285
+ else:
3286
+ alphas_= None
3287
+ coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
3288
+ except Exception as e:
3289
+ alphas_,coef_ = None,None
3290
+ print(f"skiped {clf}: {e}")
3291
+ continue
3292
+ # try to make the prediction format consistent
3293
+ try:
3294
+ y_pred= [i[0] for i in y_pred]
3295
+ except:
3296
+ pass
3297
+ try:
3298
+ y_true= [i[0] for i in y_true]
3299
+ except:
3300
+ pass
3301
+ try:
3302
+ y_train= [i[0] for i in y_train]
3303
+ except:
3304
+ pass
3051
3305
  validation_scores = {}
3052
3306
 
3053
3307
  if y_true is not None and y_pred_proba is not None:
@@ -3097,20 +3351,26 @@ def predict(
3097
3351
  "roc_curve": roc_info,
3098
3352
  "pr_curve": pr_info,
3099
3353
  "confusion_matrix": confusion_matrix(y_true, y_pred),
3100
- "predictions": y_pred.tolist(),
3354
+ "predictions": y_pred,#.tolist(),
3101
3355
  "predictions_proba": (
3102
3356
  y_pred_proba.tolist() if y_pred_proba is not None else None
3103
3357
  ),
3358
+ "features":share_col_names,
3359
+ "coef":coef_,
3360
+ "alphas":alphas_
3104
3361
  }
3105
3362
  else: # "regression"
3106
3363
  results[name] = {
3107
3364
  "best_clf": gs.best_estimator_,
3108
3365
  "best_params": gs.best_params_,
3109
3366
  "scores": validation_scores, # e.g., neg_MSE, R², etc.
3110
- "predictions": y_pred.tolist(),
3367
+ "predictions": y_pred,#.tolist(),
3111
3368
  "predictions_proba": (
3112
3369
  y_pred_proba.tolist() if y_pred_proba is not None else None
3113
3370
  ),
3371
+ "features":share_col_names,
3372
+ "coef":coef_,
3373
+ "alphas":alphas_
3114
3374
  }
3115
3375
  else: # multi-classes
3116
3376
  if y_pred_proba is not None:
@@ -3149,20 +3409,26 @@ def predict(
3149
3409
  "roc_curve": roc_info,
3150
3410
  "pr_curve": pr_info,
3151
3411
  "confusion_matrix": confusion_matrix(y_true, y_pred),
3152
- "predictions": y_pred.tolist(),
3412
+ "predictions": y_pred,#.tolist(),
3153
3413
  "predictions_proba": (
3154
3414
  y_pred_proba.tolist() if y_pred_proba is not None else None
3155
3415
  ),
3416
+ "features":share_col_names,
3417
+ "coef":coef_,
3418
+ "alphas":alphas_
3156
3419
  }
3157
3420
  else: # "regression"
3158
3421
  results[name] = {
3159
3422
  "best_clf": gs.best_estimator_,
3160
3423
  "best_params": gs.best_params_,
3161
3424
  "scores": validation_scores, # e.g., neg_MSE, R², etc.
3162
- "predictions": y_pred.tolist(),
3425
+ "predictions": y_pred,#.tolist(),
3163
3426
  "predictions_proba": (
3164
3427
  y_pred_proba.tolist() if y_pred_proba is not None else None
3165
3428
  ),
3429
+ "features":share_col_names,
3430
+ "coef":coef_,
3431
+ "alphas":alphas_
3166
3432
  }
3167
3433
 
3168
3434
  else:
@@ -3181,23 +3447,32 @@ def predict(
3181
3447
  "best_clf": gs.best_estimator_,
3182
3448
  "best_params": gs.best_params_,
3183
3449
  "scores": validation_scores,
3184
- "predictions": y_pred.tolist(),
3450
+ "predictions": y_pred,#.tolist(),
3185
3451
  "predictions_proba": (
3186
3452
  y_pred_proba.tolist() if y_pred_proba is not None else None
3187
3453
  ),
3454
+ "features":share_col_names,
3188
3455
  "y_train": y_train if y_train is not None else [],
3189
3456
  "y_true": y_true if y_true is not None else [],
3457
+ "coef":coef_,
3458
+ "alphas":alphas_
3190
3459
  }
3191
3460
 
3192
3461
  # Convert results to DataFrame
3193
3462
  df_results = pd.DataFrame.from_dict(results, orient="index")
3194
3463
  # sort
3195
- if y_true is not None and purpose == "classification":
3196
- df_scores = pd.DataFrame(
3197
- df_results["scores"].tolist(), index=df_results["scores"].index
3198
- ).sort_values(by="roc_auc", ascending=False)
3464
+ if y_true is not None:
3465
+ if purpose == "classification":
3466
+ df_scores = pd.DataFrame(
3467
+ df_results["scores"].tolist(), index=df_results["scores"].index
3468
+ ).sort_values(by="roc_auc", ascending=False)
3469
+ elif purpose=='regression':
3470
+ df_scores = rank_models_reg(
3471
+ pd.DataFrame(df_results["scores"].tolist(), index=df_results["scores"].index),
3472
+ ascending=False)
3199
3473
  df_results = df_results.loc[df_scores.index]
3200
3474
 
3475
+ if y_true is not None and purpose == "classification":
3201
3476
  if plot_:
3202
3477
  from datetime import datetime
3203
3478
 
@@ -3215,18 +3490,565 @@ def predict(
3215
3490
  plot.figsets(xangle=30)
3216
3491
  if dir_save:
3217
3492
  ips.figsave(dir_save + f"scores_clus{now_}.pdf")
3493
+ # if all([plot_, y_true is not None, purpose == "classification"]):
3494
+ # # try:
3495
+ # if len(models) > 3:
3496
+ # plot_validate_features(df_results, is_binary=is_binary)
3497
+ # else:
3498
+ # plot_validate_features_single(df_results, is_binary=is_binary)
3499
+ # if dir_save:
3500
+ # ips.figsave(dir_save + f"validate_features{now_}.pdf")
3501
+ # # except Exception as e:
3502
+ # # print(f"Error: 在画图的过程中出现了问题:{e}")
3503
+ if stack:
3504
+ #! stacking classifier/regressor
3505
+ from sklearn.metrics import make_scorer, accuracy_score
3506
+ from sklearn.model_selection import cross_val_score
3507
+
3508
+ #* cap n_top_models so it does not exceed the number of available models
3509
+ n_top_models = min(n_top_models, df_results.shape[0])
3510
+
3511
+ #* select the top-n ranked estimators
3512
+ models_selecte = select_top_models(models=list(df_results.index),
3513
+ categories=models_support[purpose],
3514
+ n_top_models=n_top_models,
3515
+ n_models_per_category=n_models_per_category)
3516
+ top_models = df_results.loc[models_selecte]["best_clf"]
3517
+ base_estimators = []
3518
+ for i, j in top_models.to_dict().items():
3519
+ base_estimators.append((i, j))
3520
+ if stacking_cv:
3521
+ print(f"⤵ stacking_cv is processing...")
3522
+ #* define a few representative final_estimator candidates
3523
+ # the candidate options
3524
+ if purpose == "classification":
3525
+ kadt_estimators=["XGBoost","SVM","Logistic Regression","Neural Network"]
3526
+ else:
3527
+ kadt_estimators=["XGBoost","LassoCV"]
3528
+ final_estimators={}
3529
+ for name in kadt_estimators:
3530
+ param_grid=param_grids.get(name, {})
3531
+ print(param_grid)
3532
+ if is_binary:
3533
+ gs = GridSearchCV(
3534
+ model_[name],
3535
+ param_grid=param_grid,
3536
+ scoring=(
3537
+ "roc_auc"
3538
+ if purpose == "classification"
3539
+ else "neg_mean_squared_error"
3540
+ ),
3541
+ cv=cv,
3542
+ n_jobs=n_jobs,
3543
+ verbose=verbose,
3544
+ )
3545
+ else:
3546
+ gs = GridSearchCV(
3547
+ model_[name],
3548
+ param_grid=param_grid,
3549
+ scoring=(
3550
+ "roc_auc_ovr"
3551
+ if purpose == "classification"
3552
+ else "neg_mean_squared_error"
3553
+ ),
3554
+ cv=cv,
3555
+ n_jobs=n_jobs,
3556
+ verbose=verbose,
3557
+ )
3558
+ # Fit GridSearchCV
3559
+ gs.fit(x_train, y_train)
3560
+ final_estimators[name]=gs.best_estimator_
3561
+
3562
+ #* Set up cross-validation and performance evaluation
3563
+ scorer = make_scorer(accuracy_score)
3564
+ cv_results = []
3565
+
3566
+ #*Cross-validate stacking models with different final estimators
3567
+ for final_name, final_estimator in final_estimators.items():
3568
+ print(f"Evaluating Stacking Classifier with {final_name} as final estimator...")
3569
+ if purpose == "classification":
3570
+ stacking_model = StackingClassifier(estimators=base_estimators, final_estimator=final_estimator,cv=cv)
3571
+ else:
3572
+ stacking_model = StackingRegressor(estimators=base_estimators, final_estimator=final_estimator, cv=cv)
3573
+
3574
+ scores = cross_val_score(stacking_model, x_train, y_train, cv=StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_state), scoring=scorer)
3575
+
3576
+ # Store the result
3577
+ cv_results.append({
3578
+ 'final_estimator':final_estimator,
3579
+ 'Final Estimator': final_name,
3580
+ 'Mean Accuracy': np.mean(scores),
3581
+ 'Standard Deviation': np.std(scores)
3582
+ })
3583
+
3584
+ #* Convert the results into a DataFrame for easy comparison
3585
+ cv_results_df = pd.DataFrame(cv_results)
3586
+
3587
+ #* Sort and display the best model
3588
+ cv_results_df = cv_results_df.sort_values(by='Mean Accuracy', ascending=False)
3589
+
3590
+
3591
+ # Optionally: Select the final estimator that gives the best performance
3592
+ best_final_estimator = cv_results_df.iloc[0]['final_estimator']
3593
+ print(f"Best final estimator based on cross-validation: {best_final_estimator}")
3594
+ else:
3595
+ print(f"⤵ trying to find the best_final_estimator for stacking...")
3596
+ if purpose=="classification":
3597
+ best_final_estimator = LogisticRegression(class_weight=class_weight,
3598
+ random_state=random_state,
3599
+ max_iter=1000)
3600
+ else:
3601
+ best_final_estimator = RidgeCV(cv=5)
3602
+ print(f"⤵ the best best_final_estimator: {best_final_estimator}")
3603
+ #! apply stacking
3604
+ if purpose == "classification":
3605
+ print(f"⤵ StackingClassifier...")
3606
+ stacking_model = StackingClassifier(estimators=base_estimators,
3607
+ final_estimator=best_final_estimator,
3608
+ cv=cv)
3609
+ else:
3610
+ print(f"⤵ StackingRegressor...")
3611
+ stacking_model = StackingRegressor(estimators=base_estimators,
3612
+ final_estimator=best_final_estimator,
3613
+ cv=cv)
3614
+
3615
+ # Train the Stacking Classifier
3616
+ print(f"⤵ fit & predict...")
3617
+ stacking_model.fit(x_train, y_train)
3618
+ y_pred_final = stacking_model.predict(x_true)
3619
+ print(f"⤵ collecting results...")
3620
+ # pred_proba
3621
+ if is_binary:
3622
+ if hasattr(stacking_model, "predict_proba"):
3623
+ y_pred_proba_final = stacking_model.predict_proba(x_true)
3624
+ if y_pred_proba_final.shape[1] == 1:
3625
+ y_pred_proba_final = np.hstack(
3626
+ [1 - y_pred_proba_final, y_pred_proba_final]
3627
+ ) # Add missing class probabilities
3628
+ y_pred_proba_final = y_pred_proba_final[:, 1]
3629
+ elif hasattr(stacking_model, "decision_function"):
3630
+ # If predict_proba is not available, use decision_function (e.g., for SVM)
3631
+ y_pred_proba_final = stacking_model.decision_function(x_true)
3632
+ # Ensure y_pred_proba_final is within 0 and 1 bounds
3633
+ y_pred_proba_final = (y_pred_proba_final - y_pred_proba_final.min()) / (
3634
+ y_pred_proba_final.max() - y_pred_proba_final.min()
3635
+ )
3636
+ else:
3637
+ y_pred_proba_final = None # No probability output for certain models
3638
+ # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
3639
+ if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
3640
+ if hasattr(best_clf, "alphas_"):
3641
+ alphas_ = best_clf.alphas_
3642
+ elif hasattr(best_clf, "alpha_"):
3643
+ alphas_ = best_clf.alpha_
3644
+ elif hasattr(best_clf, "Cs_"):
3645
+ alphas_ = best_clf.Cs_
3646
+ else:
3647
+ alphas_= None
3648
+ coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
3649
+ if not is_binary:
3650
+ # Handle prediction probabilities for multiclass
3651
+ if hasattr(stacking_model, "predict_proba"):
3652
+ y_pred_proba_final = stacking_model.predict_proba(x_true)
3653
+ elif hasattr(stacking_model, "decision_function"):
3654
+ y_pred_proba_final = stacking_model.decision_function(x_true)
3655
+
3656
+ # Normalize for multiclass if necessary
3657
+ if y_pred_proba_final.ndim == 2:
3658
+ y_pred_proba_final = (
3659
+ y_pred_proba_final - y_pred_proba_final.min(axis=1, keepdims=True)
3660
+ ) / (
3661
+ y_pred_proba_final.max(axis=1, keepdims=True)
3662
+ - y_pred_proba_final.min(axis=1, keepdims=True)
3663
+ )
3664
+ else:
3665
+ y_pred_proba_final = None # No probability output for certain models
3666
+ # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
3667
+ if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
3668
+ if hasattr(best_clf, "alphas_"):
3669
+ alphas_ = best_clf.alphas_
3670
+ elif hasattr(best_clf, "alpha_"):
3671
+ alphas_ = best_clf.alpha_
3672
+ elif hasattr(best_clf, "Cs_"):
3673
+ alphas_ = best_clf.Cs_
3674
+ else:
3675
+ alphas_= None
3676
+ coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
3677
+ #! dict_pred_stack
3678
+ dict_pred_stack={}
3679
+ validation_scores_final = {}
3680
+ if y_true is not None and y_pred_proba_final is not None:
3681
+ validation_scores_final = cal_metrics(
3682
+ y_true,
3683
+ y_pred_final,
3684
+ y_pred_proba=y_pred_proba_final,
3685
+ is_binary=is_binary,
3686
+ purpose=purpose,
3687
+ average="weighted",
3688
+ )
3689
+ if is_binary:
3690
+ # Calculate ROC curve
3691
+ # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
3692
+ if y_pred_proba_final is not None:
3693
+ fpr, tpr, _ = roc_curve(y_true, y_pred_proba_final)
3694
+ lower_ci, upper_ci = cal_auc_ci(
3695
+ y_true, y_pred_proba_final, verbose=False, is_binary=is_binary
3696
+ )
3697
+ roc_auc = auc(fpr, tpr)
3698
+ roc_info = {
3699
+ "fpr": fpr.tolist(),
3700
+ "tpr": tpr.tolist(),
3701
+ "auc": roc_auc,
3702
+ "ci95": (lower_ci, upper_ci),
3703
+ }
3704
+ # precision-recall curve
3705
+ precision_, recall_, _ = cal_precision_recall(y_true, y_pred_proba_final)
3706
+ avg_precision_ = average_precision_score(y_true, y_pred_proba_final)
3707
+ pr_info = {
3708
+ "precision": precision_,
3709
+ "recall": recall_,
3710
+ "avg_precision": avg_precision_,
3711
+ }
3712
+ else:
3713
+ roc_info, pr_info = None, None
3714
+ if purpose == "classification":
3715
+ dict_pred_stack = {
3716
+ "best_clf": stacking_model,
3717
+ "best_params": None,
3718
+ "auc_indiv": None,
3719
+ "scores": validation_scores_final,
3720
+ "roc_curve": roc_info,
3721
+ "pr_curve": pr_info,
3722
+ "confusion_matrix": confusion_matrix(y_true, y_pred_final),
3723
+ "predictions": y_pred_final.tolist(),
3724
+ "predictions_proba": (
3725
+ y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
3726
+ ),
3727
+ "features":share_col_names,
3728
+ "coef":coef_,
3729
+ "alphas":alphas_
3730
+ }
3731
+ else: # "regression"
3732
+ dict_pred_stack = {
3733
+ "best_clf": stacking_model,
3734
+ "best_params": None,
3735
+ "scores": validation_scores_final, # e.g., neg_MSE, R², etc.
3736
+ "predictions": y_pred_final.tolist(),
3737
+ "predictions_proba": (
3738
+ y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
3739
+ ),
3740
+ "features":share_col_names,
3741
+ "coef":coef_,
3742
+ "alphas":alphas_
3743
+ }
3744
+ else: # multi-classes
3745
+ if y_pred_proba_final is not None:
3746
+ # fpr, tpr, roc_auc = dict(), dict(), dict()
3747
+ # fpr, tpr, _ = roc_curve(y_true, y_pred_proba_final)
3748
+ confidence_intervals = cal_auc_ci(
3749
+ y_true, y_pred_proba_final, verbose=False, is_binary=is_binary
3750
+ )
3751
+ roc_info = {
3752
+ "fpr": validation_scores_final["fpr"],
3753
+ "tpr": validation_scores_final["tpr"],
3754
+ "auc": validation_scores_final["roc_auc_by_class"],
3755
+ "ci95": confidence_intervals,
3756
+ }
3757
+ # precision-recall curve
3758
+ precision_, recall_, avg_precision_ = cal_precision_recall(
3759
+ y_true, y_pred_proba_final, is_binary=is_binary
3760
+ )
3761
+ pr_info = {
3762
+ "precision": precision_,
3763
+ "recall": recall_,
3764
+ "avg_precision": avg_precision_,
3765
+ }
3766
+ else:
3767
+ roc_info, pr_info = None, None
3768
+
3769
+ if purpose == "classification":
3770
+ dict_pred_stack = {
3771
+ "best_clf": stacking_model,
3772
+ "best_params": None,
3773
+ "auc_indiv": None,
3774
+ "scores": validation_scores_final,
3775
+ "roc_curve": roc_info,
3776
+ "pr_curve": pr_info,
3777
+ "confusion_matrix": confusion_matrix(y_true, y_pred_final),
3778
+ "predictions": y_pred_final.tolist(),
3779
+ "predictions_proba": (
3780
+ y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
3781
+ ),
3782
+ "features":share_col_names,
3783
+ "coef":coef_,
3784
+ "alphas":alphas_
3785
+ }
3786
+ else: # "regression"
3787
+ dict_pred_stack = {
3788
+ "best_clf": stacking_model,
3789
+ "best_params": None,
3790
+ "scores": validation_scores_final, # e.g., neg_MSE, R², etc.
3791
+ "predictions": y_pred_final.tolist(),
3792
+ "predictions_proba": (
3793
+ y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
3794
+ ),
3795
+ "features":share_col_names,
3796
+ "coef":coef_,
3797
+ "alphas":alphas_
3798
+ }
3799
+
3800
+ else:
3801
+ if y_true is None:
3802
+ validation_scores_final = []
3803
+ else:
3804
+ validation_scores_final = cal_metrics(
3805
+ y_true,
3806
+ y_pred,
3807
+ y_pred_proba=y_pred_proba_final,
3808
+ is_binary=is_binary,
3809
+ purpose=purpose,
3810
+ average="weighted",
3811
+ )
3812
+ dict_pred_stack = {
3813
+ "best_clf": stacking_model,
3814
+ "best_params": None,
3815
+ "scores": validation_scores_final,
3816
+ "predictions": y_pred_final.tolist(),
3817
+ "predictions_proba": (
3818
+ y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
3819
+ ),
3820
+ "features":share_col_names,
3821
+ "y_train": y_train if y_train is not None else [],
3822
+ "y_true": y_true if y_true is not None else [],
3823
+ "coef":coef_,
3824
+ "alphas":alphas_
3825
+ }
3826
+ # merge together
3827
+ df_pred = pd.DataFrame(
3828
+ [None] * len(df_results.columns), index=df_results.columns, columns=["stack"]).T
3829
+ for k, v in dict_pred_stack.items():
3830
+ if k in df_pred.columns:
3831
+ df_pred[k] = [v]
3832
+
3833
+ # # plot the stacking
3834
+ # if all([plot_, y_true is not None, purpose == "classification"]):
3835
+ # plot_validate_features_single(df_pred, is_binary=is_binary)
3836
+ # if dir_save:
3837
+ # ips.figsave(dir_save + f"validate_features_stacking_{now_}.pdf")
3838
+ if vote:
3839
+ print(f"⤵ voting...")
3840
+ from sklearn.ensemble import VotingClassifier, VotingRegressor
3841
+ #! voting
3842
+ n_top_models = min(n_top_models, df_results.shape[0])
3843
+ base_estimators=[]
3844
+ for name, cls in zip(list(df_results.iloc[:n_top_models, :].index),df_results.iloc[:n_top_models, :]["best_clf"].tolist()):
3845
+ base_estimators.append((name,cls))
3846
+ # Apply Voting Classifier/Regressor
3847
+ if purpose == "classification":
3848
+ print(f"⤵ VotingClassifier...via{voting}")
3849
+ if voting=='hard':
3850
+ # Hard voting does not support `predict_proba`
3851
+ voting_model = VotingClassifier(estimators=base_estimators)
3852
+ else:
3853
+ # Soft voting supports `predict_proba`
3854
+ voting_model = VotingClassifier(estimators=base_estimators, voting="soft")
3855
+ else:
3856
+ print(f"⤵ VotingRegressor...")
3857
+ voting_model = VotingRegressor(estimators=base_estimators)
3858
+
3859
+ # Train the Voting Classifier/Regressor
3860
+ try:
3861
+ voting_model.fit(x_train, y_train)
3862
+ y_pred_vote = voting_model.predict(x_true)
3863
+ except Exception as e:
3864
+ if purpose == "classification" and not voting=='hard':
3865
+ voting_model = VotingClassifier(estimators=base_estimators)
3866
+ voting_model.fit(x_train, y_train)
3867
+ y_pred_vote = voting_model.predict(x_true)
3868
+
3869
+ # Calculate predicted probabilities if applicable
3870
+ if purpose == "classification":
3871
+ if hasattr(voting_model, "predict_proba"):
3872
+ y_pred_proba_vote = voting_model.predict_proba(x_true)
3873
+ print("Shape of predicted probabilities:", y_pred_proba_vote.shape)
3874
+ if y_pred_proba_vote.shape[1] == 1:
3875
+ y_pred_proba_vote = np.hstack(
3876
+ [1 - y_pred_proba_vote, y_pred_proba_vote]
3877
+ ) # Add missing class probabilities
3878
+ y_pred_proba_vote = y_pred_proba_vote[:, 1]
3879
+ else:
3880
+ y_pred_proba_vote = None
3881
+
3882
+ # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
3883
+ if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
3884
+ if hasattr(best_clf, "alphas_"):
3885
+ alphas_ = best_clf.alphas_
3886
+ elif hasattr(best_clf, "alpha_"):
3887
+ alphas_ = best_clf.alpha_
3888
+ elif hasattr(best_clf, "Cs_"):
3889
+ alphas_ = best_clf.Cs_
3890
+ else:
3891
+ alphas_= None
3892
+ coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
3893
+ else: # Regression
3894
+ y_pred_proba_vote = None
3895
+ coef_,alphas_=None,None
3896
+
3897
+ print(f"⤵ collecting voting results...")
3898
+ #! dict_pred_vote
3899
+ dict_pred_vote = {}
3900
+ validation_scores_vote = {}
3901
+ if y_true is not None and y_pred_proba_vote is not None:
3902
+ validation_scores_vote = cal_metrics(
3903
+ y_true,
3904
+ y_pred_vote,
3905
+ y_pred_proba=y_pred_proba_vote,
3906
+ is_binary=is_binary,
3907
+ purpose=purpose,
3908
+ average="weighted",
3909
+ )
3910
+
3911
+ if is_binary:
3912
+ if y_pred_proba_vote is not None:
3913
+ fpr, tpr, _ = roc_curve(y_true, y_pred_proba_vote)
3914
+ lower_ci, upper_ci = cal_auc_ci(
3915
+ y_true, y_pred_proba_vote, verbose=False, is_binary=is_binary
3916
+ )
3917
+ roc_auc = auc(fpr, tpr)
3918
+ roc_info = {
3919
+ "fpr": fpr.tolist(),
3920
+ "tpr": tpr.tolist(),
3921
+ "auc": roc_auc,
3922
+ "ci95": (lower_ci, upper_ci),
3923
+ }
3924
+ precision_, recall_, _ = cal_precision_recall(y_true, y_pred_proba_vote)
3925
+ avg_precision_ = average_precision_score(y_true, y_pred_proba_vote)
3926
+ pr_info = {
3927
+ "precision": precision_,
3928
+ "recall": recall_,
3929
+ "avg_precision": avg_precision_,
3930
+ }
3931
+ else:
3932
+ roc_info, pr_info = None, None
3933
+
3934
+ dict_pred_vote = {
3935
+ "best_clf": voting_model,
3936
+ "best_params": None,
3937
+ "auc_indiv": None,
3938
+ "scores": validation_scores_vote,
3939
+ "roc_curve": roc_info,
3940
+ "pr_curve": pr_info,
3941
+ "confusion_matrix": confusion_matrix(y_true, y_pred_vote),
3942
+ "predictions": y_pred_vote.tolist(),
3943
+ "predictions_proba": (
3944
+ y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
3945
+ ),
3946
+ "features":share_col_names,
3947
+ "coef":coef_,
3948
+ "alphas":alphas_
3949
+ }
3950
+ else: # Multi-class
3951
+ if y_pred_proba_vote is not None:
3952
+ confidence_intervals = cal_auc_ci(
3953
+ y_true, y_pred_proba_vote, verbose=False, is_binary=is_binary
3954
+ )
3955
+ roc_info = {
3956
+ "fpr": validation_scores_vote["fpr"],
3957
+ "tpr": validation_scores_vote["tpr"],
3958
+ "auc": validation_scores_vote["roc_auc_by_class"],
3959
+ "ci95": confidence_intervals,
3960
+ }
3961
+ precision_, recall_, avg_precision_ = cal_precision_recall(
3962
+ y_true, y_pred_proba_vote, is_binary=is_binary
3963
+ )
3964
+ pr_info = {
3965
+ "precision": precision_,
3966
+ "recall": recall_,
3967
+ "avg_precision": avg_precision_,
3968
+ }
3969
+ else:
3970
+ roc_info, pr_info = None, None
3971
+
3972
+ dict_pred_vote = {
3973
+ "best_clf": voting_model,
3974
+ "best_params": None,
3975
+ "scores": validation_scores_vote,
3976
+ "roc_curve": roc_info,
3977
+ "pr_curve": pr_info,
3978
+ "confusion_matrix": confusion_matrix(y_true, y_pred_vote),
3979
+ "predictions": y_pred_vote.tolist(),
3980
+ "predictions_proba": (
3981
+ y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
3982
+ ),
3983
+ "features":share_col_names,
3984
+ "coef":coef_,
3985
+ "alphas":alphas_
3986
+ }
3987
+ else:
3988
+ if y_true is None:
3989
+ validation_scores_vote = []
3990
+ else:
3991
+ validation_scores_vote = cal_metrics(
3992
+ y_true,
3993
+ y_pred,
3994
+ y_pred_proba=y_pred_proba_vote,
3995
+ is_binary=is_binary,
3996
+ purpose=purpose,
3997
+ average="weighted",
3998
+ )
3999
+ dict_pred_vote = {
4000
+ "best_clf": voting_model,
4001
+ "best_params": None,
4002
+ "scores": validation_scores_vote,
4003
+ "predictions": y_pred_vote.tolist(),
4004
+ "predictions_proba": (
4005
+ y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
4006
+ ),
4007
+ "features":share_col_names,
4008
+ "y_train": y_train if y_train is not None else [],
4009
+ "y_true": y_true if y_true is not None else [],
4010
+ }
4011
+ df_vote = pd.DataFrame(
4012
+ [None] * len(df_results.columns), index=df_results.columns, columns=["vote"]).T
4013
+ for k, v in dict_pred_vote.items():
4014
+ if k in df_vote.columns:
4015
+ df_vote[k] = [v]
4016
+
4017
+ # if all([plot_, y_true is not None, purpose == "classification"]):
4018
+ # try:
4019
+ # plot_validate_features_single(df_vote, is_binary=is_binary)
4020
+ # if dir_save:
4021
+ # ips.figsave(dir_save + f"validate_features_vote_{now_}.pdf")
4022
+ # except Exception as e:
4023
+ # print(e)
4024
+ print("Done")
4025
+ if vote and stack:
4026
+ df_res=pd.concat([df_pred,df_vote, df_results],ignore_index=False,axis=0)
4027
+ elif vote:
4028
+ df_res=pd.concat([df_vote, df_results],ignore_index=False,axis=0)
4029
+ elif stack:
4030
+ df_res=pd.concat([df_pred,df_results],ignore_index=False,axis=0)
4031
+
3218
4032
  if all([plot_, y_true is not None, purpose == "classification"]):
4033
+ from datetime import datetime
4034
+
4035
+ now_ = datetime.now().strftime("%y%m%d_%H%M%S")
3219
4036
  # try:
3220
- if len(models) > 3:
3221
- plot_validate_features(df_results, is_binary=is_binary)
4037
+ if df_res.shape[0] > 3:
4038
+ try:
4039
+ plot_validate_features(df_res, is_binary=is_binary)
4040
+ except Exception as e:
4041
+ print(e)
3222
4042
  else:
3223
- plot_validate_features_single(df_results, is_binary=is_binary)
4043
+ try:
4044
+ plot_validate_features_single(df_res, is_binary=is_binary)
4045
+ except Exception as e:
4046
+ print(e)
3224
4047
  if dir_save:
3225
4048
  ips.figsave(dir_save + f"validate_features{now_}.pdf")
3226
- # except Exception as e:
3227
- # print(f"Error: 在画图的过程中出现了问题:{e}")
3228
- return df_results
3229
-
4049
+ # except Exception as e:
4050
+ # print(f"Error: 在画图的过程中出现了问题:{e}")
4051
+ return df_res
3230
4052
 
3231
4053
  def cal_metrics(
3232
4054
  y_true,
@@ -3368,7 +4190,7 @@ def cal_metrics(
3368
4190
 
3369
4191
 
3370
4192
  def plot_trees(
3371
- X, y, cls, max_trees=500, test_size=0.2, random_state=42, early_stopping_rounds=None
4193
+ X, y, cls:str='random', max_trees=500, test_size=0.2, random_state=42, early_stopping_rounds=None
3372
4194
  ):
3373
4195
  """
3374
4196
  # # Example usage:
@@ -3414,10 +4236,14 @@ def plot_trees(
3414
4236
  train_error_rate = []
3415
4237
  test_error_rate = []
3416
4238
  validation_error = None
3417
-
4239
+ if isinstance(cls, str):
4240
+ cls=ips.strcmp(cls, ["RandomForestClassifier","ExtraTreesClassifier","AdaBoostClassifier","GradientBoostingClassifier"])
3418
4241
  # Configure classifier based on type
3419
4242
  oob_enabled = False # Default to no OOB error unless explicitly set
3420
-
4243
+ clf_support = {"RandomForestClassifier":RandomForestClassifier(),
4244
+ "ExtraTreesClassifier":ExtraTreesClassifier(),
4245
+ "AdaBoostClassifier":AdaBoostClassifier(),
4246
+ "GradientBoostingClassifier":GradientBoostingClassifier()}
3421
4247
  if isinstance(cls, (RandomForestClassifier, ExtraTreesClassifier)):
3422
4248
  # Enable OOB if cls supports it and is using bootstrapping
3423
4249
  cls.set_params(warm_start=True, n_estimators=1)
@@ -3679,7 +4505,7 @@ def img_datasets_preprocessing(
3679
4505
 
3680
4506
 
3681
4507
  def backward_regression(
3682
- X: pd.DataFrame, y: pd.Series, initial_list=[], threshold_out=0.05, verbose=True
4508
+ X: pd.DataFrame, y: pd.Series, initial_list=[], thr=0.05, verbose=True
3683
4509
  ):
3684
4510
  """
3685
4511
  # awesome bit of code from https://www.kaggle.com/code/adibouayjan/house-price-step-by-step-modeling
@@ -3691,31 +4517,46 @@ def backward_regression(
3691
4517
  X -- features values
3692
4518
  y -- target variable
3693
4519
  initial_list -- features header
3694
- threshold_out -- pvalue threshold of features to drop
4520
+ thr -- p-value threshold above which a feature is dropped
3695
4521
  verbose -- true to produce lots of logging output
3696
4522
 
3697
4523
  Returns:
3698
4524
  list of selected features for modeling
3699
4525
  """
3700
4526
  import statsmodels.api as sm
3701
-
3702
- if isinstance(y, str) and y in X.columns:
3703
- y_col_name = y
3704
- y = X[y]
3705
- X = X.drop(y_col_name, axis=1)
4527
+ if isinstance(y, str):
4528
+ if y in X.columns:
4529
+ y_col_name = y
4530
+ y = X[y]
4531
+ X = X.drop(y_col_name, axis=1)
4532
+ else:
4533
+ raise ValueError(f"找不到{y},y设置有误")
4534
+ X = X.select_dtypes(include=[np.number])
4535
+
3706
4536
  included = list(X.columns)
4537
+ try:
4538
+ X=X.astype(float)
4539
+ y=y.astype(float)
4540
+ except Exception as e:
4541
+ raise ValueError(f"无法把数据类型转换成float类型,因而无法进一步进行统计分析: {e}")
4542
+
4543
+
3707
4544
  while True:
3708
4545
  changed = False
4546
+ if not included:
4547
+ print("No features remain in the model.")
4548
+ break
4549
+
3709
4550
  model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
3710
4551
  # exclude the intercept for p-value checking
3711
4552
  pvalues = model.pvalues.iloc[1:]
3712
4553
  worst_pval = pvalues.max()
3713
- if worst_pval > threshold_out:
4554
+ if worst_pval > thr:
3714
4555
  changed = True
3715
4556
  worst_feature = pvalues.idxmax()
3716
4557
  included.remove(worst_feature)
3717
4558
  if verbose:
3718
- print(f"Removing feature '{worst_feature}' with p-value {worst_pval}")
4559
+ print(f"Removing '{worst_feature}' with p-value={round(worst_pval,2)}")
3719
4560
  if not changed:
3720
4561
  break
3721
4562
  print(f"\nSelected Features:\n{included}")