py2ls 0.2.4.24__py3-none-any.whl → 0.2.4.26__py3-none-any.whl

py2ls/ml2ls.py CHANGED
@@ -31,6 +31,7 @@ from sklearn.metrics import (
     average_precision_score,
 )
 from typing import Dict, Any, Optional, List, Union
+import os, json
 import numpy as np
 import pandas as pd
 from . import ips
@@ -49,7 +50,13 @@ logger = logging.getLogger()
 warnings.filterwarnings("ignore", category=UserWarning)
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.neighbors import KNeighborsClassifier
-
+#* set random_state global
+import torch
+import random
+random_state = 1
+random.seed(random_state)
+np.random.seed(random_state)
+torch.manual_seed(random_state)
 
 def features_knn(
     x_train: pd.DataFrame, y_train: pd.Series, knn_params: dict
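Note on the global seeding added above: it covers Python's `random`, NumPy, and the PyTorch CPU RNG, and runs as a side effect at import time. A minimal sketch of a fuller, callable variant (the extra calls are standard PyTorch/CPython APIs, not part of this diff; CUDA seeding is a no-op without a GPU):

import os, random
import numpy as np
import torch

def set_seed(seed: int = 1) -> None:
    # seed every RNG the pipeline may touch
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)          # no-op when CUDA is absent
    os.environ["PYTHONHASHSEED"] = str(seed)  # hash-based ordering

Wrapping the calls in a function avoids seeding as an import side effect and lets callers pick their own seed.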
@@ -594,7 +601,7 @@ def get_features(
     """
     from sklearn.compose import ColumnTransformer
     from sklearn.preprocessing import StandardScaler, OneHotEncoder
-
+    from sklearn.model_selection import train_test_split
     # Ensure X and y are DataFrames/Series for consistency
     if isinstance(X, np.ndarray):
         X = pd.DataFrame(X)
@@ -922,10 +929,26 @@ def get_features(
             "feature_importances": feature_importances,
         }
         if all([plot_, dir_save]):
+
             from datetime import datetime
-
             now_ = datetime.now().strftime("%y%m%d_%H%M%S")
             ips.figsave(dir_save + f"features{now_}.pdf")
+
+            lists = []
+            for tp in ips.flatten(features_df["type"]):
+                lists.append(
+                    features_df
+                    .loc[features_df["type"] == tp, "feature"]
+                    .tolist()
+                )
+            labels = ips.flatten(features_df["type"])
+            # current_fig = plt.gcf()
+            # # ax = current_fig.add_subplot(3, 2, 6)
+            # gs = current_fig.add_gridspec(3, 2)
+            # ax = current_fig.add_subplot(gs[:, :])
+            plt.figure(figsize=[6,6])
+            plot.venn(lists, labels, cmap="coolwarm")
+            ips.figsave(dir_save + f"features{now_}shared_features.pdf")
     else:
         results = {
             "selected_features": pd.DataFrame(),
@@ -1247,22 +1270,25 @@ def plot_validate_features(res_val, is_binary=True, figsize=None):
     nexttile = plot.subplot(figsize=figsize)
     ax = nexttile(subplot_layout[0], subplot_layout[1])
     for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-        fpr = res_val["roc_curve"][model_name]["fpr"]
-        tpr = res_val["roc_curve"][model_name]["tpr"]
-        (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
-        mean_auc = res_val["roc_curve"][model_name]["auc"]
-        plot_roc_curve(
-            fpr,
-            tpr,
-            mean_auc,
-            lower_ci,
-            upper_ci,
-            model_name=model_name,
-            lw=1.5,
-            color=colors[i],
-            alpha=alpha,
-            ax=ax,
-        )
+        try:
+            fpr = res_val["roc_curve"][model_name]["fpr"]
+            tpr = res_val["roc_curve"][model_name]["tpr"]
+            (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
+            mean_auc = res_val["roc_curve"][model_name]["auc"]
+            plot_roc_curve(
+                fpr,
+                tpr,
+                mean_auc,
+                lower_ci,
+                upper_ci,
+                model_name=model_name,
+                lw=1.5,
+                color=colors[i],
+                alpha=alpha,
+                ax=ax,
+            )
+        except Exception as e:
+            print(e)
     plot.figsets(
         sp=2,
         legend=dict(
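The same try/print wrapping recurs around each plotting loop in the hunks below. A hypothetical helper could centralize it while still naming the model that failed (a sketch, not part of the diff):

def try_plot(plot_fn, model_name, *args, **kwargs):
    # run one plotting call; report and skip on failure instead of aborting the figure
    try:
        plot_fn(*args, **kwargs)
    except Exception as e:
        print(f"skipped {model_name}: {e}")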
@@ -1277,16 +1303,19 @@ def plot_validate_features(res_val, is_binary=True, figsize=None):
 
     ax = nexttile(subplot_layout[0], subplot_layout[1])
     for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-        plot_pr_curve(
-            recall=res_val["pr_curve"][model_name]["recall"],
-            precision=res_val["pr_curve"][model_name]["precision"],
-            avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
-            model_name=model_name,
-            color=colors[i],
-            lw=1.5,
-            alpha=alpha,
-            ax=ax,
-        )
+        try:
+            plot_pr_curve(
+                recall=res_val["pr_curve"][model_name]["recall"],
+                precision=res_val["pr_curve"][model_name]["precision"],
+                avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
+                model_name=model_name,
+                color=colors[i],
+                lw=1.5,
+                alpha=alpha,
+                ax=ax,
+            )
+        except Exception as e:
+            print(e)
     plot.figsets(
         sp=2,
         legend=dict(
@@ -1314,22 +1343,25 @@ def plot_validate_features(res_val, is_binary=True, figsize=None):
     for iclass, class_ in enumerate(classes):
         ax = nexttile(subplot_layout[0], subplot_layout[1])
         for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-            fpr = res_val["roc_curve"][model_name]["fpr"][class_]
-            tpr = res_val["roc_curve"][model_name]["tpr"][class_]
-            (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"][iclass]
-            mean_auc = res_val["roc_curve"][model_name]["auc"][class_]
-            plot_roc_curve(
-                fpr,
-                tpr,
-                mean_auc,
-                lower_ci,
-                upper_ci,
-                model_name=model_name,
-                lw=1.5,
-                color=colors[i],
-                alpha=alpha,
-                ax=ax,
-            )
+            try:
+                fpr = res_val["roc_curve"][model_name]["fpr"][class_]
+                tpr = res_val["roc_curve"][model_name]["tpr"][class_]
+                (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"][iclass]
+                mean_auc = res_val["roc_curve"][model_name]["auc"][class_]
+                plot_roc_curve(
+                    fpr,
+                    tpr,
+                    mean_auc,
+                    lower_ci,
+                    upper_ci,
+                    model_name=model_name,
+                    lw=1.5,
+                    color=colors[i],
+                    alpha=alpha,
+                    ax=ax,
+                )
+            except Exception as e:
+                print(e)
         plot.figsets(
             sp=2,
             title=class_,
@@ -1345,18 +1377,21 @@ def plot_validate_features(res_val, is_binary=True, figsize=None):
 
         ax = nexttile(subplot_layout[0], subplot_layout[1])
         for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-            plot_pr_curve(
-                recall=res_val["pr_curve"][model_name]["recall"][iclass],
-                precision=res_val["pr_curve"][model_name]["precision"][iclass],
-                avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
-                    iclass
-                ],
-                model_name=model_name,
-                color=colors[i],
-                lw=1.5,
-                alpha=alpha,
-                ax=ax,
-            )
+            try:
+                plot_pr_curve(
+                    recall=res_val["pr_curve"][model_name]["recall"][iclass],
+                    precision=res_val["pr_curve"][model_name]["precision"][iclass],
+                    avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
+                        iclass
+                    ],
+                    model_name=model_name,
+                    color=colors[i],
+                    lw=1.5,
+                    alpha=alpha,
+                    ax=ax,
+                )
+            except Exception as e:
+                print(e)
         plot.figsets(
             sp=2,
             title=class_,
@@ -1379,37 +1414,41 @@ def plot_validate_features_single(res_val, figsize=None, is_binary=True):
         len(ips.flatten(res_val["pr_curve"].index)), 3, figsize=figsize
     )
     for model_name in ips.flatten(res_val["pr_curve"].index):
-        fpr = res_val["roc_curve"][model_name]["fpr"]
-        tpr = res_val["roc_curve"][model_name]["tpr"]
-        (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
-        mean_auc = res_val["roc_curve"][model_name]["auc"]
-
-        # Plotting
-        plot_roc_curve(
-            fpr,
-            tpr,
-            mean_auc,
-            lower_ci,
-            upper_ci,
-            model_name=model_name,
-            ax=nexttile(),
-        )
-        plot.figsets(title=model_name, sp=2)
+        try:
+            fpr = res_val["roc_curve"][model_name]["fpr"]
+            tpr = res_val["roc_curve"][model_name]["tpr"]
+            (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
+            mean_auc = res_val["roc_curve"][model_name]["auc"]
 
-        plot_pr_binary(
-            recall=res_val["pr_curve"][model_name]["recall"],
-            precision=res_val["pr_curve"][model_name]["precision"],
-            avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
-            model_name=model_name,
-            ax=nexttile(),
-        )
-        plot.figsets(title=model_name, sp=2)
+            # Plotting
+            plot_roc_curve(
+                fpr,
+                tpr,
+                mean_auc,
+                lower_ci,
+                upper_ci,
+                model_name=model_name,
+                ax=nexttile(),
+            )
+            plot.figsets(title=model_name, sp=2)
 
-        # plot cm
-        plot_cm(
-            res_val["confusion_matrix"][model_name], ax=nexttile(), normalize=False
-        )
-        plot.figsets(title=model_name, sp=2)
+            plot_pr_binary(
+                recall=res_val["pr_curve"][model_name]["recall"],
+                precision=res_val["pr_curve"][model_name]["precision"],
+                avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
+                model_name=model_name,
+                ax=nexttile(),
+            )
+            plot.figsets(title=model_name, sp=2)
+
+            # plot cm
+            plot_cm(
+                res_val["confusion_matrix"][model_name], ax=nexttile(), normalize=False
+            )
+            plot.figsets(title=model_name, sp=2)
+
+        except Exception as e:
+            print(e)
     else:
 
         modname_tmp = ips.flatten(res_val["roc_curve"].index)[0]
@@ -1424,22 +1463,25 @@ def plot_validate_features_single(res_val, figsize=None, is_binary=True):
     for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
         ax = nexttile()
         for iclass, class_ in enumerate(classes):
-            fpr = res_val["roc_curve"][model_name]["fpr"][class_]
-            tpr = res_val["roc_curve"][model_name]["tpr"][class_]
-            (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"][iclass]
-            mean_auc = res_val["roc_curve"][model_name]["auc"][class_]
-            plot_roc_curve(
-                fpr,
-                tpr,
-                mean_auc,
-                lower_ci,
-                upper_ci,
-                model_name=class_,
-                lw=1.5,
-                color=colors[iclass],
-                alpha=0.03,
-                ax=ax,
-            )
+            try:
+                fpr = res_val["roc_curve"][model_name]["fpr"][class_]
+                tpr = res_val["roc_curve"][model_name]["tpr"][class_]
+                (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"][iclass]
+                mean_auc = res_val["roc_curve"][model_name]["auc"][class_]
+                plot_roc_curve(
+                    fpr,
+                    tpr,
+                    mean_auc,
+                    lower_ci,
+                    upper_ci,
+                    model_name=class_,
+                    lw=1.5,
+                    color=colors[iclass],
+                    alpha=0.03,
+                    ax=ax,
+                )
+            except Exception as e:
+                print(e)
         plot.figsets(
             sp=2,
             title=model_name,
@@ -1451,18 +1493,21 @@ def plot_validate_features_single(res_val, figsize=None, is_binary=True):
 
         ax = nexttile()
         for iclass, class_ in enumerate(classes):
-            plot_pr_curve(
-                recall=res_val["pr_curve"][model_name]["recall"][iclass],
-                precision=res_val["pr_curve"][model_name]["precision"][iclass],
-                avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
-                    iclass
-                ],
-                model_name=class_,
-                color=colors[iclass],
-                lw=1.5,
-                alpha=0.03,
-                ax=ax,
-            )
+            try:
+                plot_pr_curve(
+                    recall=res_val["pr_curve"][model_name]["recall"][iclass],
+                    precision=res_val["pr_curve"][model_name]["precision"][iclass],
+                    avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
+                        iclass
+                    ],
+                    model_name=class_,
+                    color=colors[iclass],
+                    lw=1.5,
+                    alpha=0.03,
+                    ax=ax,
+                )
+            except Exception as e:
+                print(e)
         plot.figsets(
             sp=2,
             title=class_,
@@ -1543,15 +1588,12 @@ def cal_auc_ci(
         # print("Bootstrap #{} ROC area: {:0.3f}".format(i + 1, score))
     sorted_scores = np.array(bootstrapped_scores)
     sorted_scores.sort()
-
-    # Computing the lower and upper bound of the 90% confidence interval
-    # You can change the bounds percentiles to 0.025 and 0.975 to get
-    # a 95% confidence interval instead.
+
    confidence_lower = sorted_scores[int((1 - ci) * len(sorted_scores))]
    confidence_upper = sorted_scores[int(ci * len(sorted_scores))]
    if verbose:
        print(
-            "Confidence interval for the score: [{:0.3f} - {:0.3}]".format(
+            "Confidence interval for the score: [{:0.3f} - {:0.3f}]".format(
                confidence_lower, confidence_upper
            )
        )
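A note the removed comment used to make: with `ci=0.95`, the two indices above pick the 5th and 95th percentiles of the bootstrap scores, i.e. a two-sided 90% interval, not 95%. A sketch of a symmetric 95% CI using `np.percentile` (plain NumPy, not part of the diff):

import numpy as np

alpha = 1 - 0.95  # for a 95% two-sided interval
lower, upper = np.percentile(sorted_scores, [100 * alpha / 2, 100 * (1 - alpha / 2)])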
@@ -1568,11 +1610,8 @@ def cal_auc_ci(
         y_true, classes=np.unique(y_true)
     )  # One-vs-Rest transformation
     n_classes = y_true_bin.shape[1]  # Number of classes
-
-    bootstrapped_scores = np.zeros(
-        (n_classes, n_bootstraps)
-    )  # Store scores for each class
-
+
+    bootstrapped_scores = np.full((n_classes, n_bootstraps), np.nan)
     if verbose:
         print("AUROC scores for each class:")
     for i in range(n_classes):
@@ -1592,15 +1631,24 @@ def cal_auc_ci(
     # Calculating the confidence intervals for each class
     confidence_intervals = []
     for class_idx in range(n_classes):
-        sorted_scores = np.sort(bootstrapped_scores[class_idx])
-        confidence_lower = sorted_scores[int((1 - ci) * len(sorted_scores))]
-        confidence_upper = sorted_scores[int(ci * len(sorted_scores))]
-        confidence_intervals.append((confidence_lower, confidence_upper))
-
-        if verbose:
-            print(
-                f"Class {class_idx} - Confidence interval: [{confidence_lower:.3f} - {confidence_upper:.3f}]"
-            )
+        # rm nan
+        valid_scores = bootstrapped_scores[class_idx][
+            ~np.isnan(bootstrapped_scores[class_idx])
+        ]
+        if len(valid_scores) > 0:
+            sorted_scores = np.sort(valid_scores)
+            confidence_lower = sorted_scores[int((1 - ci) * len(sorted_scores))]
+            confidence_upper = sorted_scores[int(ci * len(sorted_scores))]
+            confidence_intervals[class_idx] = (confidence_lower, confidence_upper)
+
+            if verbose:
+                print(
+                    f"Class {class_idx} - Confidence interval: [{confidence_lower:.3f} - {confidence_upper:.3f}]"
+                )
+        else:
+            confidence_intervals[class_idx] = (np.nan, np.nan)
+            if verbose:
+                print(f"Class {class_idx} - Confidence interval: [NaN - NaN]")
 
     return confidence_intervals
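One caveat in the new per-class branch: `confidence_intervals` is still initialized as an empty list, so the index assignment `confidence_intervals[class_idx] = ...` would raise `IndexError` on the first iteration. Pre-sizing the list (or appending, as the old code did) avoids this; a sketch:

# pre-size so confidence_intervals[class_idx] = ... is a valid assignment
confidence_intervals = [(np.nan, np.nan)] * n_classes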
 
@@ -2057,20 +2105,20 @@ def rank_models(
 
     def generate_bar_plot(ax, cv_test_scores):
         ax = plot.plotxy(
-            y="Classifier", x="combined_score", data=cv_test_scores, kind="bar"
+            y="Classifier", x="combined_score", data=cv_test_scores, kind_="bar"
         )
         plt.title("Classifier Performance")
         plt.tight_layout()
         return plt
 
-    nexttile = plot.subplot(2, 2, figsize=[10, 7])
+    nexttile = plot.subplot(2, 2, figsize=[10, 10])
     generate_bar_plot(nexttile(), top_models.dropna())
     plot.radar(
         ax=nexttile(projection="polar"),
         data=cv_test_scores.set_index("Classifier"),
-        ylim=[0.5, 1],
-        color=plot.get_color(10),
-        alpha=0.05,
+        ylim=[0, 1],
+        color=plot.get_color(cv_test_scores.set_index("Classifier").shape[1]),
+        alpha=0.02,
         circular=1,
     )
     return cv_test_scores
@@ -2206,6 +2254,8 @@ def predict(
     y_train: pd.Series,
     x_true: pd.DataFrame = None,
     y_true: Optional[pd.Series] = None,
+    fill_missing:bool = True,
+    scaler:str='standard',  # ["standard", "minmax", "robust", "maxabs"]
     backward: bool = False,  # backward_regression
     backward_thr:float = 0.05,  # pval thr, only works when backward is True
     common_features: set = None,
@@ -2214,7 +2264,7 @@ def predict(
     metrics: Optional[List[str]] = None,
     stack:bool=True,  # run stacking
     stacking_cv:bool=False,  # stacking cross_validate, default (False), keep it simple
-    vote:bool=True,  # run voting
+    vote:bool=False,  # run voting
     voting:str="hard",  # only for the classification purpose of voting
     n_top_models:int=5,  # for stacking models
     n_models_per_category:int=1,  # for stacking models; allows 2 models from the same category
@@ -2227,7 +2277,12 @@ def predict(
     cv_level: str = "l",  # "s":'low', "m":'medium', "l":"high"
     class_weight: str = "balanced",
     random_state: int = 1,
+    presets = "best_quality",  # specific for autogluon
+    time_limit=600,  # specific for autogluon
+    num_bag_folds=5,  # specific for autogluon
+    num_stack_levels=2,  # specific for autogluon
     verbose: bool = False,
+    **kwargs
 ) -> pd.DataFrame:
     """
     Three usage cases: (1) internal train/test split, (2) direct prediction, (3) external validation.
@@ -2278,28 +2333,20 @@ def predict(
         RandomForestRegressor,
         ExtraTreesClassifier,
         ExtraTreesRegressor,
+        HistGradientBoostingRegressor,
         BaggingClassifier,
         BaggingRegressor,
         AdaBoostClassifier,
         AdaBoostRegressor,
     )
-    from sklearn.svm import SVC, SVR
-    from sklearn.tree import DecisionTreeRegressor
+    from sklearn.svm import SVC, SVR, LinearSVR, NuSVR
+    from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
     from sklearn.linear_model import (
-        LogisticRegression,
-        ElasticNet,
-        ElasticNetCV,
-        LinearRegression,
-        Lasso,
-        RidgeClassifierCV,
-        Perceptron,
-        SGDClassifier,
-        RidgeCV,
-        Ridge,
-        TheilSenRegressor,
-        HuberRegressor,
-        PoissonRegressor,
-
+        LogisticRegression, ElasticNet, ElasticNetCV,
+        LinearRegression, Lasso, RidgeClassifierCV, Perceptron, SGDClassifier,
+        RidgeCV, Ridge, TheilSenRegressor, HuberRegressor, PoissonRegressor, Lars, LassoLars, BayesianRidge,
+        GammaRegressor, TweedieRegressor, LassoCV, LassoLarsCV, LarsCV,
+        OrthogonalMatchingPursuit, OrthogonalMatchingPursuitCV, PassiveAggressiveRegressor
     )
     from sklearn.compose import TransformedTargetRegressor
     from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
@@ -2316,15 +2363,21 @@ def predict(
     )
     from sklearn.preprocessing import PolynomialFeatures
     from sklearn.model_selection import train_test_split
-
+
+    from sklearn.gaussian_process import GaussianProcessRegressor
+    from sklearn.kernel_ridge import KernelRidge
+    from sklearn.dummy import DummyRegressor
+    from autogluon.tabular import TabularPredictor
     # spelling check
     purpose = ips.strcmp(purpose, ["classification", "regression"])[0]
     print(f"{purpose} processing...")
+
+
     # Default models or regressors if not provided
     if purpose == "classification":
         model_ = {
             "Random Forest": RandomForestClassifier(
-                random_state=random_state, class_weight=class_weight
+                random_state=random_state, class_weight=class_weight, n_jobs=n_jobs
             ),
             # SVC (Support Vector Classification)
             "SVM": SVC(
@@ -2335,7 +2388,7 @@ def predict(
             ),
             # fit the best model without enforcing sparsity, which means it does not directly perform feature selection.
             "Logistic Regression": LogisticRegression(
-                class_weight=class_weight, random_state=random_state
+                class_weight=class_weight, random_state=random_state, n_jobs=n_jobs
             ),
             # Logistic Regression with L1 Regularization (Lasso)
             "Lasso Logistic Regression": LogisticRegression(
@@ -2346,51 +2399,70 @@ def predict(
                 eval_metric="logloss",
                 random_state=random_state,
             ),
-            "KNN": KNeighborsClassifier(n_neighbors=5),
+            "KNN": KNeighborsClassifier(n_neighbors=5, n_jobs=n_jobs),
             "Naive Bayes": GaussianNB(),
             "Linear Discriminant Analysis": LinearDiscriminantAnalysis(),
             "AdaBoost": AdaBoostClassifier(
                 algorithm="SAMME", random_state=random_state
             ),
-            # "LightGBM": lgb.LGBMClassifier(random_state=random_state, class_weight=class_weight),
+            "LightGBM": lgb.LGBMClassifier(random_state=random_state, class_weight=class_weight, n_jobs=n_jobs),
             "CatBoost": cb.CatBoostClassifier(verbose=0, random_state=random_state),
             "Extra Trees": ExtraTreesClassifier(
-                random_state=random_state, class_weight=class_weight
+                random_state=random_state, class_weight=class_weight, n_jobs=n_jobs
             ),
-            "Bagging": BaggingClassifier(random_state=random_state),
+            "Bagging": BaggingClassifier(random_state=random_state, n_jobs=n_jobs),
             "Neural Network": MLPClassifier(max_iter=500, random_state=random_state),
             "DecisionTree": DecisionTreeClassifier(),
             "Quadratic Discriminant Analysis": QuadraticDiscriminantAnalysis(),
             "Ridge": RidgeClassifierCV(
                 class_weight=class_weight, store_cv_results=True
             ),
-            "Perceptron": Perceptron(random_state=random_state),
+            "Perceptron": Perceptron(random_state=random_state, n_jobs=n_jobs),
             "Bernoulli Naive Bayes": BernoulliNB(),
-            "SGDClassifier": SGDClassifier(random_state=random_state),
+            "SGDClassifier": SGDClassifier(random_state=random_state, n_jobs=n_jobs),
         }
     elif purpose == "regression":
         model_ = {
-            "Random Forest": RandomForestRegressor(random_state=random_state),
+            "Random Forest": RandomForestRegressor(random_state=random_state, n_jobs=n_jobs),
             "SVM": SVR(),  # SVR (Support Vector Regression)
-            # "Lasso": Lasso(random_state=random_state),  # same as LassoCV, but alpha must be given explicitly
-            "LassoCV": LassoCV(
-                cv=cv_folds, random_state=random_state
-            ),  # LassoCV finds the best alpha automatically; preferable to Lasso
+            "LassoCV": LassoCV(cv=cv_folds, random_state=random_state, n_jobs=n_jobs),  # LassoCV finds the best alpha automatically; preferable to Lasso
             "Gradient Boosting": GradientBoostingRegressor(random_state=random_state),
-            "XGBoost": xgb.XGBRegressor(eval_metric="rmse", random_state=random_state),
-            "Linear Regression": LinearRegression(),
+            "XGBoost": xgb.XGBRegressor(eval_metric="rmse", random_state=random_state, n_jobs=n_jobs),
+            "Linear Regression": LinearRegression(n_jobs=n_jobs),
             "AdaBoost": AdaBoostRegressor(random_state=random_state),
-            # "LightGBM": lgb.LGBMRegressor(random_state=random_state),
+            "LightGBM": lgb.LGBMRegressor(random_state=random_state, n_jobs=n_jobs, force_row_wise=True),  # or use force_col_wise=True if memory is a concern
             "CatBoost": cb.CatBoostRegressor(verbose=0, random_state=random_state),
-            "Extra Trees": ExtraTreesRegressor(random_state=random_state),
-            "Bagging": BaggingRegressor(random_state=random_state),
+            "Extra Trees": ExtraTreesRegressor(random_state=random_state, n_jobs=n_jobs),
+            "Bagging": BaggingRegressor(random_state=random_state, n_jobs=n_jobs),
             "Neural Network": MLPRegressor(max_iter=500, random_state=random_state),
             "ElasticNet": ElasticNet(random_state=random_state),
-            "Ridge": Ridge(),
-            "KNN": KNeighborsRegressor(),
-            "TheilSen": TheilSenRegressor(),
+            "Ridge": Ridge(random_state=random_state),
+            "KNN": KNeighborsRegressor(n_jobs=n_jobs),
+            "TheilSen": TheilSenRegressor(n_jobs=n_jobs),
             "Huber": HuberRegressor(),
-            "Poisson": PoissonRegressor()
+            "Poisson": PoissonRegressor(), "LinearRegression": LinearRegression(),
+            "Lasso": Lasso(random_state=random_state),
+            "Lars": Lars(),
+            "LassoLars": LassoLars(),
+            "BayesianRidge": BayesianRidge(),
+            "GammaRegressor": GammaRegressor(),
+            "TweedieRegressor": TweedieRegressor(),
+            "LassoCV": LassoCV(random_state=random_state, n_jobs=n_jobs),
+            "ElasticNetCV": ElasticNetCV(random_state=random_state, n_jobs=n_jobs),
+            "LassoLarsCV": LassoLarsCV(n_jobs=n_jobs),
+            "LarsCV": LarsCV(),
+            "OrthogonalMatchingPursuit": OrthogonalMatchingPursuit(),
+            "OrthogonalMatchingPursuitCV": OrthogonalMatchingPursuitCV(n_jobs=n_jobs),
+            "PassiveAggressiveRegressor": PassiveAggressiveRegressor(random_state=random_state),
+            "LinearSVR": LinearSVR(random_state=random_state),
+            "NuSVR": NuSVR(),
+            "DecisionTreeRegressor": DecisionTreeRegressor(random_state=random_state),
+            "ExtraTreeRegressor": ExtraTreeRegressor(random_state=random_state),
+            "HistGradientBoostingRegressor": HistGradientBoostingRegressor(random_state=random_state),
+            "GaussianProcessRegressor": GaussianProcessRegressor(),
+            "KernelRidge": KernelRidge(),
+            "DummyRegressor": DummyRegressor(),
+            "TransformedTargetRegressor": TransformedTargetRegressor(regressor=LinearRegression())
         }
     if cls is None:
         models = model_
@@ -2407,10 +2479,17 @@ def predict(
         ips.df_special_characters_cleaner(x_true) if x_true is not None else None
     )
 
+    # only keep "autogluon_tab" in models
+    cls = [cls] if isinstance(cls, str) else cls
+
+    if cls is not None:
+        models = {"autogluon_tab": None} if "auto" in cls else models
+
     # indicate cls:
     if ips.run_once_within(30):  # 10 min
         print(f"processing: {list(models.keys())}")
-
+    y_train_col_name = None
+    # print(isinstance(y_train, str) and y_train in x_train.columns)
     if isinstance(y_train, str) and y_train in x_train.columns:
         y_train_col_name = y_train
         y_train = x_train[y_train]
2418
2497
  x_train = x_train.drop(y_train_col_name, axis=1)
2419
2498
  # else:
2420
2499
  # y_train = ips.df_encoder(pd.DataFrame(y_train), method="dummy").values.ravel()
2500
+
2421
2501
  y_train = pd.DataFrame(y_train)
2422
2502
  if y_train.select_dtypes(include=np.number).empty:
2423
2503
  y_train_ = ips.df_encoder(y_train, method="dummy", drop=None)
@@ -2430,6 +2510,9 @@ def predict(
2430
2510
  y_train = ips.df_encoder(pd.DataFrame(y_train), method="label")
2431
2511
  print("is_binary:", is_binary)
2432
2512
 
2513
+ if fill_missing:
2514
+ ips.df_fillna(data=x_train, method="knn", inplace=True, axis=0)
2515
+ ips.df_fillna(data=y_train, method="knn", inplace=True, axis=0)
2433
2516
  # Perform backward feature selection
2434
2517
  if backward:
2435
2518
  selected_features = backward_regression(x_train, y_train, thr=backward_thr)
@@ -2458,6 +2541,8 @@ def predict(
             pd.DataFrame(y_train), method="label"
         ).values.ravel()
 
+    if fill_missing:
+        ips.df_fillna(data=x_true, method="knn", inplace=True, axis=0)
     if y_true is not None:
         if isinstance(y_true, str) and y_true in x_true.columns:
             y_true_col_name = y_true
@@ -2490,11 +2575,16 @@ def predict(
     # Ensure common features are selected
     if common_features is not None:
         x_train, x_true = x_train[common_features], x_true[common_features]
+        share_col_names = common_features
     else:
         share_col_names = ips.shared(x_train.columns, x_true.columns, verbose=verbose)
         x_train, x_true = x_train[share_col_names], x_true[share_col_names]
 
-    x_train, x_true = ips.df_scaler(x_train), ips.df_scaler(x_true)
+    #! scaler
+    # fit the scaler on x_train, then export it to transform x_true
+    x_train, scaler_ = ips.df_scaler(x_train, method=scaler, return_scaler=True)
+    #
+    x_true = ips.df_scaler(x_true, scaler=scaler_)  # make sure the same scaler is used
     x_train, x_true = ips.df_encoder(x_train, method="dummy"), ips.df_encoder(
         x_true, method="dummy"
    )
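The scaler change above mirrors the standard fit-on-train / transform-on-test pattern; the removed line fit a second scaler on `x_true`, so the two sets were transformed with different statistics. A minimal sketch in plain sklearn (assuming `ips.df_scaler` wraps an equivalent transformer):

from sklearn.preprocessing import StandardScaler

scaler_ = StandardScaler().fit(x_train)     # statistics come from training data only
x_train_scaled = scaler_.transform(x_train)
x_true_scaled = scaler_.transform(x_true)   # reuse the same mean/std on new data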
@@ -2516,18 +2606,261 @@ def predict(
     if isinstance(y_train, np.ndarray):
         y_true = ips.df_encoder(data=pd.DataFrame(y_true), method="label")
         y_true = np.asarray(y_true)
-    # Hyperparameter grids for tuning
-    param_grid_common_xgb = {
-        'learning_rate': [0.01, 0.1, 0.2, 0.3],
-        'max_depth': [3, 5, 7, 10],
-        'n_estimators': [50, 100, 200, 300],
-        'subsample': [0.6, 0.8, 1.0],
-        'colsample_bytree': [0.6, 0.8, 1.0],
-        'gamma': [0, 0.1, 0.2, 0.5],
-        'min_child_weight': [1, 5, 10],
-        'reg_alpha': [0, 0.1, 0.5, 1],  # L1 regularization term
-        'reg_lambda': [1, 1.5, 2],  # L2 regularization term
+    #! so far, got the: x_train, x_true, y_train, y_true
+    # Grid search with KFold or StratifiedKFold
+    if "autogluon_tab" in models:
+        # load hyper_param
+        f_param = os.path.dirname(os.path.abspath(__file__))
+        f_param = f_param + "/data/hyper_param_autogluon_zeroshot2024.json"
+        with open(f_param, "r") as file:
+            hyper_param_autogluon = json.load(file)
+        # Train the model with AutoGluon
+        features = x_train.columns.tolist()
+        label = y_train_col_name if y_train_col_name is not None else 'target'
+        df_autogluon = x_train.copy()
+        df_autogluon[label] = y_train
+        autogluon_presets = ["best_quality", "good_quality", "fast_train"]
+        best_clf = TabularPredictor(label=label, path=os.path.join(dir_save, "model_autogluon")).fit(
+            train_data=df_autogluon,
+            presets=ips.strcmp(presets, autogluon_presets)[0],  # 'best_quality' or 'good_quality' or 'fast_train'
+            time_limit=time_limit,  # 3600, # in sec: limit training time
+            num_bag_folds=num_bag_folds,
+            num_stack_levels=num_stack_levels,
+            hyperparameters=hyper_param_autogluon,
+            verbosity=1 if verbose else 0,
+            **kwargs
+        )
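For reference, a minimal standalone version of the `TabularPredictor` calls used above (assumes `autogluon.tabular` is installed; the DataFrame, path, and preset are illustrative only):

from autogluon.tabular import TabularPredictor
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({"x1": rng.normal(size=100), "x2": rng.normal(size=100)})
df["target"] = (df["x1"] + df["x2"] > 0).astype(int)

predictor = TabularPredictor(label="target", path="model_autogluon").fit(
    train_data=df, presets="medium_quality", time_limit=60
)
print(predictor.leaderboard())                         # models ranked by validation score
preds = predictor.predict(df.drop(columns="target"))   # best model is used by default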
+        #! Get the leaderboard
+        gs = {}
+        # Display the leaderboard for reference
+        leaderboard = best_clf.leaderboard()
+        gs['info'] = best_clf.info()
+        # gs["res"] = best_clf
+        gs["features"] = features
+        gs["leaderboard"] = leaderboard
+        best_model_name = leaderboard.iloc[0, 0]  # First row, first column contains the model name
+        # Store the best model and its details in the gs dictionary
+        gs["best_estimator_"] = best_model_name  # Store the trained model, not just the name
+        gs["best_params_"] = best_model_name  # Hyperparameters
+        # Make predictions if x_true is provided
+        if x_true is not None:
+            x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
+            gs["predictions"] = best_clf.predict(x_true[features], model=None)  # model=None selects the best
+            gs["predict_proba"] = best_clf.predict_proba(x_true[features]) if purpose == 'classification' else None
+            x_true[label] = gs["predictions"]
+            if gs["predictions"].value_counts().shape[0] > 1:
+                gs['evaluate'] = best_clf.evaluate(x_true[features + [label]])
+            gs["models"] = leaderboard["model"].tolist()  # best_clf.model_names()
+            all_models = gs["models"]
+            model_evaluations = {}
+            for model in all_models:
+                predictions = best_clf.predict(x_true[features], model=model)
+                evaluation = best_clf.evaluate_predictions(
+                    y_true=x_true[label],  # True labels
+                    y_pred=predictions,  # Predictions from the specific model
+                    auxiliary_metrics=True,  # Include additional metrics if needed
+                )
+                model_evaluations[model] = evaluation
+            gs["scores"] = pd.DataFrame.from_dict(model_evaluations, orient='index')
+            #! try to keep the same output format
+            results = {}
+            for model in all_models:
+                y_pred = best_clf.predict(x_true[features], model=model).tolist()
+                y_pred_proba = best_clf.predict_proba(x_true[features], model=model) if purpose == 'classification' else None
+
+                if isinstance(y_pred_proba, pd.DataFrame):
+                    y_pred_proba = y_pred_proba.iloc[:, 1]
+
+                # try to make prediction format consistent
+                try:
+                    y_pred = [i[0] for i in y_pred]
+                except:
+                    pass
+                try:
+                    y_true = [i[0] for i in y_true]
+                except:
+                    pass
+                try:
+                    y_train = [i[0] for i in y_train]
+                except:
+                    pass
+                validation_scores = {}
+                if y_true is not None and y_pred_proba is not None:
+                    validation_scores = cal_metrics(
+                        y_true,
+                        y_pred,
+                        y_pred_proba=y_pred_proba,
+                        is_binary=is_binary,
+                        purpose=purpose,
+                        average="weighted",
+                    )
+                    if is_binary:
+                        # Calculate ROC curve
+                        # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
+                        if y_pred_proba is not None:
+                            # fpr, tpr, roc_auc = dict(), dict(), dict()
+                            fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
+                            lower_ci, upper_ci = cal_auc_ci(
+                                y_true, y_pred_proba, verbose=False, is_binary=is_binary
+                            )
+                            roc_auc = auc(fpr, tpr)
+                            roc_info = {
+                                "fpr": fpr.tolist(),
+                                "tpr": tpr.tolist(),
+                                "auc": roc_auc,
+                                "ci95": (lower_ci, upper_ci),
+                            }
+                            # precision-recall curve
+                            precision_, recall_, _ = cal_precision_recall(y_true, y_pred_proba)
+                            avg_precision_ = average_precision_score(y_true, y_pred_proba)
+                            pr_info = {
+                                "precision": precision_,
+                                "recall": recall_,
+                                "avg_precision": avg_precision_,
+                            }
+                        else:
+                            roc_info, pr_info = None, None
+                        if purpose == "classification":
+                            results[model] = {
+                                # "best_clf": gs.best_estimator_,
+                                # "best_params": gs.best_params_,
+                                # "auc_indiv": [
+                                #     gs.cv_results_[f"split{i}_test_score"][gs.best_index_]
+                                #     for i in range(cv_folds)
+                                # ],
+                                "scores": validation_scores,
+                                "roc_curve": roc_info,
+                                "pr_curve": pr_info,
+                                "confusion_matrix": confusion_matrix(y_true, y_pred),
+                                "predictions": y_pred,  # .tolist(),
+                                "predictions_proba": (
+                                    y_pred_proba.tolist() if y_pred_proba is not None else None
+                                ),
+                                "features": features,
+                                # "coef": coef_,
+                                # "alphas": alphas_
+                            }
+                        else:  # "regression"
+                            results[model] = {
+                                # "best_clf": gs.best_estimator_,
+                                # "best_params": gs.best_params_,
+                                "scores": validation_scores,  # e.g., neg_MSE, R², etc.
+                                "predictions": y_pred,  # .tolist(),
+                                "predictions_proba": (
+                                    y_pred_proba.tolist() if y_pred_proba is not None else None
+                                ),
+                                "features": features,
+                                # "coef": coef_,
+                                # "alphas": alphas_
+                            }
+                    else:  # multi-classes
+                        if y_pred_proba is not None:
+                            # fpr, tpr, roc_auc = dict(), dict(), dict()
+                            # fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
+                            confidence_intervals = cal_auc_ci(
+                                y_true, y_pred_proba, verbose=False, is_binary=is_binary
+                            )
+                            roc_info = {
+                                "fpr": validation_scores["fpr"],
+                                "tpr": validation_scores["tpr"],
+                                "auc": validation_scores["roc_auc_by_class"],
+                                "ci95": confidence_intervals,
+                            }
+                            # precision-recall curve
+                            precision_, recall_, avg_precision_ = cal_precision_recall(
+                                y_true, y_pred_proba, is_binary=is_binary
+                            )
+                            pr_info = {
+                                "precision": precision_,
+                                "recall": recall_,
+                                "avg_precision": avg_precision_,
+                            }
+                        else:
+                            roc_info, pr_info = None, None
+
+                        if purpose == "classification":
+                            results[model] = {
+                                # "best_clf": gs.best_estimator_,
+                                # "best_params": gs.best_params_,
+                                # "auc_indiv": [
+                                #     gs.cv_results_[f"split{i}_test_score"][gs.best_index_]
+                                #     for i in range(cv_folds)
+                                # ],
+                                "scores": validation_scores,
+                                "roc_curve": roc_info,
+                                "pr_curve": pr_info,
+                                "confusion_matrix": confusion_matrix(y_true, y_pred),
+                                "predictions": y_pred,  # .tolist(),
+                                "predictions_proba": (
+                                    y_pred_proba.tolist() if y_pred_proba is not None else None
+                                ),
+                                "features": features,
+                                # "coef": coef_,
+                                # "alphas": alphas_
+                            }
+                        else:  # "regression"
+                            results[model] = {
+                                # "best_clf": gs.best_estimator_,
+                                # "best_params": gs.best_params_,
+                                "scores": validation_scores,  # e.g., neg_MSE, R², etc.
+                                "predictions": y_pred,  # .tolist(),
+                                "predictions_proba": (
+                                    y_pred_proba.tolist() if y_pred_proba is not None else None
+                                ),
+                                "features": features,
+                                # "coef": coef_,
+                                # "alphas": alphas_
+                            }
+
+                else:
+                    if y_true is None:
+                        validation_scores = []
+                    else:
+                        validation_scores = cal_metrics(
+                            y_true,
+                            y_pred,
+                            y_pred_proba=y_pred_proba,
+                            is_binary=is_binary,
+                            purpose=purpose,
+                            average="weighted",
+                        )
+                    results[model] = {
+                        # "best_clf": gs.best_estimator_,
+                        # "best_params": gs.best_params_,
+                        "scores": validation_scores,
+                        "predictions": y_pred,  # .tolist(),
+                        "predictions_proba": (
+                            y_pred_proba.tolist() if y_pred_proba is not None else None
+                        ),
+                        "features": features,
+                        "y_train": y_train if y_train is not None else [],
+                        "y_true": y_true if y_true is not None else [],
+                        # "coef": coef_,
+                        # "alphas": alphas_
                     }
+        df_results = pd.DataFrame.from_dict(results, orient="index")
+        gs['res'] = df_results
+
+        if all([plot_, y_true is not None, purpose == "classification"]):
+            from datetime import datetime
+
+            now_ = datetime.now().strftime("%y%m%d_%H%M%S")
+            # try:
+            if df_results.shape[0] > 3:
+                try:
+                    plot_validate_features(df_results, is_binary=is_binary)
+                except Exception as e:
+                    print(e)
+            else:
+                try:
+                    plot_validate_features_single(df_results, is_binary=is_binary)
+                except Exception as e:
+                    print(e)
+            if dir_save:
+                ips.figsave(dir_save + f"validate_features{now_}.pdf")
+        return gs
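The repeated `[i[0] for i in ...]` / bare-`except` unwrapping in this branch flattens column vectors into scalars. A compact equivalent, assuming the entries are scalars or length-1 sequences (a sketch, not part of the diff):

import numpy as np

y_pred = np.ravel(y_pred).tolist()  # [[1], [0]] -> [1, 0]; an already-flat [1, 0] is unchanged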
+
+    #! cross_valid
     if cv_level in ["low", "simple", "s", "l"]:
         param_grids = {
             "Random Forest": (
@@ -2696,7 +3029,73 @@ def predict(
                 'alpha': [0.1],
                 'max_iter': [100],},
             "Poisson":{'alpha': [0.1],
-                'max_iter': [100],}
+                'max_iter': [100],},
+            "Lars": {"n_nonzero_coefs": [10, 50, None]},
+            "LassoLars": {
+                "alpha": [0.01, 0.1, 1]
+            },
+            "BayesianRidge": {
+                "alpha_1": [1e-6, 1e-4, 1e-2],
+                "lambda_1": [1e-6, 1e-4, 1e-2]
+            },
+            "GammaRegressor": {
+                "alpha": [0.1, 1, 10]
+            },
+            "TweedieRegressor": {
+                "alpha": [0.1, 1, 10],
+                "power": [1, 1.5, 2]
+            },
+            "LassoCV": {
+                "cv": [5]
+            },
+            "ElasticNetCV": {
+                "l1_ratio": [0.2, 0.5, 0.8],
+                "cv": [5]
+            },
+            "LassoLarsCV": {
+                "cv": [5]
+            },
+            "LarsCV": {
+                "cv": [5]
+            },
+            "OrthogonalMatchingPursuit": {
+                "n_nonzero_coefs": [10, 50, None]
+            },
+            "OrthogonalMatchingPursuitCV": {
+                "cv": [5]
+            },
+            "PassiveAggressiveRegressor": {
+                "C": [0.1, 1, 10]
+            },
+            "LinearSVR": {
+                "C": [0.1, 1, 10]
+            },
+            "NuSVR": {
+                "C": [0.1, 1, 10]
+            },
+            "DecisionTreeRegressor": {
+                "max_depth": [5, 10, None]
+            },
+            "ExtraTreeRegressor": {
+                "max_depth": [5, 10, None]
+            },
+            "HistGradientBoostingRegressor": {
+                "learning_rate": [0.05, 0.1, 0.2],
+                "max_depth": [5, 10, None]
+            },
+            "GaussianProcessRegressor": {
+                "alpha": [1e-5, 1e-2, 0.1]
+            },
+            "KernelRidge": {
+                "alpha": [0.1, 1, 10],
+                "kernel": ["linear", "rbf"]
+            },
+            "DummyRegressor": {
+                "strategy": ["mean", "median"]
+            },
+            "TransformedTargetRegressor": {
+                "regressor__fit_intercept": [True, False]
+            }
         }
     elif cv_level in ["high", "advanced", "h"]:
         param_grids = {
@@ -2901,7 +3300,96 @@ def predict(
                 'alpha': [0.1, 1.0, 10.0],
                 'max_iter': [100, 200, 300],},
             "Poisson":{'alpha': [0.1, 1.0, 10.0],
-                'max_iter': [100, 200, 300],}
+                'max_iter': [100, 200, 300],},
+            "Lars": {
+                "n_nonzero_coefs": [10, 50, 100, 200, None]
+            },
+            "LassoLars": {
+                "alpha": [0.001, 0.01, 0.1, 1, 10]
+            },
+            "BayesianRidge": {
+                "alpha_1": [1e-6, 1e-5, 1e-4],
+                "alpha_2": [1e-6, 1e-5, 1e-4],
+                "lambda_1": [1e-6, 1e-5, 1e-4],
+                "lambda_2": [1e-6, 1e-5, 1e-4]
+            },
+            "GammaRegressor": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "max_iter": [1000, 5000, 10000]
+            },
+            "TweedieRegressor": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "power": [0, 1, 1.5, 2, 3]
+            },
+            "LassoCV": {
+                "alphas": [[0.001, 0.01, 0.1, 1, 10]],
+                "cv": [3, 5, 10]
+            },
+            "ElasticNetCV": {
+                "l1_ratio": [0.1, 0.5, 0.7, 0.9, 1],
+                "alphas": [[0.001, 0.01, 0.1, 1, 10]],
+                "cv": [3, 5, 10]
+            },
+            "LassoLarsCV": {
+                "cv": [3, 5, 10]
+            },
+            "LarsCV": {
+                "cv": [3, 5, 10]
+            },
+            "OrthogonalMatchingPursuit": {
+                "n_nonzero_coefs": [10, 50, 100, 200, None]
+            },
+            "OrthogonalMatchingPursuitCV": {
+                "cv": [3, 5, 10]
+            },
+            "PassiveAggressiveRegressor": {
+                "C": [0.01, 0.1, 1, 10],
+                "max_iter": [1000, 5000, 10000],
+                "early_stopping": [True, False]
+            },
+            "LinearSVR": {
+                "C": [0.01, 0.1, 1, 10],
+                "epsilon": [0.01, 0.1, 1],
+                "max_iter": [1000, 5000, 10000]
+            },
+            "NuSVR": {
+                "C": [0.01, 0.1, 1, 10],
+                "nu": [0.25, 0.5, 0.75],
+                "kernel": ["linear", "poly", "rbf", "sigmoid"]
+            },
+            "DecisionTreeRegressor": {
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_split": [2, 5, 10],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "ExtraTreeRegressor": {
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_split": [2, 5, 10],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "HistGradientBoostingRegressor": {
+                "learning_rate": [0.01, 0.1, 0.2],
+                "max_iter": [100, 500, 1000],
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "GaussianProcessRegressor": {
+                "alpha": [1e-10, 1e-5, 1e-2, 0.1],
+                "n_restarts_optimizer": [0, 1, 5, 10]
+            },
+            "KernelRidge": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "kernel": ["linear", "poly", "rbf", "sigmoid"],
+                "degree": [2, 3, 4]
+            },
+            "DummyRegressor": {
+                "strategy": ["mean", "median", "constant"],
+                "constant": [0]  # Only if strategy is 'constant'
+            },
+            "TransformedTargetRegressor": {
+                # Grid for the underlying regressor, example shown for LinearRegression
+                "regressor__fit_intercept": [True, False]
+            }
         }
     else:  # median level
         param_grids = {
@@ -3148,7 +3636,96 @@ def predict(
                 'alpha': [0.1, 1.0],
                 'max_iter': [100, 200],},
             "Poisson":{'alpha': [0.1, 1.0],
-                'max_iter': [100, 200],}
+                'max_iter': [100, 200],},
+            "Lars": {
+                "n_nonzero_coefs": [10, 50, 100, 200, None]
+            },
+            "LassoLars": {
+                "alpha": [0.001, 0.01, 0.1, 1, 10]
+            },
+            "BayesianRidge": {
+                "alpha_1": [1e-6, 1e-5, 1e-4],
+                "alpha_2": [1e-6, 1e-5, 1e-4],
+                "lambda_1": [1e-6, 1e-5, 1e-4],
+                "lambda_2": [1e-6, 1e-5, 1e-4]
+            },
+            "GammaRegressor": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "max_iter": [1000, 5000, 10000]
+            },
+            "TweedieRegressor": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "power": [0, 1, 1.5, 2, 3]
+            },
+            "LassoCV": {
+                "alphas": [[0.001, 0.01, 0.1, 1, 10]],
+                "cv": [3, 5, 10]
+            },
+            "ElasticNetCV": {
+                "l1_ratio": [0.1, 0.5, 0.7, 0.9, 1],
+                "alphas": [[0.001, 0.01, 0.1, 1, 10]],
+                "cv": [3, 5, 10]
+            },
+            "LassoLarsCV": {
+                "cv": [3, 5, 10]
+            },
+            "LarsCV": {
+                "cv": [3, 5, 10]
+            },
+            "OrthogonalMatchingPursuit": {
+                "n_nonzero_coefs": [10, 50, 100, 200, None]
+            },
+            "OrthogonalMatchingPursuitCV": {
+                "cv": [3, 5, 10]
+            },
+            "PassiveAggressiveRegressor": {
+                "C": [0.01, 0.1, 1, 10],
+                "max_iter": [1000, 5000, 10000],
+                "early_stopping": [True, False]
+            },
+            "LinearSVR": {
+                "C": [0.01, 0.1, 1, 10],
+                "epsilon": [0.01, 0.1, 1],
+                "max_iter": [1000, 5000, 10000]
+            },
+            "NuSVR": {
+                "C": [0.01, 0.1, 1, 10],
+                "nu": [0.25, 0.5, 0.75],
+                "kernel": ["linear", "poly", "rbf", "sigmoid"]
+            },
+            "DecisionTreeRegressor": {
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_split": [2, 5, 10],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "ExtraTreeRegressor": {
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_split": [2, 5, 10],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "HistGradientBoostingRegressor": {
+                "learning_rate": [0.01, 0.1, 0.2],
+                "max_iter": [100, 500, 1000],
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "GaussianProcessRegressor": {
+                "alpha": [1e-10, 1e-5, 1e-2, 0.1],
+                "n_restarts_optimizer": [0, 1, 5, 10]
+            },
+            "KernelRidge": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "kernel": ["linear", "poly", "rbf", "sigmoid"],
+                "degree": [2, 3, 4]
+            },
+            "DummyRegressor": {
+                "strategy": ["mean", "median", "constant"],
+                "constant": [0]  # Only if strategy is 'constant'
+            },
+            "TransformedTargetRegressor": {
+                # Grid for the underlying regressor, example shown for LinearRegression
+                "regressor__fit_intercept": [True, False]
+            }
         }
 
     results = {}
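The `"regressor__fit_intercept"` keys added to these grids rely on sklearn's double-underscore parameter routing for nested estimators. A self-contained sketch:

from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
import numpy as np

X, y = np.arange(20).reshape(-1, 1), np.arange(20.0)
ttr = TransformedTargetRegressor(regressor=LinearRegression())
# the "regressor__" prefix routes the parameter to the wrapped LinearRegression
gs = GridSearchCV(ttr, {"regressor__fit_intercept": [True, False]}, cv=3).fit(X, y)
print(gs.best_params_)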
@@ -3158,7 +3735,7 @@ def predict(
         if purpose == "classification"
         else KFold(n_splits=cv_folds, shuffle=True, random_state=random_state)
     )
-
+
     # Train and validate each model
     for name, clf in tqdm(
         models.items(),
@@ -3168,83 +3745,132 @@ def predict(
     ):
         if verbose:
             print(f"\nTraining and validating {name}:")
-
-        # Grid search with KFold or StratifiedKFold
-        if is_binary:
-            gs = GridSearchCV(
-                clf,
-                param_grid=param_grids.get(name, {}),
-                scoring=(
-                    "roc_auc"
-                    if purpose == "classification"
-                    else "neg_mean_squared_error"
-                ),
-                cv=cv,
-                n_jobs=n_jobs,
-                verbose=verbose,
-            )
-
-            gs.fit(x_train, y_train)
-            best_clf = gs.best_estimator_
-            # make sure x_train and x_test has the same name
-            x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
-            y_pred = best_clf.predict(x_true)
-            if hasattr(best_clf, "predict_proba"):
-                y_pred_proba = best_clf.predict_proba(x_true)
-                print("Shape of predicted probabilities:", y_pred_proba.shape)
-                if y_pred_proba.shape[1] == 1:
-                    y_pred_proba = np.hstack(
-                        [1 - y_pred_proba, y_pred_proba]
-                    )  # Add missing class probabilities
-                y_pred_proba = y_pred_proba[:, 1]
-            elif hasattr(best_clf, "decision_function"):
-                # If predict_proba is not available, use decision_function (e.g., for SVM)
-                y_pred_proba = best_clf.decision_function(x_true)
-                # Ensure y_pred_proba is within 0 and 1 bounds
-                y_pred_proba = (y_pred_proba - y_pred_proba.min()) / (
-                    y_pred_proba.max() - y_pred_proba.min()
+        try:
+            if is_binary:
+                gs = GridSearchCV(
+                    clf,
+                    param_grid=param_grids.get(name, {}),
+                    scoring=(
+                        "roc_auc"
+                        if purpose == "classification"
+                        else "neg_mean_squared_error"
+                    ),
+                    cv=cv,
+                    n_jobs=n_jobs,
+                    verbose=verbose,
                 )
-            else:
-                y_pred_proba = None  # No probability output for certain models
-        else:
-            gs = GridSearchCV(
-                clf,
-                param_grid=param_grids.get(name, {}),
-                scoring=(
-                    "roc_auc_ovr"
-                    if purpose == "classification"
-                    else "neg_mean_squared_error"
-                ),
-                cv=cv,
-                n_jobs=n_jobs,
-                verbose=verbose,
-            )
 
-            # Fit GridSearchCV
-            gs.fit(x_train, y_train)
-            best_clf = gs.best_estimator_
-
-            # Ensure x_true aligns with x_train columns
-            x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
-            y_pred = best_clf.predict(x_true)
-
-            # Handle prediction probabilities for multiclass
-            if hasattr(best_clf, "predict_proba"):
-                y_pred_proba = best_clf.predict_proba(x_true)
-            elif hasattr(best_clf, "decision_function"):
-                y_pred_proba = best_clf.decision_function(x_true)
-
-                # Normalize for multiclass if necessary
-                if y_pred_proba.ndim == 2:
-                    y_pred_proba = (
-                        y_pred_proba - y_pred_proba.min(axis=1, keepdims=True)
-                    ) / (
-                        y_pred_proba.max(axis=1, keepdims=True)
-                        - y_pred_proba.min(axis=1, keepdims=True)
+                gs.fit(x_train, y_train)
+                best_clf = gs.best_estimator_
+
+                # make sure x_train and x_test has the same name
+                x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
+                y_pred = best_clf.predict(x_true)
+                if hasattr(best_clf, "predict_proba"):
+                    y_pred_proba = best_clf.predict_proba(x_true)
+                    print("Shape of predicted probabilities:", y_pred_proba.shape)
+                    if y_pred_proba.shape[1] == 1:
+                        y_pred_proba = np.hstack(
+                            [1 - y_pred_proba, y_pred_proba]
+                        )  # Add missing class probabilities
+                    if y_pred_proba.shape[1] == 2:
+                        if isinstance(y_pred_proba, pd.DataFrame):
+                            y_pred_proba = y_pred_proba.iloc[:, 1]
+                        elif isinstance(y_pred_proba, pd.Series):
+                            y_pred_proba = y_pred_proba.values[:, 1]
+                        else:
+                            y_pred_proba = y_pred_proba[:, 1]
+
+                elif hasattr(best_clf, "decision_function"):
+                    # If predict_proba is not available, use decision_function (e.g., for SVM)
+                    y_pred_proba = best_clf.decision_function(x_true)
+                    # Ensure y_pred_proba is within 0 and 1 bounds
+                    y_pred_proba = (y_pred_proba - y_pred_proba.min()) / (
+                        y_pred_proba.max() - y_pred_proba.min()
                     )
-                else:
-                    y_pred_proba = None  # No probability output for certain models
+                else:
+                    y_pred_proba = None  # No probability output for certain models
+                # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+                if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                    if hasattr(best_clf, "alphas_"):
+                        alphas_ = best_clf.alphas_
+                    elif hasattr(best_clf, "alpha_"):
+                        alphas_ = best_clf.alpha_
+                    elif hasattr(best_clf, "Cs_"):
+                        alphas_ = best_clf.Cs_
+                else:
+                    alphas_ = None
+                coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
+            else:
+                gs = GridSearchCV(
+                    clf,
+                    param_grid=param_grids.get(name, {}),
+                    scoring=(
+                        "roc_auc_ovr"
+                        if purpose == "classification"
+                        else "neg_mean_squared_error"
+                    ),
+                    cv=cv,
+                    n_jobs=n_jobs,
+                    verbose=verbose,
+                )
 
+                # Fit GridSearchCV
+                gs.fit(x_train, y_train)
+                best_clf = gs.best_estimator_
+
+                # Ensure x_true aligns with x_train columns
+                x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
+
+                # do i need to fit the x_train, y_train again?
+                best_clf = best_clf.fit(x_train, y_train)
+                y_pred = best_clf.predict(x_true)
+
+                # Handle prediction probabilities for multiclass
+                if hasattr(best_clf, "predict_proba"):
+                    y_pred_proba = best_clf.predict_proba(x_true)
+                elif hasattr(best_clf, "decision_function"):
+                    y_pred_proba = best_clf.decision_function(x_true)
+
+                    # Normalize for multiclass if necessary
+                    if y_pred_proba.ndim == 2:
+                        y_pred_proba = (
+                            y_pred_proba - y_pred_proba.min(axis=1, keepdims=True)
+                        ) / (
+                            y_pred_proba.max(axis=1, keepdims=True)
+                            - y_pred_proba.min(axis=1, keepdims=True)
+                        )
+                else:
+                    y_pred_proba = None  # No probability output for certain models
+                # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+                if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                    if hasattr(best_clf, "alphas_"):
+                        alphas_ = best_clf.alphas_
+                    elif hasattr(best_clf, "alpha_"):
+                        alphas_ = best_clf.alpha_
+                    elif hasattr(best_clf, "Cs_"):
+                        alphas_ = best_clf.Cs_
+                else:
+                    alphas_ = None
+                coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
+        except Exception as e:
+            alphas_, coef_ = None, None
+            print(f"skipped {clf}: {e}")
+            continue
+
+        # try to make prediction format consistent
+        try:
+            y_pred = [i[0] for i in y_pred]
+        except:
+            pass
+        try:
+            y_true = [i[0] for i in y_true]
+        except:
+            pass
+        try:
+            y_train = [i[0] for i in y_train]
+        except:
+            pass
         validation_scores = {}
 
         if y_true is not None and y_pred_proba is not None:
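The `alphas_`/`alpha_`/`Cs_` lookup above appears once per branch; a compact equivalent using `getattr` (a sketch — note it also reaches `alpha_` directly, whereas the original only checks it when `alphas_` or `Cs_` exists):

alphas_ = next(
    (getattr(best_clf, a) for a in ("alphas_", "alpha_", "Cs_") if hasattr(best_clf, a)),
    None,
)
coef_ = getattr(best_clf, "coef_", None)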
@@ -3294,20 +3920,26 @@ def predict(
                     "roc_curve": roc_info,
                     "pr_curve": pr_info,
                     "confusion_matrix": confusion_matrix(y_true, y_pred),
-                    "predictions": y_pred.tolist(),
+                    "predictions": y_pred,  # .tolist(),
                     "predictions_proba": (
                         y_pred_proba.tolist() if y_pred_proba is not None else None
                     ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
                 }
             else:  # "regression"
                 results[name] = {
                     "best_clf": gs.best_estimator_,
                     "best_params": gs.best_params_,
                     "scores": validation_scores,  # e.g., neg_MSE, R², etc.
-                    "predictions": y_pred.tolist(),
+                    "predictions": y_pred,  # .tolist(),
                     "predictions_proba": (
                         y_pred_proba.tolist() if y_pred_proba is not None else None
                     ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
                 }
         else:  # multi-classes
             if y_pred_proba is not None:
@@ -3346,20 +3978,26 @@ def predict(
                     "roc_curve": roc_info,
                     "pr_curve": pr_info,
                     "confusion_matrix": confusion_matrix(y_true, y_pred),
-                    "predictions": y_pred.tolist(),
+                    "predictions": y_pred,  # .tolist(),
                     "predictions_proba": (
                         y_pred_proba.tolist() if y_pred_proba is not None else None
                     ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
                 }
             else:  # "regression"
                 results[name] = {
                     "best_clf": gs.best_estimator_,
                     "best_params": gs.best_params_,
                     "scores": validation_scores,  # e.g., neg_MSE, R², etc.
-                    "predictions": y_pred.tolist(),
+                    "predictions": y_pred,  # .tolist(),
                     "predictions_proba": (
                         y_pred_proba.tolist() if y_pred_proba is not None else None
                     ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
                 }
 
     else:
@@ -3378,17 +4016,21 @@ def predict(
             "best_clf": gs.best_estimator_,
             "best_params": gs.best_params_,
             "scores": validation_scores,
-            "predictions": y_pred.tolist(),
+            "predictions": y_pred,  # .tolist(),
             "predictions_proba": (
                 y_pred_proba.tolist() if y_pred_proba is not None else None
             ),
+            "features": share_col_names,
             "y_train": y_train if y_train is not None else [],
             "y_true": y_true if y_true is not None else [],
+            "coef": coef_,
+            "alphas": alphas_
         }
 
     # Convert results to DataFrame
     df_results = pd.DataFrame.from_dict(results, orient="index")
-    # sort
+    display(df_results)
+    # sort
     if y_true is not None:
         if purpose == "classification":
             df_scores = pd.DataFrame(
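The `display(df_results)` added in the hunk above assumes an IPython/Jupyter environment, where `display` is a builtin; a guarded fallback keeps the function usable from plain scripts (a sketch, not part of the diff):

try:
    display(df_results)   # rich table in notebooks
except NameError:
    print(df_results)     # plain-script fallback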
@@ -3446,7 +4088,7 @@ def predict(
     for i, j in top_models.to_dict().items():
         base_estimators.append((i, j))
     if stacking_cv:
-        print(f" ⤵ stacking_cv is processing...")
+        print(f"⤵ stacking_cv is processing...")
         #* define a few representative final_estimators
         # several candidates
         if purpose == "classification":
@@ -3520,7 +4162,7 @@ def predict(
         best_final_estimator = cv_results_df.iloc[0]['final_estimator']
         print(f"Best final estimator based on cross-validation: {best_final_estimator}")
     else:
-        print(f" ⤵ trying to find the best_final_estimator for stacking...")
+        print(f"⤵ trying to find the best_final_estimator for stacking...")
         if purpose=="classification":
             best_final_estimator = LogisticRegression(class_weight=class_weight,
                                                       random_state=random_state,
@@ -3530,26 +4172,25 @@ def predict(
  print(f"⤵ the best best_final_estimator: {best_final_estimator}")
  #! apply stacking
  if purpose == "classification":
- print(f" ⤵ StackingClassifier...")
+ print(f"⤵ StackingClassifier...")
  stacking_model = StackingClassifier(estimators=base_estimators,
  final_estimator=best_final_estimator,
  cv=cv)
  else:
- print(f" ⤵ StackingRegressor...")
+ print(f"⤵ StackingRegressor...")
  stacking_model = StackingRegressor(estimators=base_estimators,
  final_estimator=best_final_estimator,
  cv=cv)
 
  # Train the Stacking Classifier
- print(f" ⤵ fit & predict...")
+ print(f"⤵ fit & predict...")
  stacking_model.fit(x_train, y_train)
  y_pred_final = stacking_model.predict(x_true)
- print(f" ⤵ collecting results...")
+ print(f"⤵ collecting results...")
  # pred_proba
  if is_binary:
  if hasattr(stacking_model, "predict_proba"):
  y_pred_proba_final = stacking_model.predict_proba(x_true)
- print("Shape of predicted probabilities:", y_pred_proba_final.shape)
  if y_pred_proba_final.shape[1] == 1:
  y_pred_proba_final = np.hstack(
  [1 - y_pred_proba_final, y_pred_proba_final]
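For context, the stacking block above uses scikit-learn's standard stacking API: out-of-fold predictions from each base estimator (controlled by `cv`) become the training features of the `final_estimator`. A self-contained sketch with hypothetical base models:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

X, y = make_classification(n_samples=200, random_state=1)
base_estimators = [  # stand-ins for the top models collected above
    ("rf", RandomForestClassifier(n_estimators=50, random_state=1)),
    ("svc", SVC(probability=True, random_state=1)),
]
stacking_model = StackingClassifier(
    estimators=base_estimators,
    final_estimator=LogisticRegression(),  # meta-learner on out-of-fold preds
    cv=5,
)
stacking_model.fit(X, y)
print(stacking_model.predict_proba(X[:3]))  # (3, 2) class probabilities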
@@ -3564,6 +4205,17 @@ def predict(
  )
  else:
  y_pred_proba_final = None # No probability output for certain models
+ # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+ if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+ if hasattr(best_clf, "alphas_"):
+ alphas_ = best_clf.alphas_
+ elif hasattr(best_clf, "alpha_"):
+ alphas_ = best_clf.alpha_
+ elif hasattr(best_clf, "Cs_"):
+ alphas_ = best_clf.Cs_
+ else:
+ alphas_= None
+ coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
  if not is_binary:
  # Handle prediction probabilities for multiclass
  if hasattr(stacking_model, "predict_proba"):
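The attribute probing added above targets CV estimators: `alphas_` (ElasticNetCV, LassoCV) and `Cs_` (LogisticRegressionCV). As written, the `elif hasattr(best_clf, "alpha_")` branch is effectively dead: the outer guard only passes when `alphas_` or `Cs_` exists, and the `alphas_` branch then takes precedence, so an estimator exposing only `alpha_` (e.g., RidgeCV) is skipped entirely. A flatter probe, sketched here, would cover that case:

from sklearn.datasets import make_regression
from sklearn.linear_model import LassoCV, RidgeCV

def extract_regularization(est):
    # First matching attribute wins; None when nothing regularization-like exists.
    for attr in ("alphas_", "alpha_", "Cs_"):
        if hasattr(est, attr):
            return getattr(est, attr)
    return None

X, y = make_regression(n_samples=100, n_features=5, random_state=1)
lasso = LassoCV(cv=3).fit(X, y)
print(extract_regularization(lasso))                # full alpha grid (alphas_)
print(extract_regularization(RidgeCV().fit(X, y)))  # chosen alpha_ only
coef_ = getattr(lasso, "coef_", None)               # same coef_ fallback as above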
@@ -3581,6 +4233,17 @@ def predict(
  )
  else:
  y_pred_proba_final = None # No probability output for certain models
+ # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+ if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+ if hasattr(best_clf, "alphas_"):
+ alphas_ = best_clf.alphas_
+ elif hasattr(best_clf, "alpha_"):
+ alphas_ = best_clf.alpha_
+ elif hasattr(best_clf, "Cs_"):
+ alphas_ = best_clf.Cs_
+ else:
+ alphas_= None
+ coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
  #! dict_pred_stack
  dict_pred_stack={}
  validation_scores_final = {}
@@ -3631,6 +4294,9 @@ def predict(
  "predictions_proba": (
  y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
  ),
+ "features":share_col_names,
+ "coef":coef_,
+ "alphas":alphas_
  }
  else: # "regression"
  dict_pred_stack = {
@@ -3641,6 +4307,9 @@ def predict(
  "predictions_proba": (
  y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
  ),
+ "features":share_col_names,
+ "coef":coef_,
+ "alphas":alphas_
  }
  else: # multi-classes
  if y_pred_proba_final is not None:
@@ -3680,6 +4349,9 @@ def predict(
  "predictions_proba": (
  y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
  ),
+ "features":share_col_names,
+ "coef":coef_,
+ "alphas":alphas_
  }
  else: # "regression"
  dict_pred_stack = {
@@ -3690,6 +4362,9 @@ def predict(
  "predictions_proba": (
  y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
  ),
+ "features":share_col_names,
+ "coef":coef_,
+ "alphas":alphas_
  }
 
  else:
@@ -3712,8 +4387,11 @@ def predict(
  "predictions_proba": (
  y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
  ),
+ "features":share_col_names,
  "y_train": y_train if y_train is not None else [],
  "y_true": y_true if y_true is not None else [],
+ "coef":coef_,
+ "alphas":alphas_
  }
  # merge together
  df_pred = pd.DataFrame(
@@ -3728,16 +4406,16 @@ def predict(
  # if dir_save:
  # ips.figsave(dir_save + f"validate_features_stacking_{now_}.pdf")
  if vote:
- print(f" ⤵ voting...")
+ print(f"⤵ voting...")
  from sklearn.ensemble import VotingClassifier, VotingRegressor
- #! Votting
+ #! voting
  n_top_models = min(n_top_models, df_results.shape[0])
  base_estimators=[]
  for name, cls in zip(list(df_results.iloc[:n_top_models, :].index),df_results.iloc[:n_top_models, :]["best_clf"].tolist()):
  base_estimators.append((name,cls))
  # Apply Voting Classifier/Regressor
  if purpose == "classification":
- print(f" ⤵ VotingClassifier...via{votting}")
+ print(f"⤵ VotingClassifier...via{voting}")
  if voting=='hard':
  # Hard voting does not support `predict_proba`
  voting_model = VotingClassifier(estimators=base_estimators)
@@ -3745,7 +4423,7 @@ def predict(
  # Soft voting supports `predict_proba`
  voting_model = VotingClassifier(estimators=base_estimators, voting="soft")
  else:
- print(f" ⤵ VotingRegressor...")
+ print(f"⤵ VotingRegressor...")
  voting_model = VotingRegressor(estimators=base_estimators)
 
  # Train the Voting Classifier/Regressor
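The hard/soft split above matters because hard voting aggregates predicted labels and therefore exposes no `predict_proba`, while soft voting averages per-class probabilities (so every base estimator must support them). A minimal sketch of the distinction with hypothetical estimators:

from sklearn.datasets import make_classification
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=200, random_state=1)
base_estimators = [("lr", LogisticRegression()), ("dt", DecisionTreeClassifier())]

hard = VotingClassifier(estimators=base_estimators).fit(X, y)  # voting="hard" is the default
soft = VotingClassifier(estimators=base_estimators, voting="soft").fit(X, y)

print(soft.predict_proba(X[:2]))       # averaged class probabilities
print(hasattr(hard, "predict_proba"))  # False: hard voting has none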
@@ -3770,10 +4448,23 @@ def predict(
  y_pred_proba_vote = y_pred_proba_vote[:, 1]
  else:
  y_pred_proba_vote = None
+
+ # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+ if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+ if hasattr(best_clf, "alphas_"):
+ alphas_ = best_clf.alphas_
+ elif hasattr(best_clf, "alpha_"):
+ alphas_ = best_clf.alpha_
+ elif hasattr(best_clf, "Cs_"):
+ alphas_ = best_clf.Cs_
+ else:
+ alphas_= None
+ coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
  else: # Regression
  y_pred_proba_vote = None
+ coef_,alphas_=None,None
 
- print(f" ⤵ collecting voting results...")
+ print(f"⤵ collecting voting results...")
  #! dict_pred_vote
  dict_pred_vote = {}
  validation_scores_vote = {}
@@ -3822,6 +4513,9 @@ def predict(
  "predictions_proba": (
  y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
  ),
+ "features":share_col_names,
+ "coef":coef_,
+ "alphas":alphas_
  }
  else: # Multi-class
  if y_pred_proba_vote is not None:
@@ -3856,6 +4550,9 @@ def predict(
  "predictions_proba": (
  y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
  ),
+ "features":share_col_names,
+ "coef":coef_,
+ "alphas":alphas_
  }
  else:
  if y_true is None:
@@ -3877,6 +4574,7 @@ def predict(
  "predictions_proba": (
  y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
  ),
+ "features":share_col_names,
  "y_train": y_train if y_train is not None else [],
  "y_true": y_true if y_true is not None else [],
  }
@@ -3900,6 +4598,8 @@ def predict(
  df_res=pd.concat([df_vote, df_results],ignore_index=False,axis=0)
  elif stack:
  df_res=pd.concat([df_pred,df_results],ignore_index=False,axis=0)
+ else:
+ df_res=df_results
 
  if all([plot_, y_true is not None, purpose == "classification"]):
  from datetime import datetime
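The new `else` branch fixes a latent `NameError`: when neither `vote` nor `stack` is enabled, `df_res` was never bound before the plotting code below reads it. A minimal repro of the pattern:

vote, stack = False, False
if vote:
    df_res = "vote + base results"
elif stack:
    df_res = "stack + base results"
else:
    df_res = "base results only"  # previously missing; df_res stayed unbound
print(df_res)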
@@ -3907,9 +4607,15 @@ def predict(
  now_ = datetime.now().strftime("%y%m%d_%H%M%S")
  # try:
  if df_res.shape[0] > 3:
- plot_validate_features(df_res, is_binary=is_binary)
+ try:
+ plot_validate_features(df_res, is_binary=is_binary)
+ except Exception as e:
+ print(e)
  else:
- plot_validate_features_single(df_res, is_binary=is_binary)
+ try:
+ plot_validate_features_single(df_res, is_binary=is_binary)
+ except Exception as e:
+ print(e)
  if dir_save:
  ips.figsave(dir_save + f"validate_features{now_}.pdf")
  # except Exception as e:
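Wrapping each plotting call in try/except trades possible silent figure loss for robustness: a failing plot no longer discards an otherwise finished model run. The same defensive pattern as a standalone sketch:

def safe_call(fn, *args, **kwargs):
    # Best-effort side effect (plotting, saving): report errors, never raise.
    try:
        return fn(*args, **kwargs)
    except Exception as e:
        print(e)
        return None

safe_call(lambda: 1 / 0)  # prints "division by zero" instead of crashing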