py2ls 0.2.4.24__py3-none-any.whl → 0.2.4.26__py3-none-any.whl

py2ls/ml2ls.py CHANGED
@@ -31,6 +31,7 @@ from sklearn.metrics import (
     average_precision_score,
 )
 from typing import Dict, Any, Optional, List, Union
+import os, json
 import numpy as np
 import pandas as pd
 from . import ips
@@ -49,7 +50,13 @@ logger = logging.getLogger()
 warnings.filterwarnings("ignore", category=UserWarning)
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.neighbors import KNeighborsClassifier
-
+#* set random_state globally
+import torch
+import random
+random_state = 1
+random.seed(random_state)
+np.random.seed(random_state)
+torch.manual_seed(random_state)
 
 def features_knn(
     x_train: pd.DataFrame, y_train: pd.Series, knn_params: dict
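The new module-level block pins `random_state = 1` and seeds `random`, NumPy, and PyTorch the moment `ml2ls` is imported. A minimal sketch of the same idea as an explicit helper — the name `set_seed` is illustrative, not part of py2ls:

```python
# Sketch of the seeding pattern introduced above; `set_seed` is a
# hypothetical helper, not a py2ls function.
import random

import numpy as np
import torch


def set_seed(seed: int = 1) -> None:
    """Seed Python, NumPy, and PyTorch RNGs for reproducible runs."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)  # also seed all GPU generators


set_seed(1)
print(np.random.rand(3))  # identical output on every run
```

Seeding at import time, as the diff does, silently fixes the RNG state for every downstream consumer of the module; wrapping it in an explicit helper keeps that side effect visible at call sites.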
@@ -594,7 +601,7 @@ def get_features(
     """
     from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import StandardScaler, OneHotEncoder
-
+    from sklearn.model_selection import train_test_split
    # Ensure X and y are DataFrames/Series for consistency
    if isinstance(X, np.ndarray):
        X = pd.DataFrame(X)
@@ -922,10 +929,26 @@ def get_features(
         "feature_importances": feature_importances,
     }
     if all([plot_, dir_save]):
+
         from datetime import datetime
-
         now_ = datetime.now().strftime("%y%m%d_%H%M%S")
         ips.figsave(dir_save + f"features{now_}.pdf")
+
+        lists = []
+        for tp in ips.flatten(features_df["type"]):
+            lists.append(
+                features_df
+                .loc[features_df["type"] == tp, "feature"]
+                .tolist()
+            )
+        labels = ips.flatten(features_df["type"])
+        # current_fig = plt.gcf()
+        # # ax = current_fig.add_subplot(3, 2, 6)
+        # gs = current_fig.add_gridspec(3, 2)
+        # ax = current_fig.add_subplot(gs[:, :])
+        plt.figure(figsize=[6, 6])
+        plot.venn(lists, labels, cmap="coolwarm")
+        ips.figsave(dir_save + f"features{now_}shared_features.pdf")
     else:
         results = {
             "selected_features": pd.DataFrame(),
@@ -1247,22 +1270,25 @@ def plot_validate_features(res_val, is_binary=True, figsize=None):
     nexttile = plot.subplot(figsize=figsize)
     ax = nexttile(subplot_layout[0], subplot_layout[1])
     for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-        fpr = res_val["roc_curve"][model_name]["fpr"]
-        tpr = res_val["roc_curve"][model_name]["tpr"]
-        (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
-        mean_auc = res_val["roc_curve"][model_name]["auc"]
-        plot_roc_curve(
-            fpr,
-            tpr,
-            mean_auc,
-            lower_ci,
-            upper_ci,
-            model_name=model_name,
-            lw=1.5,
-            color=colors[i],
-            alpha=alpha,
-            ax=ax,
-        )
+        try:
+            fpr = res_val["roc_curve"][model_name]["fpr"]
+            tpr = res_val["roc_curve"][model_name]["tpr"]
+            (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
+            mean_auc = res_val["roc_curve"][model_name]["auc"]
+            plot_roc_curve(
+                fpr,
+                tpr,
+                mean_auc,
+                lower_ci,
+                upper_ci,
+                model_name=model_name,
+                lw=1.5,
+                color=colors[i],
+                alpha=alpha,
+                ax=ax,
+            )
+        except Exception as e:
+            print(e)
     plot.figsets(
         sp=2,
         legend=dict(
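This and the following hunks all apply the same change: each per-model curve is wrapped in `try`/`except` so that one model with missing or malformed curve data skips only its own trace instead of aborting the whole figure. The pattern in isolation, with stand-in data:

```python
# The defensive-plotting pattern used throughout these hunks: one
# malformed entry skips only that model, not the entire figure.
import matplotlib.pyplot as plt

res_val = {
    "roc_curve": {
        "Random Forest": {"fpr": [0, 0.2, 1], "tpr": [0, 0.8, 1], "auc": 0.9},
        "SVM": {},  # e.g. model skipped during training -> keys missing
    }
}

fig, ax = plt.subplots()
for model_name, curves in res_val["roc_curve"].items():
    try:
        ax.plot(curves["fpr"], curves["tpr"],
                label=f"{model_name} (AUC={curves['auc']:.2f})")
    except Exception as e:
        print(f"skipped {model_name}: {e}")  # KeyError for SVM here
ax.legend()
```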
@@ -1277,16 +1303,19 @@ def plot_validate_features(res_val, is_binary=True, figsize=None):
 
     ax = nexttile(subplot_layout[0], subplot_layout[1])
     for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-        plot_pr_curve(
-            recall=res_val["pr_curve"][model_name]["recall"],
-            precision=res_val["pr_curve"][model_name]["precision"],
-            avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
-            model_name=model_name,
-            color=colors[i],
-            lw=1.5,
-            alpha=alpha,
-            ax=ax,
-        )
+        try:
+            plot_pr_curve(
+                recall=res_val["pr_curve"][model_name]["recall"],
+                precision=res_val["pr_curve"][model_name]["precision"],
+                avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
+                model_name=model_name,
+                color=colors[i],
+                lw=1.5,
+                alpha=alpha,
+                ax=ax,
+            )
+        except Exception as e:
+            print(e)
     plot.figsets(
         sp=2,
         legend=dict(
@@ -1314,22 +1343,25 @@ def plot_validate_features(res_val, is_binary=True, figsize=None):
     for iclass, class_ in enumerate(classes):
         ax = nexttile(subplot_layout[0], subplot_layout[1])
         for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-            fpr = res_val["roc_curve"][model_name]["fpr"][class_]
-            tpr = res_val["roc_curve"][model_name]["tpr"][class_]
-            (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"][iclass]
-            mean_auc = res_val["roc_curve"][model_name]["auc"][class_]
-            plot_roc_curve(
-                fpr,
-                tpr,
-                mean_auc,
-                lower_ci,
-                upper_ci,
-                model_name=model_name,
-                lw=1.5,
-                color=colors[i],
-                alpha=alpha,
-                ax=ax,
-            )
+            try:
+                fpr = res_val["roc_curve"][model_name]["fpr"][class_]
+                tpr = res_val["roc_curve"][model_name]["tpr"][class_]
+                (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"][iclass]
+                mean_auc = res_val["roc_curve"][model_name]["auc"][class_]
+                plot_roc_curve(
+                    fpr,
+                    tpr,
+                    mean_auc,
+                    lower_ci,
+                    upper_ci,
+                    model_name=model_name,
+                    lw=1.5,
+                    color=colors[i],
+                    alpha=alpha,
+                    ax=ax,
+                )
+            except Exception as e:
+                print(e)
     plot.figsets(
         sp=2,
         title=class_,
@@ -1345,18 +1377,21 @@ def plot_validate_features(res_val, is_binary=True, figsize=None):
 
         ax = nexttile(subplot_layout[0], subplot_layout[1])
         for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-            plot_pr_curve(
-                recall=res_val["pr_curve"][model_name]["recall"][iclass],
-                precision=res_val["pr_curve"][model_name]["precision"][iclass],
-                avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
-                    iclass
-                ],
-                model_name=model_name,
-                color=colors[i],
-                lw=1.5,
-                alpha=alpha,
-                ax=ax,
-            )
+            try:
+                plot_pr_curve(
+                    recall=res_val["pr_curve"][model_name]["recall"][iclass],
+                    precision=res_val["pr_curve"][model_name]["precision"][iclass],
+                    avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
+                        iclass
+                    ],
+                    model_name=model_name,
+                    color=colors[i],
+                    lw=1.5,
+                    alpha=alpha,
+                    ax=ax,
+                )
+            except Exception as e:
+                print(e)
     plot.figsets(
         sp=2,
         title=class_,
@@ -1379,37 +1414,41 @@ def plot_validate_features_single(res_val, figsize=None, is_binary=True):
         len(ips.flatten(res_val["pr_curve"].index)), 3, figsize=figsize
     )
     for model_name in ips.flatten(res_val["pr_curve"].index):
-        fpr = res_val["roc_curve"][model_name]["fpr"]
-        tpr = res_val["roc_curve"][model_name]["tpr"]
-        (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
-        mean_auc = res_val["roc_curve"][model_name]["auc"]
-
-        # Plotting
-        plot_roc_curve(
-            fpr,
-            tpr,
-            mean_auc,
-            lower_ci,
-            upper_ci,
-            model_name=model_name,
-            ax=nexttile(),
-        )
-        plot.figsets(title=model_name, sp=2)
+        try:
+            fpr = res_val["roc_curve"][model_name]["fpr"]
+            tpr = res_val["roc_curve"][model_name]["tpr"]
+            (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
+            mean_auc = res_val["roc_curve"][model_name]["auc"]
 
-        plot_pr_binary(
-            recall=res_val["pr_curve"][model_name]["recall"],
-            precision=res_val["pr_curve"][model_name]["precision"],
-            avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
-            model_name=model_name,
-            ax=nexttile(),
-        )
-        plot.figsets(title=model_name, sp=2)
+            # Plotting
+            plot_roc_curve(
+                fpr,
+                tpr,
+                mean_auc,
+                lower_ci,
+                upper_ci,
+                model_name=model_name,
+                ax=nexttile(),
+            )
+            plot.figsets(title=model_name, sp=2)
 
-        # plot cm
-        plot_cm(
-            res_val["confusion_matrix"][model_name], ax=nexttile(), normalize=False
-        )
-        plot.figsets(title=model_name, sp=2)
+            plot_pr_binary(
+                recall=res_val["pr_curve"][model_name]["recall"],
+                precision=res_val["pr_curve"][model_name]["precision"],
+                avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
+                model_name=model_name,
+                ax=nexttile(),
+            )
+            plot.figsets(title=model_name, sp=2)
+
+            # plot cm
+            plot_cm(
+                res_val["confusion_matrix"][model_name], ax=nexttile(), normalize=False
+            )
+            plot.figsets(title=model_name, sp=2)
+
+        except Exception as e:
+            print(e)
     else:
 
         modname_tmp = ips.flatten(res_val["roc_curve"].index)[0]
@@ -1424,22 +1463,25 @@ def plot_validate_features_single(res_val, figsize=None, is_binary=True):
     for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
         ax = nexttile()
         for iclass, class_ in enumerate(classes):
-            fpr = res_val["roc_curve"][model_name]["fpr"][class_]
-            tpr = res_val["roc_curve"][model_name]["tpr"][class_]
-            (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"][iclass]
-            mean_auc = res_val["roc_curve"][model_name]["auc"][class_]
-            plot_roc_curve(
-                fpr,
-                tpr,
-                mean_auc,
-                lower_ci,
-                upper_ci,
-                model_name=class_,
-                lw=1.5,
-                color=colors[iclass],
-                alpha=0.03,
-                ax=ax,
-            )
+            try:
+                fpr = res_val["roc_curve"][model_name]["fpr"][class_]
+                tpr = res_val["roc_curve"][model_name]["tpr"][class_]
+                (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"][iclass]
+                mean_auc = res_val["roc_curve"][model_name]["auc"][class_]
+                plot_roc_curve(
+                    fpr,
+                    tpr,
+                    mean_auc,
+                    lower_ci,
+                    upper_ci,
+                    model_name=class_,
+                    lw=1.5,
+                    color=colors[iclass],
+                    alpha=0.03,
+                    ax=ax,
+                )
+            except Exception as e:
+                print(e)
         plot.figsets(
             sp=2,
             title=model_name,
@@ -1451,18 +1493,21 @@ def plot_validate_features_single(res_val, figsize=None, is_binary=True):
 
         ax = nexttile()
         for iclass, class_ in enumerate(classes):
-            plot_pr_curve(
-                recall=res_val["pr_curve"][model_name]["recall"][iclass],
-                precision=res_val["pr_curve"][model_name]["precision"][iclass],
-                avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
-                    iclass
-                ],
-                model_name=class_,
-                color=colors[iclass],
-                lw=1.5,
-                alpha=0.03,
-                ax=ax,
-            )
+            try:
+                plot_pr_curve(
+                    recall=res_val["pr_curve"][model_name]["recall"][iclass],
+                    precision=res_val["pr_curve"][model_name]["precision"][iclass],
+                    avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
+                        iclass
+                    ],
+                    model_name=class_,
+                    color=colors[iclass],
+                    lw=1.5,
+                    alpha=0.03,
+                    ax=ax,
+                )
+            except Exception as e:
+                print(e)
         plot.figsets(
             sp=2,
             title=class_,
@@ -1543,15 +1588,12 @@ def cal_auc_ci(
         # print("Bootstrap #{} ROC area: {:0.3f}".format(i + 1, score))
     sorted_scores = np.array(bootstrapped_scores)
     sorted_scores.sort()
-
-    # Computing the lower and upper bound of the 90% confidence interval
-    # You can change the bounds percentiles to 0.025 and 0.975 to get
-    # a 95% confidence interval instead.
+
     confidence_lower = sorted_scores[int((1 - ci) * len(sorted_scores))]
     confidence_upper = sorted_scores[int(ci * len(sorted_scores))]
     if verbose:
         print(
-            "Confidence interval for the score: [{:0.3f} - {:0.3}]".format(
+            "Confidence interval for the score: [{:0.3f} - {:0.3f}]".format(
                 confidence_lower, confidence_upper
             )
         )
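For context, `cal_auc_ci` bootstraps the AUC and reads the interval bounds off the sorted scores at the `(1 - ci)` and `ci` quantile indices; the hunk above also fixes the `{:0.3}` format spec to `{:0.3f}`. A self-contained sketch of that bootstrap on synthetic data:

```python
# Self-contained sketch of the bootstrap AUC CI computed by cal_auc_ci;
# data and the 1000-resample count are illustrative.
import numpy as np
from sklearn.metrics import roc_auc_score

rng = np.random.RandomState(1)
y_true = rng.randint(0, 2, 200)
y_score = y_true * 0.6 + rng.rand(200) * 0.7  # noisy but informative scores

scores = []
for _ in range(1000):
    idx = rng.randint(0, len(y_true), len(y_true))  # resample with replacement
    if len(np.unique(y_true[idx])) < 2:
        continue  # AUC is undefined if the resample has a single class
    scores.append(roc_auc_score(y_true[idx], y_score[idx]))

sorted_scores = np.sort(scores)
ci = 0.95
lower = sorted_scores[int((1 - ci) * len(sorted_scores))]
upper = sorted_scores[int(ci * len(sorted_scores))]
print("Confidence interval for the score: [{:0.3f} - {:0.3f}]".format(lower, upper))
```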
@@ -1568,11 +1610,8 @@ def cal_auc_ci(
         y_true, classes=np.unique(y_true)
     )  # One-vs-Rest transformation
     n_classes = y_true_bin.shape[1]  # Number of classes
-
-    bootstrapped_scores = np.zeros(
-        (n_classes, n_bootstraps)
-    )  # Store scores for each class
-
+
+    bootstrapped_scores = np.full((n_classes, n_bootstraps), np.nan)
     if verbose:
         print("AUROC scores for each class:")
     for i in range(n_classes):
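Pre-filling the bootstrap matrix with `np.nan` instead of zeros lets failed bootstrap iterations be told apart from real scores; the next hunk then filters those NaNs per class before taking the percentile bounds. Both halves of the change, in isolation with fabricated scores:

```python
# Sketch of the NaN-aware per-class CI: scores start as NaN, failed
# iterations stay NaN, and each class is filtered before sorting.
import numpy as np

n_classes, n_bootstraps, ci = 3, 100, 0.95
rng = np.random.RandomState(1)
bootstrapped_scores = np.full((n_classes, n_bootstraps), np.nan)
bootstrapped_scores[0] = rng.uniform(0.7, 0.9, n_bootstraps)  # class 0 fine
bootstrapped_scores[1, :50] = rng.uniform(0.6, 0.8, 50)       # class 1 partial
# class 2: every iteration "failed", stays all-NaN

confidence_intervals = []
for class_idx in range(n_classes):
    valid = bootstrapped_scores[class_idx][~np.isnan(bootstrapped_scores[class_idx])]
    if len(valid) > 0:
        s = np.sort(valid)
        confidence_intervals.append((s[int((1 - ci) * len(s))], s[int(ci * len(s))]))
    else:
        confidence_intervals.append((np.nan, np.nan))

print(confidence_intervals)  # class 2 -> (nan, nan) instead of an exception
```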
@@ -1592,15 +1631,24 @@ def cal_auc_ci(
     # Calculating the confidence intervals for each class
     confidence_intervals = []
     for class_idx in range(n_classes):
-        sorted_scores = np.sort(bootstrapped_scores[class_idx])
-        confidence_lower = sorted_scores[int((1 - ci) * len(sorted_scores))]
-        confidence_upper = sorted_scores[int(ci * len(sorted_scores))]
-        confidence_intervals.append((confidence_lower, confidence_upper))
-
-        if verbose:
-            print(
-                f"Class {class_idx} - Confidence interval: [{confidence_lower:.3f} - {confidence_upper:.3f}]"
-            )
+        # drop NaNs left by failed bootstrap iterations
+        valid_scores = bootstrapped_scores[class_idx][
+            ~np.isnan(bootstrapped_scores[class_idx])
+        ]
+        if len(valid_scores) > 0:
+            sorted_scores = np.sort(valid_scores)
+            confidence_lower = sorted_scores[int((1 - ci) * len(sorted_scores))]
+            confidence_upper = sorted_scores[int(ci * len(sorted_scores))]
+            confidence_intervals.append((confidence_lower, confidence_upper))
+
+            if verbose:
+                print(
+                    f"Class {class_idx} - Confidence interval: [{confidence_lower:.3f} - {confidence_upper:.3f}]"
+                )
+        else:
+            confidence_intervals.append((np.nan, np.nan))
+            if verbose:
+                print(f"Class {class_idx} - Confidence interval: [NaN - NaN]")
 
     return confidence_intervals
 
@@ -2057,20 +2105,20 @@ def rank_models(
 
     def generate_bar_plot(ax, cv_test_scores):
         ax = plot.plotxy(
-            y="Classifier", x="combined_score", data=cv_test_scores, kind="bar"
+            y="Classifier", x="combined_score", data=cv_test_scores, kind_="bar"
         )
         plt.title("Classifier Performance")
         plt.tight_layout()
         return plt
 
-    nexttile = plot.subplot(2, 2, figsize=[10, 7])
+    nexttile = plot.subplot(2, 2, figsize=[10, 10])
     generate_bar_plot(nexttile(), top_models.dropna())
     plot.radar(
         ax=nexttile(projection="polar"),
         data=cv_test_scores.set_index("Classifier"),
-        ylim=[0.5, 1],
-        color=plot.get_color(10),
-        alpha=0.05,
+        ylim=[0, 1],
+        color=plot.get_color(cv_test_scores.set_index("Classifier").shape[1]),
+        alpha=0.02,
         circular=1,
     )
     return cv_test_scores
@@ -2206,6 +2254,8 @@ def predict(
     y_train: pd.Series,
     x_true: pd.DataFrame = None,
     y_true: Optional[pd.Series] = None,
+    fill_missing: bool = True,
+    scaler: str = 'standard',  # ["standard", "minmax", "robust", "maxabs"]
     backward: bool = False,  # backward_regression
     backward_thr: float = 0.05,  # pval thr; only works when backward is True
     common_features: set = None,
@@ -2214,7 +2264,7 @@ def predict(
     metrics: Optional[List[str]] = None,
     stack: bool = True,  # run stacking
     stacking_cv: bool = False,  # stacking cross_validate, default (False): keep it simple
-    vote: bool = True,  # run voting
+    vote: bool = False,  # run voting
     voting: str = "hard",  # only for the classification purpose of voting
     n_top_models: int = 5,  # for stacking models
     n_models_per_category: int = 1,  # for stacking models; allows up to 2 models from the same category
@@ -2227,7 +2277,12 @@ def predict(
     cv_level: str = "l",  # "s": 'low', "m": 'medium', "l": "high"
     class_weight: str = "balanced",
     random_state: int = 1,
+    presets="best_quality",  # specific to autogluon
+    time_limit=600,  # specific to autogluon
+    num_bag_folds=5,  # specific to autogluon
+    num_stack_levels=2,  # specific to autogluon
     verbose: bool = False,
+    **kwargs
 ) -> pd.DataFrame:
     """
     Case 1: internal train/test split; case 2: direct prediction; case 3: external validation.
@@ -2278,28 +2333,20 @@ def predict(
         RandomForestRegressor,
         ExtraTreesClassifier,
         ExtraTreesRegressor,
+        HistGradientBoostingRegressor,
         BaggingClassifier,
         BaggingRegressor,
         AdaBoostClassifier,
         AdaBoostRegressor,
     )
-    from sklearn.svm import SVC, SVR
-    from sklearn.tree import DecisionTreeRegressor
+    from sklearn.svm import SVC, SVR, LinearSVR, NuSVR
+    from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
     from sklearn.linear_model import (
-        LogisticRegression,
-        ElasticNet,
-        ElasticNetCV,
-        LinearRegression,
-        Lasso,
-        RidgeClassifierCV,
-        Perceptron,
-        SGDClassifier,
-        RidgeCV,
-        Ridge,
-        TheilSenRegressor,
-        HuberRegressor,
-        PoissonRegressor,
-
+        LogisticRegression, ElasticNet, ElasticNetCV,
+        LinearRegression, Lasso, RidgeClassifierCV, Perceptron, SGDClassifier,
+        RidgeCV, Ridge, TheilSenRegressor, HuberRegressor, PoissonRegressor, Lars, LassoLars, BayesianRidge,
+        GammaRegressor, TweedieRegressor, LassoCV, LassoLarsCV, LarsCV,
+        OrthogonalMatchingPursuit, OrthogonalMatchingPursuitCV, PassiveAggressiveRegressor
     )
     from sklearn.compose import TransformedTargetRegressor
     from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
@@ -2316,15 +2363,21 @@ def predict(
     )
     from sklearn.preprocessing import PolynomialFeatures
     from sklearn.model_selection import train_test_split
-
+
+    from sklearn.gaussian_process import GaussianProcessRegressor
+    from sklearn.kernel_ridge import KernelRidge
+    from sklearn.dummy import DummyRegressor
+    from autogluon.tabular import TabularPredictor
     # spell check
     purpose = ips.strcmp(purpose, ["classification", "regression"])[0]
     print(f"{purpose} processing...")
+
+
     # Default models or regressors if not provided
     if purpose == "classification":
         model_ = {
             "Random Forest": RandomForestClassifier(
-                random_state=random_state, class_weight=class_weight
+                random_state=random_state, class_weight=class_weight, n_jobs=n_jobs
             ),
             # SVC (Support Vector Classification)
             "SVM": SVC(
@@ -2335,7 +2388,7 @@ def predict(
             ),
             # fit the best model without enforcing sparsity, which means it does not directly perform feature selection.
             "Logistic Regression": LogisticRegression(
-                class_weight=class_weight, random_state=random_state
+                class_weight=class_weight, random_state=random_state, n_jobs=n_jobs
             ),
             # Logistic Regression with L1 Regularization (Lasso)
             "Lasso Logistic Regression": LogisticRegression(
@@ -2346,51 +2399,70 @@ def predict(
                 eval_metric="logloss",
                 random_state=random_state,
             ),
-            "KNN": KNeighborsClassifier(n_neighbors=5),
+            "KNN": KNeighborsClassifier(n_neighbors=5, n_jobs=n_jobs),
             "Naive Bayes": GaussianNB(),
             "Linear Discriminant Analysis": LinearDiscriminantAnalysis(),
             "AdaBoost": AdaBoostClassifier(
                 algorithm="SAMME", random_state=random_state
             ),
-            # "LightGBM": lgb.LGBMClassifier(random_state=random_state, class_weight=class_weight),
+            "LightGBM": lgb.LGBMClassifier(random_state=random_state, class_weight=class_weight, n_jobs=n_jobs),
             "CatBoost": cb.CatBoostClassifier(verbose=0, random_state=random_state),
             "Extra Trees": ExtraTreesClassifier(
-                random_state=random_state, class_weight=class_weight
+                random_state=random_state, class_weight=class_weight, n_jobs=n_jobs
             ),
-            "Bagging": BaggingClassifier(random_state=random_state),
+            "Bagging": BaggingClassifier(random_state=random_state, n_jobs=n_jobs),
             "Neural Network": MLPClassifier(max_iter=500, random_state=random_state),
             "DecisionTree": DecisionTreeClassifier(),
             "Quadratic Discriminant Analysis": QuadraticDiscriminantAnalysis(),
             "Ridge": RidgeClassifierCV(
                 class_weight=class_weight, store_cv_results=True
             ),
-            "Perceptron": Perceptron(random_state=random_state),
+            "Perceptron": Perceptron(random_state=random_state, n_jobs=n_jobs),
             "Bernoulli Naive Bayes": BernoulliNB(),
-            "SGDClassifier": SGDClassifier(random_state=random_state),
+            "SGDClassifier": SGDClassifier(random_state=random_state, n_jobs=n_jobs),
         }
     elif purpose == "regression":
         model_ = {
-            "Random Forest": RandomForestRegressor(random_state=random_state),
+            "Random Forest": RandomForestRegressor(random_state=random_state, n_jobs=n_jobs),
             "SVM": SVR(),  # SVR (Support Vector Regression)
-            # "Lasso": Lasso(random_state=random_state),  # same as LassoCV (but the alpha parameter must be provided)
-            "LassoCV": LassoCV(
-                cv=cv_folds, random_state=random_state
-            ),  # LassoCV finds the best alpha automatically; preferable to Lasso
+            "LassoCV": LassoCV(cv=cv_folds, random_state=random_state, n_jobs=n_jobs),  # LassoCV finds the best alpha automatically; preferable to Lasso
             "Gradient Boosting": GradientBoostingRegressor(random_state=random_state),
-            "XGBoost": xgb.XGBRegressor(eval_metric="rmse", random_state=random_state),
-            "Linear Regression": LinearRegression(),
+            "XGBoost": xgb.XGBRegressor(eval_metric="rmse", random_state=random_state, n_jobs=n_jobs),
+            "Linear Regression": LinearRegression(n_jobs=n_jobs),
             "AdaBoost": AdaBoostRegressor(random_state=random_state),
-            # "LightGBM": lgb.LGBMRegressor(random_state=random_state),
+            "LightGBM": lgb.LGBMRegressor(random_state=random_state, n_jobs=n_jobs, force_row_wise=True),  # or use force_col_wise=True if memory is a concern
             "CatBoost": cb.CatBoostRegressor(verbose=0, random_state=random_state),
-            "Extra Trees": ExtraTreesRegressor(random_state=random_state),
-            "Bagging": BaggingRegressor(random_state=random_state),
+            "Extra Trees": ExtraTreesRegressor(random_state=random_state, n_jobs=n_jobs),
+            "Bagging": BaggingRegressor(random_state=random_state, n_jobs=n_jobs),
             "Neural Network": MLPRegressor(max_iter=500, random_state=random_state),
             "ElasticNet": ElasticNet(random_state=random_state),
-            "Ridge": Ridge(),
-            "KNN": KNeighborsRegressor(),
-            "TheilSen": TheilSenRegressor(),
+            "Ridge": Ridge(random_state=random_state),
+            "KNN": KNeighborsRegressor(n_jobs=n_jobs),
+            "TheilSen": TheilSenRegressor(n_jobs=n_jobs),
             "Huber": HuberRegressor(),
-            "Poisson": PoissonRegressor()
+            "Poisson": PoissonRegressor(),
+            "LinearRegression": LinearRegression(),
+            "Lasso": Lasso(random_state=random_state),
+            "Lars": Lars(),
+            "LassoLars": LassoLars(),
+            "BayesianRidge": BayesianRidge(),
+            "GammaRegressor": GammaRegressor(),
+            "TweedieRegressor": TweedieRegressor(),
+            "LassoCV": LassoCV(random_state=random_state, n_jobs=n_jobs),
+            "ElasticNetCV": ElasticNetCV(random_state=random_state, n_jobs=n_jobs),
+            "LassoLarsCV": LassoLarsCV(n_jobs=n_jobs),
+            "LarsCV": LarsCV(),
+            "OrthogonalMatchingPursuit": OrthogonalMatchingPursuit(),
+            "OrthogonalMatchingPursuitCV": OrthogonalMatchingPursuitCV(n_jobs=n_jobs),
+            "PassiveAggressiveRegressor": PassiveAggressiveRegressor(random_state=random_state),
+            "LinearSVR": LinearSVR(random_state=random_state),
+            "NuSVR": NuSVR(),
+            "DecisionTreeRegressor": DecisionTreeRegressor(random_state=random_state),
+            "ExtraTreeRegressor": ExtraTreeRegressor(random_state=random_state),
+            "HistGradientBoostingRegressor": HistGradientBoostingRegressor(random_state=random_state),
+            "GaussianProcessRegressor": GaussianProcessRegressor(),
+            "KernelRidge": KernelRidge(),
+            "DummyRegressor": DummyRegressor(),
+            "TransformedTargetRegressor": TransformedTargetRegressor(regressor=LinearRegression())
         }
     if cls is None:
         models = model_
@@ -2407,10 +2479,17 @@ def predict(
         ips.df_special_characters_cleaner(x_true) if x_true is not None else None
     )
 
+    # only keep "autogluon_tab" in models
+    cls = [cls] if isinstance(cls, str) else cls
+
+    if cls is not None:
+        models = {"autogluon_tab": None} if "auto" in cls else models
+
     # indicate cls:
     if ips.run_once_within(30):  # 10 min
         print(f"processing: {list(models.keys())}")
-
+    y_train_col_name = None
+    # print(isinstance(y_train, str) and y_train in x_train.columns)
     if isinstance(y_train, str) and y_train in x_train.columns:
         y_train_col_name = y_train
         y_train = x_train[y_train]
@@ -2418,6 +2497,7 @@ def predict(
         x_train = x_train.drop(y_train_col_name, axis=1)
     # else:
     #     y_train = ips.df_encoder(pd.DataFrame(y_train), method="dummy").values.ravel()
+
     y_train = pd.DataFrame(y_train)
     if y_train.select_dtypes(include=np.number).empty:
         y_train_ = ips.df_encoder(y_train, method="dummy", drop=None)
@@ -2430,6 +2510,9 @@ def predict(
         y_train = ips.df_encoder(pd.DataFrame(y_train), method="label")
     print("is_binary:", is_binary)
 
+    if fill_missing:
+        ips.df_fillna(data=x_train, method="knn", inplace=True, axis=0)
+        ips.df_fillna(data=y_train, method="knn", inplace=True, axis=0)
     # Perform backward feature selection
     if backward:
         selected_features = backward_regression(x_train, y_train, thr=backward_thr)
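The new `fill_missing` flag KNN-imputes the training data through `ips.df_fillna(method="knn")`. A hedged equivalent with scikit-learn's `KNNImputer`; whether `df_fillna` matches these defaults (e.g. `n_neighbors`) is an assumption:

```python
# Hedged equivalent of fill_missing via sklearn's KNNImputer; the exact
# defaults of ips.df_fillna(method="knn") are an assumption here.
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer

x_train = pd.DataFrame(
    {"a": [1.0, 2.0, np.nan, 4.0], "b": [10.0, np.nan, 30.0, 40.0]}
)

imputer = KNNImputer(n_neighbors=2)  # fill each NaN from the 2 nearest rows
x_filled = pd.DataFrame(
    imputer.fit_transform(x_train), columns=x_train.columns, index=x_train.index
)
print(x_filled)
```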
@@ -2458,6 +2541,8 @@ def predict(
             pd.DataFrame(y_train), method="label"
         ).values.ravel()
 
+    if fill_missing:
+        ips.df_fillna(data=x_true, method="knn", inplace=True, axis=0)
     if y_true is not None:
         if isinstance(y_true, str) and y_true in x_true.columns:
             y_true_col_name = y_true
@@ -2490,11 +2575,16 @@ def predict(
     # Ensure common features are selected
     if common_features is not None:
         x_train, x_true = x_train[common_features], x_true[common_features]
+        share_col_names = common_features
     else:
         share_col_names = ips.shared(x_train.columns, x_true.columns, verbose=verbose)
         x_train, x_true = x_train[share_col_names], x_true[share_col_names]
 
-    x_train, x_true = ips.df_scaler(x_train), ips.df_scaler(x_true)
+    #! scaler
+    # fit the scaler on x_train and export it to transform x_true
+    x_train, scaler_ = ips.df_scaler(x_train, method=scaler, return_scaler=True)
+    #
+    x_true = ips.df_scaler(x_true, scaler=scaler_)  # make sure the same scaler is used
     x_train, x_true = ips.df_encoder(x_train, method="dummy"), ips.df_encoder(
         x_true, method="dummy"
     )
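This hunk fixes a leakage issue: previously `x_train` and `x_true` were scaled independently, so test-set statistics influenced the test transform. The same fit-on-train/transform-on-test discipline in plain scikit-learn terms (which is what `df_scaler`'s `return_scaler` enables):

```python
# The leakage fix in sklearn terms: fit the scaler on x_train only, then
# reuse the fitted object on x_true; the toy frames are illustrative.
import pandas as pd
from sklearn.preprocessing import StandardScaler

x_train = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [10.0, 20.0, 30.0]})
x_true = pd.DataFrame({"a": [4.0, 5.0], "b": [40.0, 50.0]})

scaler_ = StandardScaler().fit(x_train)          # statistics from train only
x_train_s = pd.DataFrame(scaler_.transform(x_train), columns=x_train.columns)
x_true_s = pd.DataFrame(scaler_.transform(x_true), columns=x_true.columns)
print(x_true_s)  # scaled with the *training* mean/std
```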
@@ -2516,18 +2606,261 @@ def predict(
     if isinstance(y_train, np.ndarray):
         y_true = ips.df_encoder(data=pd.DataFrame(y_true), method="label")
         y_true = np.asarray(y_true)
-    # Hyperparameter grids for tuning
-    param_grid_common_xgb = {
-        'learning_rate': [0.01, 0.1, 0.2, 0.3],
-        'max_depth': [3, 5, 7, 10],
-        'n_estimators': [50, 100, 200, 300],
-        'subsample': [0.6, 0.8, 1.0],
-        'colsample_bytree': [0.6, 0.8, 1.0],
-        'gamma': [0, 0.1, 0.2, 0.5],
-        'min_child_weight': [1, 5, 10],
-        'reg_alpha': [0, 0.1, 0.5, 1],  # L1 regularization term
-        'reg_lambda': [1, 1.5, 2],  # L2 regularization term
+    #! at this point we have: x_train, x_true, y_train, y_true
+    # Grid search with KFold or StratifiedKFold
+    if "autogluon_tab" in models:
+        # load hyper_param
+        f_param = os.path.dirname(os.path.abspath(__file__))
+        f_param = f_param + "/data/hyper_param_autogluon_zeroshot2024.json"
+        with open(f_param, "r") as file:
+            hyper_param_autogluon = json.load(file)
+        # Train the model with AutoGluon
+        features = x_train.columns.tolist()
+        label = y_train_col_name if y_train_col_name is not None else 'target'
+        df_autogluon = x_train.copy()
+        df_autogluon[label] = y_train
+        autogluon_presets = ["best_quality", "good_quality", "fast_train"]
+        best_clf = TabularPredictor(label=label, path=os.path.join(dir_save, "model_autogluon")).fit(
+            train_data=df_autogluon,
+            presets=ips.strcmp(presets, autogluon_presets)[0],  # 'best_quality', 'good_quality' or 'fast_train'
+            time_limit=time_limit,  # in sec: limit training time (e.g., 3600)
+            num_bag_folds=num_bag_folds,
+            num_stack_levels=num_stack_levels,
+            hyperparameters=hyper_param_autogluon,
+            verbosity=1 if verbose else 0,
+            **kwargs
+        )
+        #! Get the leaderboard
+        gs = {}
+        # Display the leaderboard for reference
+        leaderboard = best_clf.leaderboard()
+        gs['info'] = best_clf.info()
+        # gs["res"] = best_clf
+        gs["features"] = features
+        gs["leaderboard"] = leaderboard
+        best_model_name = leaderboard.iloc[0, 0]  # first row, first column contains the model name
+        # Store the best model and its details in the gs dictionary
+        gs["best_estimator_"] = best_model_name  # the best model's name
+        gs["best_params_"] = best_model_name  # hyperparameters
+        # Make predictions if x_true is provided
+        if x_true is not None:
+            x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
+            gs["predictions"] = best_clf.predict(x_true[features], model=None)  # model=None selects the best
+            gs["predict_proba"] = best_clf.predict_proba(x_true[features]) if purpose == 'classification' else None
+            x_true[label] = gs["predictions"]
+            if gs["predictions"].value_counts().shape[0] > 1:
+                gs['evaluate'] = best_clf.evaluate(x_true[features + [label]])
+            gs["models"] = leaderboard["model"].tolist()  # best_clf.model_names()
+            all_models = gs["models"]
+            model_evaluations = {}
+            for model in all_models:
+                predictions = best_clf.predict(x_true[features], model=model)
+                evaluation = best_clf.evaluate_predictions(
+                    y_true=x_true[label],  # true labels
+                    y_pred=predictions,  # predictions from the specific model
+                    auxiliary_metrics=True,  # include additional metrics if needed
+                )
+                model_evaluations[model] = evaluation
+            gs["scores"] = pd.DataFrame.from_dict(model_evaluations, orient='index')
+            #! try to keep the same result format
+            results = {}
+            for model in all_models:
+                y_pred = best_clf.predict(x_true[features], model=model).tolist()
+                y_pred_proba = best_clf.predict_proba(x_true[features], model=model) if purpose == 'classification' else None
+
+                if isinstance(y_pred_proba, pd.DataFrame):
+                    y_pred_proba = y_pred_proba.iloc[:, 1]
+
+                # try to make the prediction format consistent
+                try:
+                    y_pred = [i[0] for i in y_pred]
+                except:
+                    pass
+                try:
+                    y_true = [i[0] for i in y_true]
+                except:
+                    pass
+                try:
+                    y_train = [i[0] for i in y_train]
+                except:
+                    pass
+                validation_scores = {}
+                if y_true is not None and y_pred_proba is not None:
+                    validation_scores = cal_metrics(
+                        y_true,
+                        y_pred,
+                        y_pred_proba=y_pred_proba,
+                        is_binary=is_binary,
+                        purpose=purpose,
+                        average="weighted",
+                    )
+                    if is_binary:
+                        # Calculate ROC curve
+                        # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
+                        if y_pred_proba is not None:
+                            # fpr, tpr, roc_auc = dict(), dict(), dict()
+                            fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
+                            lower_ci, upper_ci = cal_auc_ci(
+                                y_true, y_pred_proba, verbose=False, is_binary=is_binary
+                            )
+                            roc_auc = auc(fpr, tpr)
+                            roc_info = {
+                                "fpr": fpr.tolist(),
+                                "tpr": tpr.tolist(),
+                                "auc": roc_auc,
+                                "ci95": (lower_ci, upper_ci),
+                            }
+                            # precision-recall curve
+                            precision_, recall_, _ = cal_precision_recall(y_true, y_pred_proba)
+                            avg_precision_ = average_precision_score(y_true, y_pred_proba)
+                            pr_info = {
+                                "precision": precision_,
+                                "recall": recall_,
+                                "avg_precision": avg_precision_,
+                            }
+                        else:
+                            roc_info, pr_info = None, None
+                        if purpose == "classification":
+                            results[model] = {
+                                # "best_clf": gs.best_estimator_,
+                                # "best_params": gs.best_params_,
+                                # "auc_indiv": [
+                                #     gs.cv_results_[f"split{i}_test_score"][gs.best_index_]
+                                #     for i in range(cv_folds)
+                                # ],
+                                "scores": validation_scores,
+                                "roc_curve": roc_info,
+                                "pr_curve": pr_info,
+                                "confusion_matrix": confusion_matrix(y_true, y_pred),
+                                "predictions": y_pred,  # .tolist(),
+                                "predictions_proba": (
+                                    y_pred_proba.tolist() if y_pred_proba is not None else None
+                                ),
+                                "features": features,
+                                # "coef": coef_,
+                                # "alphas": alphas_
+                            }
+                        else:  # "regression"
+                            results[model] = {
+                                # "best_clf": gs.best_estimator_,
+                                # "best_params": gs.best_params_,
+                                "scores": validation_scores,  # e.g., neg_MSE, R², etc.
+                                "predictions": y_pred,  # .tolist(),
+                                "predictions_proba": (
+                                    y_pred_proba.tolist() if y_pred_proba is not None else None
+                                ),
+                                "features": features,
+                                # "coef": coef_,
+                                # "alphas": alphas_
+                            }
+                    else:  # multi-class
+                        if y_pred_proba is not None:
+                            # fpr, tpr, roc_auc = dict(), dict(), dict()
+                            # fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
+                            confidence_intervals = cal_auc_ci(
+                                y_true, y_pred_proba, verbose=False, is_binary=is_binary
+                            )
+                            roc_info = {
+                                "fpr": validation_scores["fpr"],
+                                "tpr": validation_scores["tpr"],
+                                "auc": validation_scores["roc_auc_by_class"],
+                                "ci95": confidence_intervals,
+                            }
+                            # precision-recall curve
+                            precision_, recall_, avg_precision_ = cal_precision_recall(
+                                y_true, y_pred_proba, is_binary=is_binary
+                            )
+                            pr_info = {
+                                "precision": precision_,
+                                "recall": recall_,
+                                "avg_precision": avg_precision_,
+                            }
+                        else:
+                            roc_info, pr_info = None, None
+
+                        if purpose == "classification":
+                            results[model] = {
+                                # "best_clf": gs.best_estimator_,
+                                # "best_params": gs.best_params_,
+                                # "auc_indiv": [
+                                #     gs.cv_results_[f"split{i}_test_score"][gs.best_index_]
+                                #     for i in range(cv_folds)
+                                # ],
+                                "scores": validation_scores,
+                                "roc_curve": roc_info,
+                                "pr_curve": pr_info,
+                                "confusion_matrix": confusion_matrix(y_true, y_pred),
+                                "predictions": y_pred,  # .tolist(),
+                                "predictions_proba": (
+                                    y_pred_proba.tolist() if y_pred_proba is not None else None
+                                ),
+                                "features": features,
+                                # "coef": coef_,
+                                # "alphas": alphas_
+                            }
+                        else:  # "regression"
+                            results[model] = {
+                                # "best_clf": gs.best_estimator_,
+                                # "best_params": gs.best_params_,
+                                "scores": validation_scores,  # e.g., neg_MSE, R², etc.
+                                "predictions": y_pred,  # .tolist(),
+                                "predictions_proba": (
+                                    y_pred_proba.tolist() if y_pred_proba is not None else None
+                                ),
+                                "features": features,
+                                # "coef": coef_,
+                                # "alphas": alphas_
+                            }
+
+                else:
+                    if y_true is None:
+                        validation_scores = []
+                    else:
+                        validation_scores = cal_metrics(
+                            y_true,
+                            y_pred,
+                            y_pred_proba=y_pred_proba,
+                            is_binary=is_binary,
+                            purpose=purpose,
+                            average="weighted",
+                        )
+                    results[model] = {
+                        # "best_clf": gs.best_estimator_,
+                        # "best_params": gs.best_params_,
+                        "scores": validation_scores,
+                        "predictions": y_pred,  # .tolist(),
+                        "predictions_proba": (
+                            y_pred_proba.tolist() if y_pred_proba is not None else None
+                        ),
+                        "features": features,
+                        "y_train": y_train if y_train is not None else [],
+                        "y_true": y_true if y_true is not None else [],
+                        # "coef": coef_,
+                        # "alphas": alphas_
                     }
+            df_results = pd.DataFrame.from_dict(results, orient="index")
+            gs['res'] = df_results
+
+            if all([plot_, y_true is not None, purpose == "classification"]):
+                from datetime import datetime
+
+                now_ = datetime.now().strftime("%y%m%d_%H%M%S")
+                # try:
+                if df_results.shape[0] > 3:
+                    try:
+                        plot_validate_features(df_results, is_binary=is_binary)
+                    except Exception as e:
+                        print(e)
+                else:
+                    try:
+                        plot_validate_features_single(df_results, is_binary=is_binary)
+                    except Exception as e:
+                        print(e)
+                if dir_save:
+                    ips.figsave(dir_save + f"validate_features{now_}.pdf")
+        return gs
+
+    #! cross-validation
     if cv_level in ["low", "simple", "s", "l"]:
         param_grids = {
             "Random Forest": (
@@ -2696,7 +3029,73 @@ def predict(
             'alpha': [0.1],
             'max_iter': [100],},
         "Poisson":{'alpha': [0.1],
-            'max_iter': [100],}
+            'max_iter': [100],},
+        "Lars": {"n_nonzero_coefs": [10, 50, None]},
+        "LassoLars": {
+            "alpha": [0.01, 0.1, 1]
+        },
+        "BayesianRidge": {
+            "alpha_1": [1e-6, 1e-4, 1e-2],
+            "lambda_1": [1e-6, 1e-4, 1e-2]
+        },
+        "GammaRegressor": {
+            "alpha": [0.1, 1, 10]
+        },
+        "TweedieRegressor": {
+            "alpha": [0.1, 1, 10],
+            "power": [1, 1.5, 2]
+        },
+        "LassoCV": {
+            "cv": [5]
+        },
+        "ElasticNetCV": {
+            "l1_ratio": [0.2, 0.5, 0.8],
+            "cv": [5]
+        },
+        "LassoLarsCV": {
+            "cv": [5]
+        },
+        "LarsCV": {
+            "cv": [5]
+        },
+        "OrthogonalMatchingPursuit": {
+            "n_nonzero_coefs": [10, 50, None]
+        },
+        "OrthogonalMatchingPursuitCV": {
+            "cv": [5]
+        },
+        "PassiveAggressiveRegressor": {
+            "C": [0.1, 1, 10]
+        },
+        "LinearSVR": {
+            "C": [0.1, 1, 10]
+        },
+        "NuSVR": {
+            "C": [0.1, 1, 10]
+        },
+        "DecisionTreeRegressor": {
+            "max_depth": [5, 10, None]
+        },
+        "ExtraTreeRegressor": {
+            "max_depth": [5, 10, None]
+        },
+        "HistGradientBoostingRegressor": {
+            "learning_rate": [0.05, 0.1, 0.2],
+            "max_depth": [5, 10, None]
+        },
+        "GaussianProcessRegressor": {
+            "alpha": [1e-5, 1e-2, 0.1]
+        },
+        "KernelRidge": {
+            "alpha": [0.1, 1, 10],
+            "kernel": ["linear", "rbf"]
+        },
+        "DummyRegressor": {
+            "strategy": ["mean", "median"]
+        },
+        "TransformedTargetRegressor": {
+            "regressor__fit_intercept": [True, False]
+        }
     }
     elif cv_level in ["high", "advanced", "h"]:
         param_grids = {
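For reference, these per-model grids are consumed by `GridSearchCV` further down; the "low" level keeps them tiny (often one candidate per parameter) so the search stays cheap. How one of the new entries is exercised, in isolation on synthetic data:

```python
# How one of the new "low"-level grids is consumed downstream; the
# synthetic regression data is illustrative only.
from sklearn.datasets import make_regression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVR

X, y = make_regression(n_samples=200, n_features=5, noise=0.1, random_state=1)

param_grids = {"LinearSVR": {"C": [0.1, 1, 10]}}  # entry from the diff
gs = GridSearchCV(
    LinearSVR(random_state=1, max_iter=10000),
    param_grid=param_grids["LinearSVR"],
    scoring="neg_mean_squared_error",
    cv=5,
)
gs.fit(X, y)
print(gs.best_params_, gs.best_score_)
```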
@@ -2901,7 +3300,96 @@ def predict(
             'alpha': [0.1, 1.0, 10.0],
             'max_iter': [100, 200, 300],},
         "Poisson":{'alpha': [0.1, 1.0, 10.0],
-            'max_iter': [100, 200, 300],}
+            'max_iter': [100, 200, 300],},
+        "Lars": {
+            "n_nonzero_coefs": [10, 50, 100, 200, None]
+        },
+        "LassoLars": {
+            "alpha": [0.001, 0.01, 0.1, 1, 10]
+        },
+        "BayesianRidge": {
+            "alpha_1": [1e-6, 1e-5, 1e-4],
+            "alpha_2": [1e-6, 1e-5, 1e-4],
+            "lambda_1": [1e-6, 1e-5, 1e-4],
+            "lambda_2": [1e-6, 1e-5, 1e-4]
+        },
+        "GammaRegressor": {
+            "alpha": [0.01, 0.1, 1, 10],
+            "max_iter": [1000, 5000, 10000]
+        },
+        "TweedieRegressor": {
+            "alpha": [0.01, 0.1, 1, 10],
+            "power": [0, 1, 1.5, 2, 3]
+        },
+        "LassoCV": {
+            "alphas": [[0.001, 0.01, 0.1, 1, 10]],
+            "cv": [3, 5, 10]
+        },
+        "ElasticNetCV": {
+            "l1_ratio": [0.1, 0.5, 0.7, 0.9, 1],
+            "alphas": [[0.001, 0.01, 0.1, 1, 10]],
+            "cv": [3, 5, 10]
+        },
+        "LassoLarsCV": {
+            "cv": [3, 5, 10]
+        },
+        "LarsCV": {
+            "cv": [3, 5, 10]
+        },
+        "OrthogonalMatchingPursuit": {
+            "n_nonzero_coefs": [10, 50, 100, 200, None]
+        },
+        "OrthogonalMatchingPursuitCV": {
+            "cv": [3, 5, 10]
+        },
+        "PassiveAggressiveRegressor": {
+            "C": [0.01, 0.1, 1, 10],
+            "max_iter": [1000, 5000, 10000],
+            "early_stopping": [True, False]
+        },
+        "LinearSVR": {
+            "C": [0.01, 0.1, 1, 10],
+            "epsilon": [0.01, 0.1, 1],
+            "max_iter": [1000, 5000, 10000]
+        },
+        "NuSVR": {
+            "C": [0.01, 0.1, 1, 10],
+            "nu": [0.25, 0.5, 0.75],
+            "kernel": ["linear", "poly", "rbf", "sigmoid"]
+        },
+        "DecisionTreeRegressor": {
+            "max_depth": [None, 5, 10, 20],
+            "min_samples_split": [2, 5, 10],
+            "min_samples_leaf": [1, 2, 4]
+        },
+        "ExtraTreeRegressor": {
+            "max_depth": [None, 5, 10, 20],
+            "min_samples_split": [2, 5, 10],
+            "min_samples_leaf": [1, 2, 4]
+        },
+        "HistGradientBoostingRegressor": {
+            "learning_rate": [0.01, 0.1, 0.2],
+            "max_iter": [100, 500, 1000],
+            "max_depth": [None, 5, 10, 20],
+            "min_samples_leaf": [1, 2, 4]
+        },
+        "GaussianProcessRegressor": {
+            "alpha": [1e-10, 1e-5, 1e-2, 0.1],
+            "n_restarts_optimizer": [0, 1, 5, 10]
+        },
+        "KernelRidge": {
+            "alpha": [0.01, 0.1, 1, 10],
+            "kernel": ["linear", "poly", "rbf", "sigmoid"],
+            "degree": [2, 3, 4]
+        },
+        "DummyRegressor": {
+            "strategy": ["mean", "median", "constant"],
+            "constant": [0]  # Only if strategy is 'constant'
+        },
+        "TransformedTargetRegressor": {
+            # Grid for the underlying regressor, example shown for LinearRegression
+            "regressor__fit_intercept": [True, False]
+        }
     }
     else:  # median level
         param_grids = {
@@ -3148,7 +3636,96 @@ def predict(
             'alpha': [0.1, 1.0],
             'max_iter': [100, 200],},
         "Poisson":{'alpha': [0.1, 1.0],
-            'max_iter': [100, 200],}
+            'max_iter': [100, 200],},
+        "Lars": {
+            "n_nonzero_coefs": [10, 50, 100, 200, None]
+        },
+        "LassoLars": {
+            "alpha": [0.001, 0.01, 0.1, 1, 10]
+        },
+        "BayesianRidge": {
+            "alpha_1": [1e-6, 1e-5, 1e-4],
+            "alpha_2": [1e-6, 1e-5, 1e-4],
+            "lambda_1": [1e-6, 1e-5, 1e-4],
+            "lambda_2": [1e-6, 1e-5, 1e-4]
+        },
+        "GammaRegressor": {
+            "alpha": [0.01, 0.1, 1, 10],
+            "max_iter": [1000, 5000, 10000]
+        },
+        "TweedieRegressor": {
+            "alpha": [0.01, 0.1, 1, 10],
+            "power": [0, 1, 1.5, 2, 3]
+        },
+        "LassoCV": {
+            "alphas": [[0.001, 0.01, 0.1, 1, 10]],
+            "cv": [3, 5, 10]
+        },
+        "ElasticNetCV": {
+            "l1_ratio": [0.1, 0.5, 0.7, 0.9, 1],
+            "alphas": [[0.001, 0.01, 0.1, 1, 10]],
+            "cv": [3, 5, 10]
+        },
+        "LassoLarsCV": {
+            "cv": [3, 5, 10]
+        },
+        "LarsCV": {
+            "cv": [3, 5, 10]
+        },
+        "OrthogonalMatchingPursuit": {
+            "n_nonzero_coefs": [10, 50, 100, 200, None]
+        },
+        "OrthogonalMatchingPursuitCV": {
+            "cv": [3, 5, 10]
+        },
+        "PassiveAggressiveRegressor": {
+            "C": [0.01, 0.1, 1, 10],
+            "max_iter": [1000, 5000, 10000],
+            "early_stopping": [True, False]
+        },
+        "LinearSVR": {
+            "C": [0.01, 0.1, 1, 10],
+            "epsilon": [0.01, 0.1, 1],
+            "max_iter": [1000, 5000, 10000]
+        },
+        "NuSVR": {
+            "C": [0.01, 0.1, 1, 10],
+            "nu": [0.25, 0.5, 0.75],
+            "kernel": ["linear", "poly", "rbf", "sigmoid"]
+        },
+        "DecisionTreeRegressor": {
+            "max_depth": [None, 5, 10, 20],
+            "min_samples_split": [2, 5, 10],
+            "min_samples_leaf": [1, 2, 4]
+        },
+        "ExtraTreeRegressor": {
+            "max_depth": [None, 5, 10, 20],
+            "min_samples_split": [2, 5, 10],
+            "min_samples_leaf": [1, 2, 4]
+        },
+        "HistGradientBoostingRegressor": {
+            "learning_rate": [0.01, 0.1, 0.2],
+            "max_iter": [100, 500, 1000],
+            "max_depth": [None, 5, 10, 20],
+            "min_samples_leaf": [1, 2, 4]
+        },
+        "GaussianProcessRegressor": {
+            "alpha": [1e-10, 1e-5, 1e-2, 0.1],
+            "n_restarts_optimizer": [0, 1, 5, 10]
+        },
+        "KernelRidge": {
+            "alpha": [0.01, 0.1, 1, 10],
+            "kernel": ["linear", "poly", "rbf", "sigmoid"],
+            "degree": [2, 3, 4]
+        },
+        "DummyRegressor": {
+            "strategy": ["mean", "median", "constant"],
+            "constant": [0]  # Only if strategy is 'constant'
+        },
+        "TransformedTargetRegressor": {
+            # Grid for the underlying regressor, example shown for LinearRegression
+            "regressor__fit_intercept": [True, False]
+        }
     }
 
     results = {}
@@ -3158,7 +3735,7 @@ def predict(
         if purpose == "classification"
         else KFold(n_splits=cv_folds, shuffle=True, random_state=random_state)
     )
-
+
     # Train and validate each model
     for name, clf in tqdm(
         models.items(),
@@ -3168,83 +3745,132 @@ def predict(
     ):
         if verbose:
             print(f"\nTraining and validating {name}:")
-
-        # Grid search with KFold or StratifiedKFold
-        if is_binary:
-            gs = GridSearchCV(
-                clf,
-                param_grid=param_grids.get(name, {}),
-                scoring=(
-                    "roc_auc"
-                    if purpose == "classification"
-                    else "neg_mean_squared_error"
-                ),
-                cv=cv,
-                n_jobs=n_jobs,
-                verbose=verbose,
-            )
-
-            gs.fit(x_train, y_train)
-            best_clf = gs.best_estimator_
-            # make sure x_train and x_test have the same columns
-            x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
-            y_pred = best_clf.predict(x_true)
-            if hasattr(best_clf, "predict_proba"):
-                y_pred_proba = best_clf.predict_proba(x_true)
-                print("Shape of predicted probabilities:", y_pred_proba.shape)
-                if y_pred_proba.shape[1] == 1:
-                    y_pred_proba = np.hstack(
-                        [1 - y_pred_proba, y_pred_proba]
-                    )  # Add missing class probabilities
-                y_pred_proba = y_pred_proba[:, 1]
-            elif hasattr(best_clf, "decision_function"):
-                # If predict_proba is not available, use decision_function (e.g., for SVM)
-                y_pred_proba = best_clf.decision_function(x_true)
-                # Ensure y_pred_proba is within 0 and 1 bounds
-                y_pred_proba = (y_pred_proba - y_pred_proba.min()) / (
-                    y_pred_proba.max() - y_pred_proba.min()
+        try:
+            if is_binary:
+                gs = GridSearchCV(
+                    clf,
+                    param_grid=param_grids.get(name, {}),
+                    scoring=(
+                        "roc_auc"
+                        if purpose == "classification"
+                        else "neg_mean_squared_error"
+                    ),
+                    cv=cv,
+                    n_jobs=n_jobs,
+                    verbose=verbose,
                 )
-        else:
-            y_pred_proba = None  # No probability output for certain models
-        else:
-            gs = GridSearchCV(
-                clf,
-                param_grid=param_grids.get(name, {}),
-                scoring=(
-                    "roc_auc_ovr"
-                    if purpose == "classification"
-                    else "neg_mean_squared_error"
-                ),
-                cv=cv,
-                n_jobs=n_jobs,
-                verbose=verbose,
-            )
 
-            # Fit GridSearchCV
-            gs.fit(x_train, y_train)
-            best_clf = gs.best_estimator_
-
-            # Ensure x_true aligns with x_train columns
-            x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
-            y_pred = best_clf.predict(x_true)
-
-            # Handle prediction probabilities for multiclass
-            if hasattr(best_clf, "predict_proba"):
-                y_pred_proba = best_clf.predict_proba(x_true)
-            elif hasattr(best_clf, "decision_function"):
-                y_pred_proba = best_clf.decision_function(x_true)
-
-                # Normalize for multiclass if necessary
-                if y_pred_proba.ndim == 2:
-                    y_pred_proba = (
-                        y_pred_proba - y_pred_proba.min(axis=1, keepdims=True)
-                    ) / (
-                        y_pred_proba.max(axis=1, keepdims=True)
-                        - y_pred_proba.min(axis=1, keepdims=True)
+                gs.fit(x_train, y_train)
+                best_clf = gs.best_estimator_
+
+                # make sure x_train and x_test have the same columns
+                x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
+                y_pred = best_clf.predict(x_true)
+                if hasattr(best_clf, "predict_proba"):
+                    y_pred_proba = best_clf.predict_proba(x_true)
+                    print("Shape of predicted probabilities:", y_pred_proba.shape)
+                    if y_pred_proba.shape[1] == 1:
+                        y_pred_proba = np.hstack(
+                            [1 - y_pred_proba, y_pred_proba]
+                        )  # Add missing class probabilities
+                    if y_pred_proba.shape[1] == 2:
+                        if isinstance(y_pred_proba, pd.DataFrame):
+                            y_pred_proba = y_pred_proba.iloc[:, 1]
+                        elif isinstance(y_pred_proba, pd.Series):
+                            y_pred_proba = y_pred_proba.values[:, 1]
+                        else:
+                            y_pred_proba = y_pred_proba[:, 1]
+
+                elif hasattr(best_clf, "decision_function"):
+                    # If predict_proba is not available, use decision_function (e.g., for SVM)
+                    y_pred_proba = best_clf.decision_function(x_true)
+                    # Ensure y_pred_proba is within 0 and 1 bounds
+                    y_pred_proba = (y_pred_proba - y_pred_proba.min()) / (
+                        y_pred_proba.max() - y_pred_proba.min()
                     )
-        else:
-            y_pred_proba = None  # No probability output for certain models
+                else:
+                    y_pred_proba = None  # No probability output for certain models
+                # Access alphas if applicable (e.g., ElasticNetCV, LassoCV);
+                # always bind alphas_/coef_ so the results dicts below can reference them
+                if hasattr(best_clf, "alphas_"):
+                    alphas_ = best_clf.alphas_
+                elif hasattr(best_clf, "alpha_"):
+                    alphas_ = best_clf.alpha_
+                elif hasattr(best_clf, "Cs_"):
+                    alphas_ = best_clf.Cs_
+                else:
+                    alphas_ = None
+                coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
+            else:
+                gs = GridSearchCV(
+                    clf,
+                    param_grid=param_grids.get(name, {}),
+                    scoring=(
+                        "roc_auc_ovr"
+                        if purpose == "classification"
+                        else "neg_mean_squared_error"
+                    ),
+                    cv=cv,
+                    n_jobs=n_jobs,
+                    verbose=verbose,
+                )
 
+                # Fit GridSearchCV
+                gs.fit(x_train, y_train)
+                best_clf = gs.best_estimator_
+
+                # Ensure x_true aligns with x_train columns
+                x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
+
+                # do we need to fit x_train, y_train again?
+                best_clf = best_clf.fit(x_train, y_train)
+                y_pred = best_clf.predict(x_true)
+
+                # Handle prediction probabilities for multiclass
+                if hasattr(best_clf, "predict_proba"):
+                    y_pred_proba = best_clf.predict_proba(x_true)
+                elif hasattr(best_clf, "decision_function"):
+                    y_pred_proba = best_clf.decision_function(x_true)
+
+                    # Normalize for multiclass if necessary
+                    if y_pred_proba.ndim == 2:
+                        y_pred_proba = (
+                            y_pred_proba - y_pred_proba.min(axis=1, keepdims=True)
+                        ) / (
+                            y_pred_proba.max(axis=1, keepdims=True)
+                            - y_pred_proba.min(axis=1, keepdims=True)
+                        )
+                else:
+                    y_pred_proba = None  # No probability output for certain models
+                # Access alphas if applicable (e.g., ElasticNetCV, LassoCV);
+                # always bind alphas_/coef_ so the results dicts below can reference them
+                if hasattr(best_clf, "alphas_"):
+                    alphas_ = best_clf.alphas_
+                elif hasattr(best_clf, "alpha_"):
+                    alphas_ = best_clf.alpha_
+                elif hasattr(best_clf, "Cs_"):
+                    alphas_ = best_clf.Cs_
+                else:
+                    alphas_ = None
+                coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
+        except Exception as e:
+            alphas_, coef_ = None, None
+            print(f"skipped {clf}: {e}")
+            continue
+
+        # try to make the prediction format consistent
+        try:
+            y_pred = [i[0] for i in y_pred]
+        except:
+            pass
+        try:
+            y_true = [i[0] for i in y_true]
+        except:
+            pass
+        try:
+            y_train = [i[0] for i in y_train]
+        except:
+            pass
         validation_scores = {}
 
         if y_true is not None and y_pred_proba is not None:
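Within the `try` block above, probabilities come from a fallback chain: prefer `predict_proba`, pad a single-column output with its complement, and otherwise min-max rescale `decision_function` into [0, 1]. The chain in isolation — an `SVC` without `probability=True` exposes only `decision_function`, so the second branch fires:

```python
# The probability-fallback chain from the hunk above, in isolation;
# the synthetic classification data is illustrative.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.svm import SVC

X, y = make_classification(n_samples=200, random_state=1)
clf = SVC(probability=False).fit(X, y)  # no predict_proba available

if hasattr(clf, "predict_proba"):
    y_pred_proba = clf.predict_proba(X)[:, 1]
elif hasattr(clf, "decision_function"):
    scores = clf.decision_function(X)
    # min-max rescale so downstream metrics get values in [0, 1]
    y_pred_proba = (scores - scores.min()) / (scores.max() - scores.min())
else:
    y_pred_proba = None

print(y_pred_proba.min(), y_pred_proba.max())  # bounded in [0, 1]
```

Note the rescaled decision scores are not calibrated probabilities; they merely share the [0, 1] range that ranking metrics such as ROC AUC tolerate.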
@@ -3294,20 +3920,26 @@ def predict(
                     "roc_curve": roc_info,
                     "pr_curve": pr_info,
                     "confusion_matrix": confusion_matrix(y_true, y_pred),
-                    "predictions": y_pred.tolist(),
+                    "predictions": y_pred,  # .tolist(),
                     "predictions_proba": (
                         y_pred_proba.tolist() if y_pred_proba is not None else None
                     ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
                 }
             else:  # "regression"
                 results[name] = {
                     "best_clf": gs.best_estimator_,
                     "best_params": gs.best_params_,
                     "scores": validation_scores,  # e.g., neg_MSE, R², etc.
-                    "predictions": y_pred.tolist(),
+                    "predictions": y_pred,  # .tolist(),
                     "predictions_proba": (
                         y_pred_proba.tolist() if y_pred_proba is not None else None
                     ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
                 }
         else:  # multi-classes
             if y_pred_proba is not None:
@@ -3346,20 +3978,26 @@ def predict(
                     "roc_curve": roc_info,
                     "pr_curve": pr_info,
                     "confusion_matrix": confusion_matrix(y_true, y_pred),
-                    "predictions": y_pred.tolist(),
+                    "predictions": y_pred,  # .tolist(),
                     "predictions_proba": (
                         y_pred_proba.tolist() if y_pred_proba is not None else None
                     ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
                 }
             else:  # "regression"
                 results[name] = {
                     "best_clf": gs.best_estimator_,
                     "best_params": gs.best_params_,
                     "scores": validation_scores,  # e.g., neg_MSE, R², etc.
-                    "predictions": y_pred.tolist(),
+                    "predictions": y_pred,  # .tolist(),
                     "predictions_proba": (
                         y_pred_proba.tolist() if y_pred_proba is not None else None
                     ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
                 }
 
     else:
@@ -3378,17 +4016,21 @@ def predict(
                 "best_clf": gs.best_estimator_,
                 "best_params": gs.best_params_,
                 "scores": validation_scores,
-                "predictions": y_pred.tolist(),
+                "predictions": y_pred,  # .tolist(),
                 "predictions_proba": (
                     y_pred_proba.tolist() if y_pred_proba is not None else None
                 ),
+                "features": share_col_names,
                 "y_train": y_train if y_train is not None else [],
                 "y_true": y_true if y_true is not None else [],
+                "coef": coef_,
+                "alphas": alphas_
             }
 
     # Convert results to DataFrame
     df_results = pd.DataFrame.from_dict(results, orient="index")
-    # sort
+    display(df_results)
+    # sort
     if y_true is not None:
         if purpose == "classification":
             df_scores = pd.DataFrame(
@@ -3446,7 +4088,7 @@ def predict(
  for i, j in top_models.to_dict().items():
  base_estimators.append((i, j))
  if stacking_cv:
- print(f" ⤵ stacking_cv is processing...")
+ print("⤵ stacking_cv is processing...")
  #* define a few symbolic candidate final_estimators
  # the alternatives to choose from
  if purpose == "classification":
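The candidate final_estimators referred to above are compared by cross-validation before stacking. A minimal sketch of that selection step; the candidate list and the helper name are illustrative, not ml2ls's exact choices:

    import numpy as np
    from sklearn.ensemble import RandomForestClassifier, StackingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_val_score

    def pick_final_estimator(base_estimators, X, y, cv=5):
        """Return the candidate final_estimator with the best CV score."""
        candidates = [LogisticRegression(max_iter=1000), RandomForestClassifier()]
        scores = [
            cross_val_score(
                StackingClassifier(estimators=base_estimators,
                                   final_estimator=est, cv=cv),
                X, y, cv=cv,
            ).mean()
            for est in candidates
        ]
        return candidates[int(np.argmax(scores))]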
@@ -3520,7 +4162,7 @@ def predict(
  best_final_estimator = cv_results_df.iloc[0]['final_estimator']
  print(f"Best final estimator based on cross-validation: {best_final_estimator}")
  else:
- print(f" ⤵ trying to find the best_final_estimator for stacking...")
+ print("⤵ trying to find the best_final_estimator for stacking...")
  if purpose == "classification":
  best_final_estimator = LogisticRegression(class_weight=class_weight,
  random_state=random_state,
@@ -3530,26 +4172,25 @@ def predict(
  print(f"⤵ the best final_estimator: {best_final_estimator}")
  #! apply stacking
  if purpose == "classification":
- print(f" ⤵ StackingClassifier...")
+ print("⤵ StackingClassifier...")
  stacking_model = StackingClassifier(estimators=base_estimators,
  final_estimator=best_final_estimator,
  cv=cv)
  else:
- print(f" ⤵ StackingRegressor...")
+ print("⤵ StackingRegressor...")
  stacking_model = StackingRegressor(estimators=base_estimators,
  final_estimator=best_final_estimator,
  cv=cv)

  # Train the stacking model
- print(f" ⤵ fit & predict...")
+ print("⤵ fit & predict...")
  stacking_model.fit(x_train, y_train)
  y_pred_final = stacking_model.predict(x_true)
- print(f" ⤵ collecting results...")
+ print("⤵ collecting results...")
  # pred_proba
  if is_binary:
  if hasattr(stacking_model, "predict_proba"):
  y_pred_proba_final = stacking_model.predict_proba(x_true)
- print("Shape of predicted probabilities:", y_pred_proba_final.shape)
  if y_pred_proba_final.shape[1] == 1:
  y_pred_proba_final = np.hstack(
  [1 - y_pred_proba_final, y_pred_proba_final]
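Some binary classifiers return a single probability column from predict_proba; the np.hstack call above pads it back to the conventional two-column [P(class 0), P(class 1)] layout. The same idea in isolation, with a hypothetical helper name:

    import numpy as np

    def as_two_columns(proba):
        """Ensure binary class probabilities have shape (n, 2)."""
        proba = np.asarray(proba, dtype=float)
        if proba.ndim == 1:
            proba = proba.reshape(-1, 1)
        if proba.shape[1] == 1:
            proba = np.hstack([1 - proba, proba])  # [P(0), P(1)]
        return proba

    as_two_columns([0.9, 0.2])  # -> [[0.1, 0.9], [0.8, 0.2]]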
@@ -3564,6 +4205,17 @@ def predict(
  )
  else:
  y_pred_proba_final = None  # No probability output for certain models
+ # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+ if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+ if hasattr(best_clf, "alphas_"):
+ alphas_ = best_clf.alphas_
+ elif hasattr(best_clf, "alpha_"):
+ alphas_ = best_clf.alpha_
+ elif hasattr(best_clf, "Cs_"):
+ alphas_ = best_clf.Cs_
+ else:
+ alphas_ = None
+ coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
  if not is_binary:
  # Handle prediction probabilities for multiclass
  if hasattr(stacking_model, "predict_proba"):
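This attribute-probing block is pasted three times in this change set (stacking binary, stacking multi-class, and voting below). A hedged consolidation into one helper; the function name is hypothetical, and it slightly widens the original guard so estimators exposing only alpha_ are covered too (under the original guard, the alpha_ branch can only fire when Cs_ is also present):

    def get_regularization_info(est):
        """Return (coef_, alphas_) from a fitted estimator, None when absent.

        Probes alphas_ (e.g. LassoCV/ElasticNetCV), then alpha_, then Cs_
        (LogisticRegressionCV), keeping the precedence used above.
        """
        alphas = None
        for attr in ("alphas_", "alpha_", "Cs_"):
            if hasattr(est, attr):
                alphas = getattr(est, attr)
                break
        coef = getattr(est, "coef_", None)
        return coef, alphas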
@@ -3581,6 +4233,17 @@ def predict(
  )
  else:
  y_pred_proba_final = None  # No probability output for certain models
+ # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+ if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+ if hasattr(best_clf, "alphas_"):
+ alphas_ = best_clf.alphas_
+ elif hasattr(best_clf, "alpha_"):
+ alphas_ = best_clf.alpha_
+ elif hasattr(best_clf, "Cs_"):
+ alphas_ = best_clf.Cs_
+ else:
+ alphas_ = None
+ coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
  #! dict_pred_stack
  dict_pred_stack = {}
  validation_scores_final = {}
@@ -3631,6 +4294,9 @@ def predict(
  "predictions_proba": (
  y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
  ),
+ "features": share_col_names,
+ "coef": coef_,
+ "alphas": alphas_,
  }
  else:  # "regression"
  dict_pred_stack = {
@@ -3641,6 +4307,9 @@ def predict(
  "predictions_proba": (
  y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
  ),
+ "features": share_col_names,
+ "coef": coef_,
+ "alphas": alphas_,
  }
  else:  # multi-class
  if y_pred_proba_final is not None:
@@ -3680,6 +4349,9 @@ def predict(
  "predictions_proba": (
  y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
  ),
+ "features": share_col_names,
+ "coef": coef_,
+ "alphas": alphas_,
  }
  else:  # "regression"
  dict_pred_stack = {
@@ -3690,6 +4362,9 @@ def predict(
  "predictions_proba": (
  y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
  ),
+ "features": share_col_names,
+ "coef": coef_,
+ "alphas": alphas_,
  }

  else:
@@ -3712,8 +4387,11 @@ def predict(
  "predictions_proba": (
  y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
  ),
+ "features": share_col_names,
  "y_train": y_train if y_train is not None else [],
  "y_true": y_true if y_true is not None else [],
+ "coef": coef_,
+ "alphas": alphas_,
  }
  # merge together
  df_pred = pd.DataFrame(
@@ -3728,16 +4406,16 @@ def predict(
  # if dir_save:
  #     ips.figsave(dir_save + f"validate_features_stacking_{now_}.pdf")
  if vote:
- print(f" ⤵ voting...")
+ print("⤵ voting...")
  from sklearn.ensemble import VotingClassifier, VotingRegressor
- #! Votting
+ #! voting
  n_top_models = min(n_top_models, df_results.shape[0])
  base_estimators = []
  for name, cls in zip(list(df_results.iloc[:n_top_models, :].index), df_results.iloc[:n_top_models, :]["best_clf"].tolist()):
  base_estimators.append((name, cls))
  # Apply Voting Classifier/Regressor
  if purpose == "classification":
- print(f" ⤵ VotingClassifier...via{votting}")
+ print(f"⤵ VotingClassifier...via {voting}")
  if voting == 'hard':
  # Hard voting does not support `predict_proba`
  voting_model = VotingClassifier(estimators=base_estimators)
@@ -3745,7 +4423,7 @@ def predict(
  # Soft voting supports `predict_proba`
  voting_model = VotingClassifier(estimators=base_estimators, voting="soft")
  else:
- print(f" ⤵ VotingRegressor...")
+ print("⤵ VotingRegressor...")
  voting_model = VotingRegressor(estimators=base_estimators)

  # Train the Voting Classifier/Regressor
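Hard voting returns only majority labels, so predict_proba is unavailable; soft voting averages the base models' probabilities and supports it, which is why the two modes are constructed separately above. A minimal stand-alone sketch:

    from sklearn.datasets import make_classification
    from sklearn.ensemble import VotingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.tree import DecisionTreeClassifier

    X, y = make_classification(n_samples=200, random_state=1)
    estimators = [("lr", LogisticRegression(max_iter=1000)),
                  ("dt", DecisionTreeClassifier(random_state=1))]

    hard = VotingClassifier(estimators=estimators).fit(X, y)  # majority label only
    soft = VotingClassifier(estimators=estimators, voting="soft").fit(X, y)
    proba = soft.predict_proba(X)  # only available with voting="soft"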
@@ -3770,10 +4448,23 @@ def predict(
  y_pred_proba_vote = y_pred_proba_vote[:, 1]
  else:
  y_pred_proba_vote = None
+
+ # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+ if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+ if hasattr(best_clf, "alphas_"):
+ alphas_ = best_clf.alphas_
+ elif hasattr(best_clf, "alpha_"):
+ alphas_ = best_clf.alpha_
+ elif hasattr(best_clf, "Cs_"):
+ alphas_ = best_clf.Cs_
+ else:
+ alphas_ = None
+ coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
  else:  # Regression
  y_pred_proba_vote = None
+ coef_, alphas_ = None, None

- print(f" ⤵ collecting voting results...")
+ print("⤵ collecting voting results...")
  #! dict_pred_vote
  dict_pred_vote = {}
  validation_scores_vote = {}
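Note that this block probes best_clf, the last grid-searched model, rather than the voting ensemble itself. If the goal were the fitted base models' own coefficients, scikit-learn exposes them on the trained ensemble through its named_estimators_ attribute; a hedged sketch with a hypothetical helper name:

    def collect_coefs(ensemble):
        """Gather coef_ from each fitted base estimator of a fitted
        Voting/Stacking ensemble, keyed by estimator name."""
        return {name: est.coef_
                for name, est in ensemble.named_estimators_.items()
                if hasattr(est, "coef_")}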
@@ -3822,6 +4513,9 @@ def predict(
  "predictions_proba": (
  y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
  ),
+ "features": share_col_names,
+ "coef": coef_,
+ "alphas": alphas_,
  }
  else:  # Multi-class
  if y_pred_proba_vote is not None:
@@ -3856,6 +4550,9 @@ def predict(
  "predictions_proba": (
  y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
  ),
+ "features": share_col_names,
+ "coef": coef_,
+ "alphas": alphas_,
  }
  else:
  if y_true is None:
@@ -3877,6 +4574,7 @@ def predict(
  "predictions_proba": (
  y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
  ),
+ "features": share_col_names,
  "y_train": y_train if y_train is not None else [],
  "y_true": y_true if y_true is not None else [],
  }
@@ -3900,6 +4598,8 @@ def predict(
  df_res = pd.concat([df_vote, df_results], ignore_index=False, axis=0)
  elif stack:
  df_res = pd.concat([df_pred, df_results], ignore_index=False, axis=0)
+ else:
+ df_res = df_results

  if all([plot_, y_true is not None, purpose == "classification"]):
  from datetime import datetime
@@ -3907,9 +4607,15 @@ def predict(
  now_ = datetime.now().strftime("%y%m%d_%H%M%S")
  # try:
  if df_res.shape[0] > 3:
- plot_validate_features(df_res, is_binary=is_binary)
+ try:
+ plot_validate_features(df_res, is_binary=is_binary)
+ except Exception as e:
+ print(e)
  else:
- plot_validate_features_single(df_res, is_binary=is_binary)
+ try:
+ plot_validate_features_single(df_res, is_binary=is_binary)
+ except Exception as e:
+ print(e)
  if dir_save:
  ips.figsave(dir_save + f"validate_features{now_}.pdf")
  # except Exception as e:
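Wrapping each plotting call in try/except keeps a rendering failure from aborting the whole prediction run, while the figure is still saved with a timestamped name when dir_save is set. A generic version of the same guard pattern; the helper name is hypothetical and matplotlib's savefig stands in for py2ls's ips.figsave:

    from datetime import datetime
    import matplotlib.pyplot as plt

    def safe_plot(plot_fn, dir_save=None, **kwargs):
        """Run a plotting callable; report errors instead of raising."""
        try:
            plot_fn(**kwargs)
            if dir_save:
                now_ = datetime.now().strftime("%y%m%d_%H%M%S")
                plt.savefig(f"{dir_save}validate_features{now_}.pdf",
                            bbox_inches="tight")
        except Exception as e:
            print(e)  # continue the run even if plotting fails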