py2ls 0.2.4.25__py3-none-any.whl → 0.2.4.27__py3-none-any.whl

py2ls/ml2ls.py CHANGED
@@ -31,6 +31,7 @@ from sklearn.metrics import (
     average_precision_score,
 )
 from typing import Dict, Any, Optional, List, Union
+import os, json
 import numpy as np
 import pandas as pd
 from . import ips
@@ -49,7 +50,13 @@ logger = logging.getLogger()
 warnings.filterwarnings("ignore", category=UserWarning)
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.neighbors import KNeighborsClassifier
-
+#* set random_state global
+import torch
+import random
+random_state = 1
+random.seed(random_state)
+np.random.seed(random_state)
+torch.manual_seed(random_state)
 
 def features_knn(
     x_train: pd.DataFrame, y_train: pd.Series, knn_params: dict
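The hunk above pins random, NumPy, and PyTorch to random_state=1 at import time. For reference, a minimal sketch of the same idea wrapped in a reusable helper (the helper name is illustrative; assumes torch is installed):

    import random

    import numpy as np
    import torch

    def set_global_seed(seed: int = 1) -> None:
        # seed the three RNGs that py2ls.ml2ls now touches at import time
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            # cover GPU runs too; some CUDA kernels remain nondeterministic anyway
            torch.cuda.manual_seed_all(seed)

    set_global_seed(1)

Seeding at import time makes every consumer of the module share one RNG state; a helper like this keeps the same reproducibility while letting callers opt in per run.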
@@ -594,7 +601,7 @@ def get_features(
     """
     from sklearn.compose import ColumnTransformer
     from sklearn.preprocessing import StandardScaler, OneHotEncoder
-
+    from sklearn.model_selection import train_test_split
     # Ensure X and y are DataFrames/Series for consistency
     if isinstance(X, np.ndarray):
         X = pd.DataFrame(X)
@@ -922,10 +929,26 @@ def get_features(
             "feature_importances": feature_importances,
         }
         if all([plot_, dir_save]):
+
             from datetime import datetime
-
             now_ = datetime.now().strftime("%y%m%d_%H%M%S")
             ips.figsave(dir_save + f"features{now_}.pdf")
+
+            lists = []
+            for tp in ips.flatten(features_df["type"]):
+                lists.append(
+                    features_df
+                    .loc[features_df["type"] == tp, "feature"]
+                    .tolist()
+                )
+            labels = ips.flatten(features_df["type"])
+            # current_fig = plt.gcf()
+            # # ax = current_fig.add_subplot(3, 2, 6)
+            # gs = current_fig.add_gridspec(3, 2)
+            # ax = current_fig.add_subplot(gs[:, :])
+            plt.figure(figsize=[6, 6])
+            plot.venn(lists, labels, cmap="coolwarm")
+            ips.figsave(dir_save + f"features{now_}shared_features.pdf")
     else:
         results = {
             "selected_features": pd.DataFrame(),
@@ -1247,22 +1270,25 @@ def plot_validate_features(res_val, is_binary=True, figsize=None):
         nexttile = plot.subplot(figsize=figsize)
         ax = nexttile(subplot_layout[0], subplot_layout[1])
         for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-            fpr = res_val["roc_curve"][model_name]["fpr"]
-            tpr = res_val["roc_curve"][model_name]["tpr"]
-            (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
-            mean_auc = res_val["roc_curve"][model_name]["auc"]
-            plot_roc_curve(
-                fpr,
-                tpr,
-                mean_auc,
-                lower_ci,
-                upper_ci,
-                model_name=model_name,
-                lw=1.5,
-                color=colors[i],
-                alpha=alpha,
-                ax=ax,
-            )
+            try:
+                fpr = res_val["roc_curve"][model_name]["fpr"]
+                tpr = res_val["roc_curve"][model_name]["tpr"]
+                (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
+                mean_auc = res_val["roc_curve"][model_name]["auc"]
+                plot_roc_curve(
+                    fpr,
+                    tpr,
+                    mean_auc,
+                    lower_ci,
+                    upper_ci,
+                    model_name=model_name,
+                    lw=1.5,
+                    color=colors[i],
+                    alpha=alpha,
+                    ax=ax,
+                )
+            except Exception as e:
+                print(e)
         plot.figsets(
             sp=2,
             legend=dict(
@@ -1277,16 +1303,19 @@ def plot_validate_features(res_val, is_binary=True, figsize=None):
 
         ax = nexttile(subplot_layout[0], subplot_layout[1])
         for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-            plot_pr_curve(
-                recall=res_val["pr_curve"][model_name]["recall"],
-                precision=res_val["pr_curve"][model_name]["precision"],
-                avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
-                model_name=model_name,
-                color=colors[i],
-                lw=1.5,
-                alpha=alpha,
-                ax=ax,
-            )
+            try:
+                plot_pr_curve(
+                    recall=res_val["pr_curve"][model_name]["recall"],
+                    precision=res_val["pr_curve"][model_name]["precision"],
+                    avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
+                    model_name=model_name,
+                    color=colors[i],
+                    lw=1.5,
+                    alpha=alpha,
+                    ax=ax,
+                )
+            except Exception as e:
+                print(e)
         plot.figsets(
             sp=2,
             legend=dict(
@@ -1314,22 +1343,25 @@ def plot_validate_features(res_val, is_binary=True, figsize=None):
         for iclass, class_ in enumerate(classes):
             ax = nexttile(subplot_layout[0], subplot_layout[1])
             for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-                fpr = res_val["roc_curve"][model_name]["fpr"][class_]
-                tpr = res_val["roc_curve"][model_name]["tpr"][class_]
-                (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"][iclass]
-                mean_auc = res_val["roc_curve"][model_name]["auc"][class_]
-                plot_roc_curve(
-                    fpr,
-                    tpr,
-                    mean_auc,
-                    lower_ci,
-                    upper_ci,
-                    model_name=model_name,
-                    lw=1.5,
-                    color=colors[i],
-                    alpha=alpha,
-                    ax=ax,
-                )
+                try:
+                    fpr = res_val["roc_curve"][model_name]["fpr"][class_]
+                    tpr = res_val["roc_curve"][model_name]["tpr"][class_]
+                    (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"][iclass]
+                    mean_auc = res_val["roc_curve"][model_name]["auc"][class_]
+                    plot_roc_curve(
+                        fpr,
+                        tpr,
+                        mean_auc,
+                        lower_ci,
+                        upper_ci,
+                        model_name=model_name,
+                        lw=1.5,
+                        color=colors[i],
+                        alpha=alpha,
+                        ax=ax,
+                    )
+                except Exception as e:
+                    print(e)
             plot.figsets(
                 sp=2,
                 title=class_,
@@ -1345,18 +1377,21 @@ def plot_validate_features(res_val, is_binary=True, figsize=None):
 
             ax = nexttile(subplot_layout[0], subplot_layout[1])
             for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-                plot_pr_curve(
-                    recall=res_val["pr_curve"][model_name]["recall"][iclass],
-                    precision=res_val["pr_curve"][model_name]["precision"][iclass],
-                    avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
-                        iclass
-                    ],
-                    model_name=model_name,
-                    color=colors[i],
-                    lw=1.5,
-                    alpha=alpha,
-                    ax=ax,
-                )
+                try:
+                    plot_pr_curve(
+                        recall=res_val["pr_curve"][model_name]["recall"][iclass],
+                        precision=res_val["pr_curve"][model_name]["precision"][iclass],
+                        avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
+                            iclass
+                        ],
+                        model_name=model_name,
+                        color=colors[i],
+                        lw=1.5,
+                        alpha=alpha,
+                        ax=ax,
+                    )
+                except Exception as e:
+                    print(e)
             plot.figsets(
                 sp=2,
                 title=class_,
@@ -1379,37 +1414,41 @@ def plot_validate_features_single(res_val, figsize=None, is_binary=True):
             len(ips.flatten(res_val["pr_curve"].index)), 3, figsize=figsize
         )
         for model_name in ips.flatten(res_val["pr_curve"].index):
-            fpr = res_val["roc_curve"][model_name]["fpr"]
-            tpr = res_val["roc_curve"][model_name]["tpr"]
-            (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
-            mean_auc = res_val["roc_curve"][model_name]["auc"]
-
-            # Plotting
-            plot_roc_curve(
-                fpr,
-                tpr,
-                mean_auc,
-                lower_ci,
-                upper_ci,
-                model_name=model_name,
-                ax=nexttile(),
-            )
-            plot.figsets(title=model_name, sp=2)
+            try:
+                fpr = res_val["roc_curve"][model_name]["fpr"]
+                tpr = res_val["roc_curve"][model_name]["tpr"]
+                (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
+                mean_auc = res_val["roc_curve"][model_name]["auc"]
 
-            plot_pr_binary(
-                recall=res_val["pr_curve"][model_name]["recall"],
-                precision=res_val["pr_curve"][model_name]["precision"],
-                avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
-                model_name=model_name,
-                ax=nexttile(),
-            )
-            plot.figsets(title=model_name, sp=2)
+                # Plotting
+                plot_roc_curve(
+                    fpr,
+                    tpr,
+                    mean_auc,
+                    lower_ci,
+                    upper_ci,
+                    model_name=model_name,
+                    ax=nexttile(),
+                )
+                plot.figsets(title=model_name, sp=2)
 
-            # plot cm
-            plot_cm(
-                res_val["confusion_matrix"][model_name], ax=nexttile(), normalize=False
-            )
-            plot.figsets(title=model_name, sp=2)
+                plot_pr_binary(
+                    recall=res_val["pr_curve"][model_name]["recall"],
+                    precision=res_val["pr_curve"][model_name]["precision"],
+                    avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
+                    model_name=model_name,
+                    ax=nexttile(),
+                )
+                plot.figsets(title=model_name, sp=2)
+
+                # plot cm
+                plot_cm(
+                    res_val["confusion_matrix"][model_name], ax=nexttile(), normalize=False
+                )
+                plot.figsets(title=model_name, sp=2)
+
+            except Exception as e:
+                print(e)
     else:
 
         modname_tmp = ips.flatten(res_val["roc_curve"].index)[0]
@@ -1424,22 +1463,25 @@ def plot_validate_features_single(res_val, figsize=None, is_binary=True):
         for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
             ax = nexttile()
             for iclass, class_ in enumerate(classes):
-                fpr = res_val["roc_curve"][model_name]["fpr"][class_]
-                tpr = res_val["roc_curve"][model_name]["tpr"][class_]
-                (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"][iclass]
-                mean_auc = res_val["roc_curve"][model_name]["auc"][class_]
-                plot_roc_curve(
-                    fpr,
-                    tpr,
-                    mean_auc,
-                    lower_ci,
-                    upper_ci,
-                    model_name=class_,
-                    lw=1.5,
-                    color=colors[iclass],
-                    alpha=0.03,
-                    ax=ax,
-                )
+                try:
+                    fpr = res_val["roc_curve"][model_name]["fpr"][class_]
+                    tpr = res_val["roc_curve"][model_name]["tpr"][class_]
+                    (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"][iclass]
+                    mean_auc = res_val["roc_curve"][model_name]["auc"][class_]
+                    plot_roc_curve(
+                        fpr,
+                        tpr,
+                        mean_auc,
+                        lower_ci,
+                        upper_ci,
+                        model_name=class_,
+                        lw=1.5,
+                        color=colors[iclass],
+                        alpha=0.03,
+                        ax=ax,
+                    )
+                except Exception as e:
+                    print(e)
             plot.figsets(
                 sp=2,
                 title=model_name,
@@ -1451,18 +1493,21 @@ def plot_validate_features_single(res_val, figsize=None, is_binary=True):
 
             ax = nexttile()
             for iclass, class_ in enumerate(classes):
-                plot_pr_curve(
-                    recall=res_val["pr_curve"][model_name]["recall"][iclass],
-                    precision=res_val["pr_curve"][model_name]["precision"][iclass],
-                    avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
-                        iclass
-                    ],
-                    model_name=class_,
-                    color=colors[iclass],
-                    lw=1.5,
-                    alpha=0.03,
-                    ax=ax,
-                )
+                try:
+                    plot_pr_curve(
+                        recall=res_val["pr_curve"][model_name]["recall"][iclass],
+                        precision=res_val["pr_curve"][model_name]["precision"][iclass],
+                        avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
+                            iclass
+                        ],
+                        model_name=class_,
+                        color=colors[iclass],
+                        lw=1.5,
+                        alpha=0.03,
+                        ax=ax,
+                    )
+                except Exception as e:
+                    print(e)
             plot.figsets(
                 sp=2,
                 title=class_,
@@ -1543,15 +1588,12 @@ def cal_auc_ci(
             # print("Bootstrap #{} ROC area: {:0.3f}".format(i + 1, score))
         sorted_scores = np.array(bootstrapped_scores)
         sorted_scores.sort()
-
-        # Computing the lower and upper bound of the 90% confidence interval
-        # You can change the bounds percentiles to 0.025 and 0.975 to get
-        # a 95% confidence interval instead.
+
         confidence_lower = sorted_scores[int((1 - ci) * len(sorted_scores))]
         confidence_upper = sorted_scores[int(ci * len(sorted_scores))]
         if verbose:
             print(
-                "Confidence interval for the score: [{:0.3f} - {:0.3}]".format(
+                "Confidence interval for the score: [{:0.3f} - {:0.3f}]".format(
                     confidence_lower, confidence_upper
                 )
             )
@@ -1568,11 +1610,8 @@ def cal_auc_ci(
             y_true, classes=np.unique(y_true)
         )  # One-vs-Rest transformation
         n_classes = y_true_bin.shape[1]  # Number of classes
-
-        bootstrapped_scores = np.zeros(
-            (n_classes, n_bootstraps)
-        )  # Store scores for each class
-
+
+        bootstrapped_scores = np.full((n_classes, n_bootstraps), np.nan)
         if verbose:
             print("AUROC scores for each class:")
         for i in range(n_classes):
@@ -1592,15 +1631,24 @@ def cal_auc_ci(
         # Calculating the confidence intervals for each class
         confidence_intervals = []
         for class_idx in range(n_classes):
-            sorted_scores = np.sort(bootstrapped_scores[class_idx])
-            confidence_lower = sorted_scores[int((1 - ci) * len(sorted_scores))]
-            confidence_upper = sorted_scores[int(ci * len(sorted_scores))]
-            confidence_intervals.append((confidence_lower, confidence_upper))
-
-            if verbose:
-                print(
-                    f"Class {class_idx} - Confidence interval: [{confidence_lower:.3f} - {confidence_upper:.3f}]"
-                )
+            # remove NaNs left by skipped bootstrap iterations
+            valid_scores = bootstrapped_scores[class_idx][
+                ~np.isnan(bootstrapped_scores[class_idx])
+            ]
+            if len(valid_scores) > 0:
+                sorted_scores = np.sort(valid_scores)
+                confidence_lower = sorted_scores[int((1 - ci) * len(sorted_scores))]
+                confidence_upper = sorted_scores[int(ci * len(sorted_scores))]
+                # append keeps per-class order; indexed assignment into the
+                # empty list initialized above would raise an IndexError
+                confidence_intervals.append((confidence_lower, confidence_upper))
+
+                if verbose:
+                    print(
+                        f"Class {class_idx} - Confidence interval: [{confidence_lower:.3f} - {confidence_upper:.3f}]"
+                    )
+            else:
+                confidence_intervals.append((np.nan, np.nan))
+                if verbose:
+                    print(f"Class {class_idx} - Confidence interval: [NaN - NaN]")
 
     return confidence_intervals
 
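The reworked loop is a NaN-aware percentile bootstrap: failed resamples stay NaN in the preallocated array and are dropped before the quantiles are read off. A self-contained sketch of that computation (the helper name is illustrative):

    import numpy as np

    def percentile_ci(scores: np.ndarray, ci: float = 0.95) -> tuple:
        # drop NaNs left by skipped bootstrap iterations
        valid = scores[~np.isnan(scores)]
        if len(valid) == 0:
            return (np.nan, np.nan)
        s = np.sort(valid)
        lower = s[int((1 - ci) * len(s))]
        upper = s[min(int(ci * len(s)), len(s) - 1)]
        return (lower, upper)

    rng = np.random.default_rng(1)
    scores = rng.uniform(0.6, 0.9, size=1000)
    scores[::50] = np.nan  # simulate failed resamples
    print(percentile_ci(scores))  # roughly (0.62, 0.89)

Note that the quoted bounds are the (1-ci) and ci empirical quantiles, i.e. a 90% interval when ci=0.95, matching the code above.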
@@ -2057,20 +2105,20 @@ def rank_models(
 
     def generate_bar_plot(ax, cv_test_scores):
         ax = plot.plotxy(
-            y="Classifier", x="combined_score", data=cv_test_scores, kind="bar"
+            y="Classifier", x="combined_score", data=cv_test_scores, kind_="bar"
         )
         plt.title("Classifier Performance")
         plt.tight_layout()
         return plt
 
-    nexttile = plot.subplot(2, 2, figsize=[10, 7])
+    nexttile = plot.subplot(2, 2, figsize=[10, 10])
     generate_bar_plot(nexttile(), top_models.dropna())
     plot.radar(
         ax=nexttile(projection="polar"),
         data=cv_test_scores.set_index("Classifier"),
-        ylim=[0.5, 1],
-        color=plot.get_color(10),
-        alpha=0.05,
+        ylim=[0, 1],
+        color=plot.get_color(cv_test_scores.set_index("Classifier").shape[1]),
+        alpha=0.02,
         circular=1,
     )
     return cv_test_scores
@@ -2216,7 +2264,7 @@ def predict(
     metrics: Optional[List[str]] = None,
     stack: bool = True,  # run stacking
     stacking_cv: bool = False,  # stacking cross_validate, default(False), keep it simple
-    vote: bool = True,  # run voting
+    vote: bool = False,  # run voting
     voting: str = "hard",  # only for the classification purpose of voting
     n_top_models: int = 5,  # for stacking models
     n_models_per_category: int = 1,  # for stacking models; allow up to 2 models from the same category
@@ -2229,7 +2277,12 @@ def predict(
     cv_level: str = "l",  # "s":'low', "m":'medium', "l":"high"
     class_weight: str = "balanced",
     random_state: int = 1,
+    presets="best_quality",  # specific for autogluon
+    time_limit=600,  # specific for autogluon, in seconds
+    num_bag_folds=5,  # specific for autogluon
+    num_stack_levels=2,  # specific for autogluon
     verbose: bool = False,
+    **kwargs
 ) -> pd.DataFrame:
     """
     Three use cases: (1) internal train/test split, (2) direct prediction, (3) external validation.
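Given the new signature, a call that routes predict() through the AutoGluon branch might look like this (a sketch only; x_train, y_train, x_true, and the dir_save path are placeholders, and the earlier positional parameters are assumed from context):

    res = predict(
        x_train, y_train,
        x_true=x_true,
        purpose="classification",
        cls=["auto"],              # the gate below checks for an exact "auto" entry
        presets="best_quality",    # matched against the supported presets
        time_limit=600,            # seconds
        num_bag_folds=5,
        num_stack_levels=2,
        dir_save="./results/",
    )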
@@ -2280,28 +2333,20 @@ def predict(
         RandomForestRegressor,
         ExtraTreesClassifier,
         ExtraTreesRegressor,
+        HistGradientBoostingRegressor,
         BaggingClassifier,
         BaggingRegressor,
         AdaBoostClassifier,
         AdaBoostRegressor,
     )
-    from sklearn.svm import SVC, SVR
-    from sklearn.tree import DecisionTreeRegressor
+    from sklearn.svm import SVC, SVR, LinearSVR, NuSVR
+    from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
     from sklearn.linear_model import (
-        LogisticRegression,
-        ElasticNet,
-        ElasticNetCV,
-        LinearRegression,
-        Lasso,
-        RidgeClassifierCV,
-        Perceptron,
-        SGDClassifier,
-        RidgeCV,
-        Ridge,
-        TheilSenRegressor,
-        HuberRegressor,
-        PoissonRegressor,
-
+        LogisticRegression, ElasticNet, ElasticNetCV,
+        LinearRegression, Lasso, RidgeClassifierCV, Perceptron, SGDClassifier,
+        RidgeCV, Ridge, TheilSenRegressor, HuberRegressor, PoissonRegressor, Lars, LassoLars, BayesianRidge,
+        GammaRegressor, TweedieRegressor, LassoCV, LassoLarsCV, LarsCV,
+        OrthogonalMatchingPursuit, OrthogonalMatchingPursuitCV, PassiveAggressiveRegressor
     )
     from sklearn.compose import TransformedTargetRegressor
     from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
@@ -2318,10 +2363,16 @@ def predict(
     )
     from sklearn.preprocessing import PolynomialFeatures
     from sklearn.model_selection import train_test_split
-
+
+    from sklearn.gaussian_process import GaussianProcessRegressor
+    from sklearn.kernel_ridge import KernelRidge
+    from sklearn.dummy import DummyRegressor
+    from autogluon.tabular import TabularPredictor
     # spell-check the purpose argument
     purpose = ips.strcmp(purpose, ["classification", "regression"])[0]
     print(f"{purpose} processing...")
+
+
     # Default models or regressors if not provided
     if purpose == "classification":
         model_ = {
@@ -2374,27 +2425,44 @@ def predict(
         model_ = {
             "Random Forest": RandomForestRegressor(random_state=random_state, n_jobs=n_jobs),
             "SVM": SVR(),  # SVR (Support Vector Regression)
-            # "Lasso": Lasso(random_state=random_state),  # same as LassoCV, but alpha must be given explicitly
-            "LassoCV": LassoCV(
-                cv=cv_folds, random_state=random_state, n_jobs=n_jobs
-            ),  # LassoCV finds the best alpha automatically, so it is preferable to Lasso
+            "LassoCV": LassoCV(cv=cv_folds, random_state=random_state, n_jobs=n_jobs),  # LassoCV finds the best alpha automatically, so it is preferable to Lasso
             "Gradient Boosting": GradientBoostingRegressor(random_state=random_state),
             "XGBoost": xgb.XGBRegressor(eval_metric="rmse", random_state=random_state, n_jobs=n_jobs),
             "Linear Regression": LinearRegression(n_jobs=n_jobs),
             "AdaBoost": AdaBoostRegressor(random_state=random_state),
-            "LightGBM": lgb.LGBMRegressor(random_state=random_state, n_jobs=n_jobs,
-                force_row_wise=True  # or use force_col_wise=True if memory is a concern
-            ),
+            "LightGBM": lgb.LGBMRegressor(random_state=random_state, n_jobs=n_jobs, force_row_wise=True),  # or use force_col_wise=True if memory is a concern
             "CatBoost": cb.CatBoostRegressor(verbose=0, random_state=random_state),
             "Extra Trees": ExtraTreesRegressor(random_state=random_state, n_jobs=n_jobs),
             "Bagging": BaggingRegressor(random_state=random_state, n_jobs=n_jobs),
             "Neural Network": MLPRegressor(max_iter=500, random_state=random_state),
             "ElasticNet": ElasticNet(random_state=random_state),
-            "Ridge": Ridge(),
+            "Ridge": Ridge(random_state=random_state),
             "KNN": KNeighborsRegressor(n_jobs=n_jobs),
             "TheilSen": TheilSenRegressor(n_jobs=n_jobs),
             "Huber": HuberRegressor(),
-            "Poisson": PoissonRegressor()
+            "Poisson": PoissonRegressor(), "LinearRegression": LinearRegression(),
+            "Lasso": Lasso(random_state=random_state),
+            "Lars": Lars(),
+            "LassoLars": LassoLars(),
+            "BayesianRidge": BayesianRidge(),
+            "GammaRegressor": GammaRegressor(),
+            "TweedieRegressor": TweedieRegressor(),
+            "LassoCV": LassoCV(random_state=random_state, n_jobs=n_jobs),
+            "ElasticNetCV": ElasticNetCV(random_state=random_state, n_jobs=n_jobs),
+            "LassoLarsCV": LassoLarsCV(n_jobs=n_jobs),
+            "LarsCV": LarsCV(),
+            "OrthogonalMatchingPursuit": OrthogonalMatchingPursuit(),
+            "OrthogonalMatchingPursuitCV": OrthogonalMatchingPursuitCV(n_jobs=n_jobs),
+            "PassiveAggressiveRegressor": PassiveAggressiveRegressor(random_state=random_state),
+            "LinearSVR": LinearSVR(random_state=random_state),
+            "NuSVR": NuSVR(),
+            "DecisionTreeRegressor": DecisionTreeRegressor(random_state=random_state),
+            "ExtraTreeRegressor": ExtraTreeRegressor(random_state=random_state),
+            "HistGradientBoostingRegressor": HistGradientBoostingRegressor(random_state=random_state),
+            "GaussianProcessRegressor": GaussianProcessRegressor(),
+            "KernelRidge": KernelRidge(),
+            "DummyRegressor": DummyRegressor(),
+            "TransformedTargetRegressor": TransformedTargetRegressor(regressor=LinearRegression())
         }
         if cls is None:
             models = model_
@@ -2411,10 +2479,17 @@ def predict(
         ips.df_special_characters_cleaner(x_true) if x_true is not None else None
     )
 
+    # only keep "autogluon_tab" in models
+    cls = [cls] if isinstance(cls, str) else cls
+
+    if cls is not None:
+        models = {"autogluon_tab": None} if "auto" in cls else models
+
     # indicate cls:
     if ips.run_once_within(30):  # 10 min
         print(f"processing: {list(models.keys())}")
-    print(isinstance(y_train, str) and y_train in x_train.columns)
+    y_train_col_name = None
+    # print(isinstance(y_train, str) and y_train in x_train.columns)
     if isinstance(y_train, str) and y_train in x_train.columns:
         y_train_col_name = y_train
         y_train = x_train[y_train]
@@ -2531,19 +2606,261 @@ def predict(
     if isinstance(y_train, np.ndarray):
         y_true = ips.df_encoder(data=pd.DataFrame(y_true), method="label")
         y_true = np.asarray(y_true)
+    #! so far, got the: x_train, x_true, y_train, y_true
+    # Grid search with KFold or StratifiedKFold
+    if "autogluon_tab" in models:
+        # load hyper_param
+        f_param = os.path.dirname(os.path.abspath(__file__))
+        f_param = f_param + "/data/hyper_param_autogluon_zeroshot2024.json"
+        with open(f_param, "r") as file:
+            hyper_param_autogluon = json.load(file)
+        # Train the model with AutoGluon
+        features = x_train.columns.tolist()
+        label = y_train_col_name if y_train_col_name is not None else 'target'
+        df_autogluon = x_train.copy()
+        df_autogluon[label] = y_train
+        autogluon_presets = ["best_quality", "good_quality", "fast_train"]
+        best_clf = TabularPredictor(label=label, path=os.path.join(dir_save, "model_autogluon")).fit(
+            train_data=df_autogluon,
+            presets=ips.strcmp(presets, autogluon_presets)[0],  # 'best_quality' or 'good_quality' or 'fast_train'
+            time_limit=time_limit,  # in sec: limit training time (e.g., 3600)
+            num_bag_folds=num_bag_folds,
+            num_stack_levels=num_stack_levels,
+            hyperparameters=hyper_param_autogluon,
+            verbosity=1 if verbose else 0,
+            **kwargs
+        )
+        #! Get the leaderboard
+        gs = {}
+        # Display the leaderboard for reference
+        leaderboard = best_clf.leaderboard()
+        gs['info'] = best_clf.info()
+        # gs["res"] = best_clf
+        gs["features"] = features
+        gs["leaderboard"] = leaderboard
+        best_model_name = leaderboard.iloc[0, 0]  # first row, first column contains the model name
+        # Store the best model and its details in the gs dictionary
+        gs["best_estimator_"] = best_model_name  # store the trained model, not just the name
+        gs["best_params_"] = best_model_name  # hyperparameters
+        # Make predictions if x_true is provided
+        if x_true is not None:
+            x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
+            gs["predictions"] = best_clf.predict(x_true[features], model=None)  # model=None selects the best
+            gs["predict_proba"] = best_clf.predict_proba(x_true[features]) if purpose == 'classification' else None
+            x_true[label] = gs["predictions"]
+            if gs["predictions"].value_counts().shape[0] > 1:
+                gs['evaluate'] = best_clf.evaluate(x_true[features + [label]])
+            gs["models"] = leaderboard["model"].tolist()  # best_clf.model_names()
+            all_models = gs["models"]
+            model_evaluations = {}
+            for model in all_models:
+                predictions = best_clf.predict(x_true[features], model=model)
+                evaluation = best_clf.evaluate_predictions(
+                    y_true=x_true[label],  # true labels
+                    y_pred=predictions,  # predictions from the specific model
+                    auxiliary_metrics=True,  # include additional metrics if needed
+                )
+                model_evaluations[model] = evaluation
+            gs["scores"] = pd.DataFrame.from_dict(model_evaluations, orient='index')
+            #! try to keep the same output format as the sklearn branch
+            results = {}
+            for model in all_models:
+                y_pred = best_clf.predict(x_true[features], model=model).tolist()
+                y_pred_proba = best_clf.predict_proba(x_true[features], model=model) if purpose == 'classification' else None
+
+                if isinstance(y_pred_proba, pd.DataFrame):
+                    y_pred_proba = y_pred_proba.iloc[:, 1]
+
+                # try to make the prediction format consistent
+                try:
+                    y_pred = [i[0] for i in y_pred]
+                except:
+                    pass
+                try:
+                    y_true = [i[0] for i in y_true]
+                except:
+                    pass
+                try:
+                    y_train = [i[0] for i in y_train]
+                except:
+                    pass
+                validation_scores = {}
+                if y_true is not None and y_pred_proba is not None:
+                    validation_scores = cal_metrics(
+                        y_true,
+                        y_pred,
+                        y_pred_proba=y_pred_proba,
+                        is_binary=is_binary,
+                        purpose=purpose,
+                        average="weighted",
+                    )
+                    if is_binary:
+                        # Calculate ROC curve
+                        # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
+                        if y_pred_proba is not None:
+                            # fpr, tpr, roc_auc = dict(), dict(), dict()
+                            fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
+                            lower_ci, upper_ci = cal_auc_ci(
+                                y_true, y_pred_proba, verbose=False, is_binary=is_binary
+                            )
+                            roc_auc = auc(fpr, tpr)
+                            roc_info = {
+                                "fpr": fpr.tolist(),
+                                "tpr": tpr.tolist(),
+                                "auc": roc_auc,
+                                "ci95": (lower_ci, upper_ci),
+                            }
+                            # precision-recall curve
+                            precision_, recall_, _ = cal_precision_recall(y_true, y_pred_proba)
+                            avg_precision_ = average_precision_score(y_true, y_pred_proba)
+                            pr_info = {
+                                "precision": precision_,
+                                "recall": recall_,
+                                "avg_precision": avg_precision_,
+                            }
+                        else:
+                            roc_info, pr_info = None, None
+                        if purpose == "classification":
+                            results[model] = {
+                                # "best_clf": gs.best_estimator_,
+                                # "best_params": gs.best_params_,
+                                # "auc_indiv": [
+                                #     gs.cv_results_[f"split{i}_test_score"][gs.best_index_]
+                                #     for i in range(cv_folds)
+                                # ],
+                                "scores": validation_scores,
+                                "roc_curve": roc_info,
+                                "pr_curve": pr_info,
+                                "confusion_matrix": confusion_matrix(y_true, y_pred),
+                                "predictions": y_pred,  # .tolist(),
+                                "predictions_proba": (
+                                    y_pred_proba.tolist() if y_pred_proba is not None else None
+                                ),
+                                "features": features,
+                                # "coef": coef_,
+                                # "alphas": alphas_
+                            }
+                        else:  # "regression"
+                            results[model] = {
+                                # "best_clf": gs.best_estimator_,
+                                # "best_params": gs.best_params_,
+                                "scores": validation_scores,  # e.g., neg_MSE, R², etc.
+                                "predictions": y_pred,  # .tolist(),
+                                "predictions_proba": (
+                                    y_pred_proba.tolist() if y_pred_proba is not None else None
+                                ),
+                                "features": features,
+                                # "coef": coef_,
+                                # "alphas": alphas_
+                            }
+                    else:  # multi-classes
+                        if y_pred_proba is not None:
+                            # fpr, tpr, roc_auc = dict(), dict(), dict()
+                            # fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
+                            confidence_intervals = cal_auc_ci(
+                                y_true, y_pred_proba, verbose=False, is_binary=is_binary
+                            )
+                            roc_info = {
+                                "fpr": validation_scores["fpr"],
+                                "tpr": validation_scores["tpr"],
+                                "auc": validation_scores["roc_auc_by_class"],
+                                "ci95": confidence_intervals,
+                            }
+                            # precision-recall curve
+                            precision_, recall_, avg_precision_ = cal_precision_recall(
+                                y_true, y_pred_proba, is_binary=is_binary
+                            )
+                            pr_info = {
+                                "precision": precision_,
+                                "recall": recall_,
+                                "avg_precision": avg_precision_,
+                            }
+                        else:
+                            roc_info, pr_info = None, None
+
+                        if purpose == "classification":
+                            results[model] = {
+                                # "best_clf": gs.best_estimator_,
+                                # "best_params": gs.best_params_,
+                                # "auc_indiv": [
+                                #     gs.cv_results_[f"split{i}_test_score"][gs.best_index_]
+                                #     for i in range(cv_folds)
+                                # ],
+                                "scores": validation_scores,
+                                "roc_curve": roc_info,
+                                "pr_curve": pr_info,
+                                "confusion_matrix": confusion_matrix(y_true, y_pred),
+                                "predictions": y_pred,  # .tolist(),
+                                "predictions_proba": (
+                                    y_pred_proba.tolist() if y_pred_proba is not None else None
+                                ),
+                                "features": features,
+                                # "coef": coef_,
+                                # "alphas": alphas_
+                            }
+                        else:  # "regression"
+                            results[model] = {
+                                # "best_clf": gs.best_estimator_,
+                                # "best_params": gs.best_params_,
+                                "scores": validation_scores,  # e.g., neg_MSE, R², etc.
+                                "predictions": y_pred,  # .tolist(),
+                                "predictions_proba": (
+                                    y_pred_proba.tolist() if y_pred_proba is not None else None
+                                ),
+                                "features": features,
+                                # "coef": coef_,
+                                # "alphas": alphas_
+                            }
 
-    # Hyperparameter grids for tuning
-    param_grid_common_xgb = {
-        'learning_rate': [0.01, 0.1, 0.2, 0.3],
-        'max_depth': [3, 5, 7, 10],
-        'n_estimators': [50, 100, 200, 300],
-        'subsample': [0.6, 0.8, 1.0],
-        'colsample_bytree': [0.6, 0.8, 1.0],
-        'gamma': [0, 0.1, 0.2, 0.5],
-        'min_child_weight': [1, 5, 10],
-        'reg_alpha': [0, 0.1, 0.5, 1],  # L1 regularization term
-        'reg_lambda': [1, 1.5, 2],  # L2 regularization term
+                else:
+                    if y_true is None:
+                        validation_scores = []
+                    else:
+                        validation_scores = cal_metrics(
+                            y_true,
+                            y_pred,
+                            y_pred_proba=y_pred_proba,
+                            is_binary=is_binary,
+                            purpose=purpose,
+                            average="weighted",
+                        )
+                    results[model] = {
+                        # "best_clf": gs.best_estimator_,
+                        # "best_params": gs.best_params_,
+                        "scores": validation_scores,
+                        "predictions": y_pred,  # .tolist(),
+                        "predictions_proba": (
+                            y_pred_proba.tolist() if y_pred_proba is not None else None
+                        ),
+                        "features": features,
+                        "y_train": y_train if y_train is not None else [],
+                        "y_true": y_true if y_true is not None else [],
+                        # "coef": coef_,
+                        # "alphas": alphas_
                     }
+            df_results = pd.DataFrame.from_dict(results, orient="index")
+            gs['res'] = df_results
+
+            if all([plot_, y_true is not None, purpose == "classification"]):
+                from datetime import datetime
+
+                now_ = datetime.now().strftime("%y%m%d_%H%M%S")
+                # try:
+                if df_results.shape[0] > 3:
+                    try:
+                        plot_validate_features(df_results, is_binary=is_binary)
+                    except Exception as e:
+                        print(e)
+                else:
+                    try:
+                        plot_validate_features_single(df_results, is_binary=is_binary)
+                    except Exception as e:
+                        print(e)
+                if dir_save:
+                    ips.figsave(dir_save + f"validate_features{now_}.pdf")
+        return gs
+
+    #! cross_valid
     if cv_level in ["low", "simple", "s", "l"]:
         param_grids = {
             "Random Forest": (
@@ -2712,7 +3029,73 @@ def predict(
                 'alpha': [0.1],
                 'max_iter': [100],},
             "Poisson":{'alpha': [0.1],
-                'max_iter': [100],}
+                'max_iter': [100],},
+            "Lars": {"n_nonzero_coefs": [10, 50, None]},
+            "LassoLars": {
+                "alpha": [0.01, 0.1, 1]
+            },
+            "BayesianRidge": {
+                "alpha_1": [1e-6, 1e-4, 1e-2],
+                "lambda_1": [1e-6, 1e-4, 1e-2]
+            },
+            "GammaRegressor": {
+                "alpha": [0.1, 1, 10]
+            },
+            "TweedieRegressor": {
+                "alpha": [0.1, 1, 10],
+                "power": [1, 1.5, 2]
+            },
+            "LassoCV": {
+                "cv": [5]
+            },
+            "ElasticNetCV": {
+                "l1_ratio": [0.2, 0.5, 0.8],
+                "cv": [5]
+            },
+            "LassoLarsCV": {
+                "cv": [5]
+            },
+            "LarsCV": {
+                "cv": [5]
+            },
+            "OrthogonalMatchingPursuit": {
+                "n_nonzero_coefs": [10, 50, None]
+            },
+            "OrthogonalMatchingPursuitCV": {
+                "cv": [5]
+            },
+            "PassiveAggressiveRegressor": {
+                "C": [0.1, 1, 10]
+            },
+            "LinearSVR": {
+                "C": [0.1, 1, 10]
+            },
+            "NuSVR": {
+                "C": [0.1, 1, 10]
+            },
+            "DecisionTreeRegressor": {
+                "max_depth": [5, 10, None]
+            },
+            "ExtraTreeRegressor": {
+                "max_depth": [5, 10, None]
+            },
+            "HistGradientBoostingRegressor": {
+                "learning_rate": [0.05, 0.1, 0.2],
+                "max_depth": [5, 10, None]
+            },
+            "GaussianProcessRegressor": {
+                "alpha": [1e-5, 1e-2, 0.1]
+            },
+            "KernelRidge": {
+                "alpha": [0.1, 1, 10],
+                "kernel": ["linear", "rbf"]
+            },
+            "DummyRegressor": {
+                "strategy": ["mean", "median"]
+            },
+            "TransformedTargetRegressor": {
+                "regressor__fit_intercept": [True, False]
+            }
         }
     elif cv_level in ["high", "advanced", "h"]:
         param_grids = {
@@ -2917,7 +3300,96 @@ def predict(
                 'alpha': [0.1, 1.0, 10.0],
                 'max_iter': [100, 200, 300],},
             "Poisson":{'alpha': [0.1, 1.0, 10.0],
-                'max_iter': [100, 200, 300],}
+                'max_iter': [100, 200, 300],},
+            "Lars": {
+                "n_nonzero_coefs": [10, 50, 100, 200, None]
+            },
+            "LassoLars": {
+                "alpha": [0.001, 0.01, 0.1, 1, 10]
+            },
+            "BayesianRidge": {
+                "alpha_1": [1e-6, 1e-5, 1e-4],
+                "alpha_2": [1e-6, 1e-5, 1e-4],
+                "lambda_1": [1e-6, 1e-5, 1e-4],
+                "lambda_2": [1e-6, 1e-5, 1e-4]
+            },
+            "GammaRegressor": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "max_iter": [1000, 5000, 10000]
+            },
+            "TweedieRegressor": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "power": [0, 1, 1.5, 2, 3]
+            },
+            "LassoCV": {
+                "alphas": [[0.001, 0.01, 0.1, 1, 10]],
+                "cv": [3, 5, 10]
+            },
+            "ElasticNetCV": {
+                "l1_ratio": [0.1, 0.5, 0.7, 0.9, 1],
+                "alphas": [[0.001, 0.01, 0.1, 1, 10]],
+                "cv": [3, 5, 10]
+            },
+            "LassoLarsCV": {
+                "cv": [3, 5, 10]
+            },
+            "LarsCV": {
+                "cv": [3, 5, 10]
+            },
+            "OrthogonalMatchingPursuit": {
+                "n_nonzero_coefs": [10, 50, 100, 200, None]
+            },
+            "OrthogonalMatchingPursuitCV": {
+                "cv": [3, 5, 10]
+            },
+            "PassiveAggressiveRegressor": {
+                "C": [0.01, 0.1, 1, 10],
+                "max_iter": [1000, 5000, 10000],
+                "early_stopping": [True, False]
+            },
+            "LinearSVR": {
+                "C": [0.01, 0.1, 1, 10],
+                "epsilon": [0.01, 0.1, 1],
+                "max_iter": [1000, 5000, 10000]
+            },
+            "NuSVR": {
+                "C": [0.01, 0.1, 1, 10],
+                "nu": [0.25, 0.5, 0.75],
+                "kernel": ["linear", "poly", "rbf", "sigmoid"]
+            },
+            "DecisionTreeRegressor": {
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_split": [2, 5, 10],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "ExtraTreeRegressor": {
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_split": [2, 5, 10],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "HistGradientBoostingRegressor": {
+                "learning_rate": [0.01, 0.1, 0.2],
+                "max_iter": [100, 500, 1000],
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "GaussianProcessRegressor": {
+                "alpha": [1e-10, 1e-5, 1e-2, 0.1],
+                "n_restarts_optimizer": [0, 1, 5, 10]
+            },
+            "KernelRidge": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "kernel": ["linear", "poly", "rbf", "sigmoid"],
+                "degree": [2, 3, 4]
+            },
+            "DummyRegressor": {
+                "strategy": ["mean", "median", "constant"],
+                "constant": [0]  # only if strategy is 'constant'
+            },
+            "TransformedTargetRegressor": {
+                # grid for the underlying regressor, example shown for LinearRegression
+                "regressor__fit_intercept": [True, False]
+            }
         }
     else:  # median level
         param_grids = {
@@ -3164,7 +3636,96 @@ def predict(
                 'alpha': [0.1, 1.0],
                 'max_iter': [100, 200],},
             "Poisson":{'alpha': [0.1, 1.0],
-                'max_iter': [100, 200],}
+                'max_iter': [100, 200],},
+            "Lars": {
+                "n_nonzero_coefs": [10, 50, 100, 200, None]
+            },
+            "LassoLars": {
+                "alpha": [0.001, 0.01, 0.1, 1, 10]
+            },
+            "BayesianRidge": {
+                "alpha_1": [1e-6, 1e-5, 1e-4],
+                "alpha_2": [1e-6, 1e-5, 1e-4],
+                "lambda_1": [1e-6, 1e-5, 1e-4],
+                "lambda_2": [1e-6, 1e-5, 1e-4]
+            },
+            "GammaRegressor": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "max_iter": [1000, 5000, 10000]
+            },
+            "TweedieRegressor": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "power": [0, 1, 1.5, 2, 3]
+            },
+            "LassoCV": {
+                "alphas": [[0.001, 0.01, 0.1, 1, 10]],
+                "cv": [3, 5, 10]
+            },
+            "ElasticNetCV": {
+                "l1_ratio": [0.1, 0.5, 0.7, 0.9, 1],
+                "alphas": [[0.001, 0.01, 0.1, 1, 10]],
+                "cv": [3, 5, 10]
+            },
+            "LassoLarsCV": {
+                "cv": [3, 5, 10]
+            },
+            "LarsCV": {
+                "cv": [3, 5, 10]
+            },
+            "OrthogonalMatchingPursuit": {
+                "n_nonzero_coefs": [10, 50, 100, 200, None]
+            },
+            "OrthogonalMatchingPursuitCV": {
+                "cv": [3, 5, 10]
+            },
+            "PassiveAggressiveRegressor": {
+                "C": [0.01, 0.1, 1, 10],
+                "max_iter": [1000, 5000, 10000],
+                "early_stopping": [True, False]
+            },
+            "LinearSVR": {
+                "C": [0.01, 0.1, 1, 10],
+                "epsilon": [0.01, 0.1, 1],
+                "max_iter": [1000, 5000, 10000]
+            },
+            "NuSVR": {
+                "C": [0.01, 0.1, 1, 10],
+                "nu": [0.25, 0.5, 0.75],
+                "kernel": ["linear", "poly", "rbf", "sigmoid"]
+            },
+            "DecisionTreeRegressor": {
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_split": [2, 5, 10],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "ExtraTreeRegressor": {
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_split": [2, 5, 10],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "HistGradientBoostingRegressor": {
+                "learning_rate": [0.01, 0.1, 0.2],
+                "max_iter": [100, 500, 1000],
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "GaussianProcessRegressor": {
+                "alpha": [1e-10, 1e-5, 1e-2, 0.1],
+                "n_restarts_optimizer": [0, 1, 5, 10]
+            },
+            "KernelRidge": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "kernel": ["linear", "poly", "rbf", "sigmoid"],
+                "degree": [2, 3, 4]
+            },
+            "DummyRegressor": {
+                "strategy": ["mean", "median", "constant"],
+                "constant": [0]  # only if strategy is 'constant'
+            },
+            "TransformedTargetRegressor": {
+                # grid for the underlying regressor, example shown for LinearRegression
+                "regressor__fit_intercept": [True, False]
+            }
         }
 
     results = {}
@@ -3174,7 +3735,7 @@ def predict(
         if purpose == "classification"
         else KFold(n_splits=cv_folds, shuffle=True, random_state=random_state)
     )
-
+
     # Train and validate each model
     for name, clf in tqdm(
         models.items(),
@@ -3185,8 +3746,7 @@ def predict(
         if verbose:
             print(f"\nTraining and validating {name}:")
         try:
-            # Grid search with KFold or StratifiedKFold
-            if is_binary:
+            if is_binary:
                 gs = GridSearchCV(
                     clf,
                     param_grid=param_grids.get(name, {}),
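Note that each model's grid is fetched with param_grids.get(name, {}), so a model without an entry still runs through GridSearchCV on its default parameters (an empty grid fits a single candidate). A stripped-down sketch of that pattern (the model dict and grids here are illustrative):

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV, StratifiedKFold

    X, y = make_classification(n_samples=200, random_state=1)
    models = {
        "Random Forest": RandomForestClassifier(random_state=1),
        "Logistic Regression": LogisticRegression(max_iter=1000),
    }
    param_grids = {"Random Forest": {"n_estimators": [50, 100]}}  # no grid for LogReg

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    for name, clf in models.items():
        # an empty dict is a valid param_grid: one candidate, default params
        gs = GridSearchCV(clf, param_grid=param_grids.get(name, {}), cv=cv, scoring="roc_auc")
        gs.fit(X, y)
        print(name, round(gs.best_score_, 3))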
@@ -3202,6 +3762,7 @@ def predict(
 
                 gs.fit(x_train, y_train)
                 best_clf = gs.best_estimator_
+
                 # make sure x_train and x_true have the same column names
                 x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
                 y_pred = best_clf.predict(x_true)
@@ -3212,7 +3773,14 @@ def predict(
                         y_pred_proba = np.hstack(
                             [1 - y_pred_proba, y_pred_proba]
                         )  # add missing class probabilities
-                    y_pred_proba = y_pred_proba[:, 1]
+                    if y_pred_proba.shape[1] == 2:
+                        if isinstance(y_pred_proba, pd.DataFrame):
+                            y_pred_proba = y_pred_proba.iloc[:, 1]
+                        elif isinstance(y_pred_proba, pd.Series):
+                            y_pred_proba = y_pred_proba.values[:, 1]
+                        else:
+                            y_pred_proba = y_pred_proba[:, 1]
+
                 elif hasattr(best_clf, "decision_function"):
                     # if predict_proba is not available, use decision_function (e.g., for SVM)
                     y_pred_proba = best_clf.decision_function(x_true)
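The replacement above normalizes predict_proba output to the positive-class column whether it arrives as an ndarray or a DataFrame. The same normalization as a standalone helper (the function name is illustrative):

    import numpy as np
    import pandas as pd

    def positive_class_proba(y_pred_proba):
        # reduce an (n_samples, 2) probability table to the positive-class column
        if isinstance(y_pred_proba, pd.DataFrame):
            return y_pred_proba.iloc[:, 1].to_numpy()
        arr = np.asarray(y_pred_proba)
        if arr.ndim == 2 and arr.shape[1] == 2:
            return arr[:, 1]
        return arr  # already 1-D (e.g. decision_function scores)

    proba = np.array([[0.8, 0.2], [0.3, 0.7]])
    print(positive_class_proba(proba))  # [0.2 0.7]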
@@ -3233,7 +3801,7 @@ def predict(
                 else:
                     alphas_ = None
                 coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
-            else:
+            else:
                 gs = GridSearchCV(
                     clf,
                     param_grid=param_grids.get(name, {}),
@@ -3289,6 +3857,7 @@ def predict(
                 alphas_, coef_ = None, None
                 print(f"skipped {clf}: {e}")
                 continue
+
             # try to make the prediction format consistent
             try:
                 y_pred = [i[0] for i in y_pred]
@@ -3460,7 +4029,8 @@ def predict(
 
     # Convert results to DataFrame
     df_results = pd.DataFrame.from_dict(results, orient="index")
-    # sort
+    display(df_results)
+    # sort
     if y_true is not None:
         if purpose == "classification":
             df_scores = pd.DataFrame(
@@ -4028,6 +4598,8 @@ def predict(
             df_res = pd.concat([df_vote, df_results], ignore_index=False, axis=0)
         elif stack:
             df_res = pd.concat([df_pred, df_results], ignore_index=False, axis=0)
+        else:
+            df_res = df_results
 
         if all([plot_, y_true is not None, purpose == "classification"]):
             from datetime import datetime