py2ls 0.2.4.25__py3-none-any.whl → 0.2.4.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/ml2ls.py CHANGED
@@ -31,6 +31,7 @@ from sklearn.metrics import (
     average_precision_score,
 )
 from typing import Dict, Any, Optional, List, Union
+import os, json
 import numpy as np
 import pandas as pd
 from . import ips
@@ -49,7 +50,13 @@ logger = logging.getLogger()
 warnings.filterwarnings("ignore", category=UserWarning)
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.neighbors import KNeighborsClassifier
-
+#* set random_state global
+import torch
+import random
+random_state = 1
+random.seed(random_state)
+np.random.seed(random_state)
+torch.manual_seed(random_state)
 
 def features_knn(
     x_train: pd.DataFrame, y_train: pd.Series, knn_params: dict
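
The hunk above pins random, numpy, and torch to one global seed at module import time. A minimal sketch of the same idea packaged as a reusable helper (set_seed is a hypothetical name, not a py2ls function):

import random

import numpy as np
import torch


def set_seed(seed: int = 1) -> None:
    """Seed the RNGs used in this module (random, numpy, torch) in one call."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # If CUDA is available, seed it too so GPU runs are repeatable.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


set_seed(1)

A helper like this avoids import-order surprises: callers can re-seed explicitly instead of relying on module import side effects.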
@@ -594,7 +601,7 @@ def get_features(
     """
     from sklearn.compose import ColumnTransformer
     from sklearn.preprocessing import StandardScaler, OneHotEncoder
-
+    from sklearn.model_selection import train_test_split
     # Ensure X and y are DataFrames/Series for consistency
     if isinstance(X, np.ndarray):
         X = pd.DataFrame(X)
@@ -922,10 +929,26 @@ def get_features(
         "feature_importances": feature_importances,
     }
     if all([plot_, dir_save]):
+
         from datetime import datetime
-
         now_ = datetime.now().strftime("%y%m%d_%H%M%S")
         ips.figsave(dir_save + f"features{now_}.pdf")
+
+        lists = []
+        for tp in ips.flatten(features_df["type"]):
+            lists.append(
+                features_df
+                .loc[features_df["type"] == tp, "feature"]
+                .tolist()
+            )
+        labels = ips.flatten(features_df["type"])
+        # current_fig = plt.gcf()
+        # # ax = current_fig.add_subplot(3, 2, 6)
+        # gs = current_fig.add_gridspec(3, 2)
+        # ax = current_fig.add_subplot(gs[:, :])
+        plt.figure(figsize=[6, 6])
+        plot.venn(lists, labels, cmap="coolwarm")
+        ips.figsave(dir_save + f"features{now_}shared_features.pdf")
 else:
     results = {
         "selected_features": pd.DataFrame(),
@@ -1247,22 +1270,25 @@ def plot_validate_features(res_val, is_binary=True, figsize=None):
     nexttile = plot.subplot(figsize=figsize)
     ax = nexttile(subplot_layout[0], subplot_layout[1])
     for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-        fpr = res_val["roc_curve"][model_name]["fpr"]
-        tpr = res_val["roc_curve"][model_name]["tpr"]
-        (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
-        mean_auc = res_val["roc_curve"][model_name]["auc"]
-        plot_roc_curve(
-            fpr,
-            tpr,
-            mean_auc,
-            lower_ci,
-            upper_ci,
-            model_name=model_name,
-            lw=1.5,
-            color=colors[i],
-            alpha=alpha,
-            ax=ax,
-        )
+        try:
+            fpr = res_val["roc_curve"][model_name]["fpr"]
+            tpr = res_val["roc_curve"][model_name]["tpr"]
+            (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
+            mean_auc = res_val["roc_curve"][model_name]["auc"]
+            plot_roc_curve(
+                fpr,
+                tpr,
+                mean_auc,
+                lower_ci,
+                upper_ci,
+                model_name=model_name,
+                lw=1.5,
+                color=colors[i],
+                alpha=alpha,
+                ax=ax,
+            )
+        except Exception as e:
+            print(e)
     plot.figsets(
         sp=2,
         legend=dict(
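
This release wraps every curve-plotting loop in the same try/except-and-print guard so one malformed model entry cannot abort the whole figure. A sketch of that pattern factored into a helper (plot_safely is a hypothetical name, shown only to illustrate the hardening):

from typing import Any, Callable


def plot_safely(fn: Callable[..., Any], *args: Any, **kwargs: Any) -> None:
    """Call a plotting function, but log and continue on failure,
    so the remaining panels still get drawn."""
    try:
        fn(*args, **kwargs)
    except Exception as e:  # intentionally broad, mirroring the diff
        print(f"{getattr(fn, '__name__', fn)} skipped: {e}")


# usage: plot_safely(plot_roc_curve, fpr, tpr, mean_auc, ax=ax)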
@@ -1277,16 +1303,19 @@ def plot_validate_features(res_val, is_binary=True, figsize=None):
 
     ax = nexttile(subplot_layout[0], subplot_layout[1])
     for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-        plot_pr_curve(
-            recall=res_val["pr_curve"][model_name]["recall"],
-            precision=res_val["pr_curve"][model_name]["precision"],
-            avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
-            model_name=model_name,
-            color=colors[i],
-            lw=1.5,
-            alpha=alpha,
-            ax=ax,
-        )
+        try:
+            plot_pr_curve(
+                recall=res_val["pr_curve"][model_name]["recall"],
+                precision=res_val["pr_curve"][model_name]["precision"],
+                avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
+                model_name=model_name,
+                color=colors[i],
+                lw=1.5,
+                alpha=alpha,
+                ax=ax,
+            )
+        except Exception as e:
+            print(e)
     plot.figsets(
         sp=2,
         legend=dict(
@@ -1314,22 +1343,25 @@ def plot_validate_features(res_val, is_binary=True, figsize=None):
     for iclass, class_ in enumerate(classes):
         ax = nexttile(subplot_layout[0], subplot_layout[1])
         for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-            fpr = res_val["roc_curve"][model_name]["fpr"][class_]
-            tpr = res_val["roc_curve"][model_name]["tpr"][class_]
-            (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"][iclass]
-            mean_auc = res_val["roc_curve"][model_name]["auc"][class_]
-            plot_roc_curve(
-                fpr,
-                tpr,
-                mean_auc,
-                lower_ci,
-                upper_ci,
-                model_name=model_name,
-                lw=1.5,
-                color=colors[i],
-                alpha=alpha,
-                ax=ax,
-            )
+            try:
+                fpr = res_val["roc_curve"][model_name]["fpr"][class_]
+                tpr = res_val["roc_curve"][model_name]["tpr"][class_]
+                (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"][iclass]
+                mean_auc = res_val["roc_curve"][model_name]["auc"][class_]
+                plot_roc_curve(
+                    fpr,
+                    tpr,
+                    mean_auc,
+                    lower_ci,
+                    upper_ci,
+                    model_name=model_name,
+                    lw=1.5,
+                    color=colors[i],
+                    alpha=alpha,
+                    ax=ax,
+                )
+            except Exception as e:
+                print(e)
         plot.figsets(
             sp=2,
             title=class_,
@@ -1345,18 +1377,21 @@ def plot_validate_features(res_val, is_binary=True, figsize=None):
 
         ax = nexttile(subplot_layout[0], subplot_layout[1])
         for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-            plot_pr_curve(
-                recall=res_val["pr_curve"][model_name]["recall"][iclass],
-                precision=res_val["pr_curve"][model_name]["precision"][iclass],
-                avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
-                    iclass
-                ],
-                model_name=model_name,
-                color=colors[i],
-                lw=1.5,
-                alpha=alpha,
-                ax=ax,
-            )
+            try:
+                plot_pr_curve(
+                    recall=res_val["pr_curve"][model_name]["recall"][iclass],
+                    precision=res_val["pr_curve"][model_name]["precision"][iclass],
+                    avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
+                        iclass
+                    ],
+                    model_name=model_name,
+                    color=colors[i],
+                    lw=1.5,
+                    alpha=alpha,
+                    ax=ax,
+                )
+            except Exception as e:
+                print(e)
         plot.figsets(
             sp=2,
             title=class_,
@@ -1379,37 +1414,41 @@ def plot_validate_features_single(res_val, figsize=None, is_binary=True):
            len(ips.flatten(res_val["pr_curve"].index)), 3, figsize=figsize
        )
        for model_name in ips.flatten(res_val["pr_curve"].index):
-           fpr = res_val["roc_curve"][model_name]["fpr"]
-           tpr = res_val["roc_curve"][model_name]["tpr"]
-           (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
-           mean_auc = res_val["roc_curve"][model_name]["auc"]
-
-           # Plotting
-           plot_roc_curve(
-               fpr,
-               tpr,
-               mean_auc,
-               lower_ci,
-               upper_ci,
-               model_name=model_name,
-               ax=nexttile(),
-           )
-           plot.figsets(title=model_name, sp=2)
+           try:
+               fpr = res_val["roc_curve"][model_name]["fpr"]
+               tpr = res_val["roc_curve"][model_name]["tpr"]
+               (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
+               mean_auc = res_val["roc_curve"][model_name]["auc"]
 
-           plot_pr_binary(
-               recall=res_val["pr_curve"][model_name]["recall"],
-               precision=res_val["pr_curve"][model_name]["precision"],
-               avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
-               model_name=model_name,
-               ax=nexttile(),
-           )
-           plot.figsets(title=model_name, sp=2)
+               # Plotting
+               plot_roc_curve(
+                   fpr,
+                   tpr,
+                   mean_auc,
+                   lower_ci,
+                   upper_ci,
+                   model_name=model_name,
+                   ax=nexttile(),
+               )
+               plot.figsets(title=model_name, sp=2)
 
-           # plot cm
-           plot_cm(
-               res_val["confusion_matrix"][model_name], ax=nexttile(), normalize=False
-           )
-           plot.figsets(title=model_name, sp=2)
+               plot_pr_binary(
+                   recall=res_val["pr_curve"][model_name]["recall"],
+                   precision=res_val["pr_curve"][model_name]["precision"],
+                   avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
+                   model_name=model_name,
+                   ax=nexttile(),
+               )
+               plot.figsets(title=model_name, sp=2)
+
+               # plot cm
+               plot_cm(
+                   res_val["confusion_matrix"][model_name], ax=nexttile(), normalize=False
+               )
+               plot.figsets(title=model_name, sp=2)
+
+           except Exception as e:
+               print(e)
    else:
 
        modname_tmp = ips.flatten(res_val["roc_curve"].index)[0]
@@ -1424,22 +1463,25 @@ def plot_validate_features_single(res_val, figsize=None, is_binary=True):
     for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
         ax = nexttile()
         for iclass, class_ in enumerate(classes):
-            fpr = res_val["roc_curve"][model_name]["fpr"][class_]
-            tpr = res_val["roc_curve"][model_name]["tpr"][class_]
-            (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"][iclass]
-            mean_auc = res_val["roc_curve"][model_name]["auc"][class_]
-            plot_roc_curve(
-                fpr,
-                tpr,
-                mean_auc,
-                lower_ci,
-                upper_ci,
-                model_name=class_,
-                lw=1.5,
-                color=colors[iclass],
-                alpha=0.03,
-                ax=ax,
-            )
+            try:
+                fpr = res_val["roc_curve"][model_name]["fpr"][class_]
+                tpr = res_val["roc_curve"][model_name]["tpr"][class_]
+                (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"][iclass]
+                mean_auc = res_val["roc_curve"][model_name]["auc"][class_]
+                plot_roc_curve(
+                    fpr,
+                    tpr,
+                    mean_auc,
+                    lower_ci,
+                    upper_ci,
+                    model_name=class_,
+                    lw=1.5,
+                    color=colors[iclass],
+                    alpha=0.03,
+                    ax=ax,
+                )
+            except Exception as e:
+                print(e)
         plot.figsets(
             sp=2,
             title=model_name,
@@ -1451,18 +1493,21 @@ def plot_validate_features_single(res_val, figsize=None, is_binary=True):
 
         ax = nexttile()
         for iclass, class_ in enumerate(classes):
-            plot_pr_curve(
-                recall=res_val["pr_curve"][model_name]["recall"][iclass],
-                precision=res_val["pr_curve"][model_name]["precision"][iclass],
-                avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
-                    iclass
-                ],
-                model_name=class_,
-                color=colors[iclass],
-                lw=1.5,
-                alpha=0.03,
-                ax=ax,
-            )
+            try:
+                plot_pr_curve(
+                    recall=res_val["pr_curve"][model_name]["recall"][iclass],
+                    precision=res_val["pr_curve"][model_name]["precision"][iclass],
+                    avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
+                        iclass
+                    ],
+                    model_name=class_,
+                    color=colors[iclass],
+                    lw=1.5,
+                    alpha=0.03,
+                    ax=ax,
+                )
+            except Exception as e:
+                print(e)
         plot.figsets(
             sp=2,
             title=class_,
@@ -1543,15 +1588,12 @@ def cal_auc_ci(
         # print("Bootstrap #{} ROC area: {:0.3f}".format(i + 1, score))
     sorted_scores = np.array(bootstrapped_scores)
     sorted_scores.sort()
-
-    # Computing the lower and upper bound of the 90% confidence interval
-    # You can change the bounds percentiles to 0.025 and 0.975 to get
-    # a 95% confidence interval instead.
+
     confidence_lower = sorted_scores[int((1 - ci) * len(sorted_scores))]
     confidence_upper = sorted_scores[int(ci * len(sorted_scores))]
     if verbose:
         print(
-            "Confidence interval for the score: [{:0.3f} - {:0.3}]".format(
+            "Confidence interval for the score: [{:0.3f} - {:0.3f}]".format(
                 confidence_lower, confidence_upper
             )
         )
@@ -1568,11 +1610,8 @@ def cal_auc_ci(
             y_true, classes=np.unique(y_true)
         )  # One-vs-Rest transformation
         n_classes = y_true_bin.shape[1]  # Number of classes
-
-        bootstrapped_scores = np.zeros(
-            (n_classes, n_bootstraps)
-        )  # Store scores for each class
-
+
+        bootstrapped_scores = np.full((n_classes, n_bootstraps), np.nan)
         if verbose:
             print("AUROC scores for each class:")
         for i in range(n_classes):
@@ -1592,15 +1631,24 @@ def cal_auc_ci(
         # Calculating the confidence intervals for each class
         confidence_intervals = []
         for class_idx in range(n_classes):
-            sorted_scores = np.sort(bootstrapped_scores[class_idx])
-            confidence_lower = sorted_scores[int((1 - ci) * len(sorted_scores))]
-            confidence_upper = sorted_scores[int(ci * len(sorted_scores))]
-            confidence_intervals.append((confidence_lower, confidence_upper))
-
-            if verbose:
-                print(
-                    f"Class {class_idx} - Confidence interval: [{confidence_lower:.3f} - {confidence_upper:.3f}]"
-                )
+            # drop NaN scores left by failed bootstrap rounds
+            valid_scores = bootstrapped_scores[class_idx][
+                ~np.isnan(bootstrapped_scores[class_idx])
+            ]
+            if len(valid_scores) > 0:
+                sorted_scores = np.sort(valid_scores)
+                confidence_lower = sorted_scores[int((1 - ci) * len(sorted_scores))]
+                confidence_upper = sorted_scores[int(ci * len(sorted_scores))]
+                # append, not index-assign: the list starts empty
+                confidence_intervals.append((confidence_lower, confidence_upper))
+
+                if verbose:
+                    print(
+                        f"Class {class_idx} - Confidence interval: [{confidence_lower:.3f} - {confidence_upper:.3f}]"
+                    )
+            else:
+                confidence_intervals.append((np.nan, np.nan))
+                if verbose:
+                    print(f"Class {class_idx} - Confidence interval: [NaN - NaN]")
 
     return confidence_intervals
 
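The comment that the binary branch deletes in the first cal_auc_ci hunk had a point: for a 95% interval the bounds should come from the 2.5th and 97.5th percentiles, whereas `sorted_scores[int((1 - ci) * n)]` with ci=0.95 takes the 5th and 95th. A self-contained percentile-bootstrap sketch using only standard sklearn/numpy APIs (not py2ls code):

import numpy as np
from sklearn.metrics import roc_auc_score


def bootstrap_auc_ci(y_true, y_score, n_bootstraps=1000, ci=0.95, seed=1):
    """Percentile bootstrap confidence interval for a binary AUROC."""
    rng = np.random.default_rng(seed)
    y_true, y_score = np.asarray(y_true), np.asarray(y_score)
    scores = []
    for _ in range(n_bootstraps):
        idx = rng.integers(0, len(y_true), len(y_true))
        if len(np.unique(y_true[idx])) < 2:
            continue  # a resample with a single class has no defined AUC
        scores.append(roc_auc_score(y_true[idx], y_score[idx]))
    lo, hi = np.percentile(scores, [(1 - ci) / 2 * 100, (1 + ci) / 2 * 100])
    return lo, hi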
@@ -2057,20 +2105,20 @@ def rank_models(
 
     def generate_bar_plot(ax, cv_test_scores):
         ax = plot.plotxy(
-            y="Classifier", x="combined_score", data=cv_test_scores, kind="bar"
+            y="Classifier", x="combined_score", data=cv_test_scores, kind_="bar"
         )
         plt.title("Classifier Performance")
         plt.tight_layout()
         return plt
 
-    nexttile = plot.subplot(2, 2, figsize=[10, 7])
+    nexttile = plot.subplot(2, 2, figsize=[10, 10])
     generate_bar_plot(nexttile(), top_models.dropna())
     plot.radar(
         ax=nexttile(projection="polar"),
         data=cv_test_scores.set_index("Classifier"),
-        ylim=[0.5, 1],
-        color=plot.get_color(10),
-        alpha=0.05,
+        ylim=[0, 1],
+        color=plot.get_color(cv_test_scores.set_index("Classifier").shape[1]),
+        alpha=0.02,
         circular=1,
     )
     return cv_test_scores
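
rank_models now sizes the radar palette to the number of score columns instead of a fixed 10 and widens ylim to [0, 1]. For orientation, a bare-matplotlib sketch of the kind of polar radar chart plot.radar presumably draws (the data below is toy data, not py2ls output):

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Toy scores: rows = classifiers, columns = metrics in [0, 1].
scores = pd.DataFrame(
    {"roc_auc": [0.91, 0.87], "f1": [0.84, 0.80], "recall": [0.78, 0.83]},
    index=["Random Forest", "SVM"],
)

angles = np.linspace(0, 2 * np.pi, len(scores.columns), endpoint=False)
angles = np.concatenate([angles, angles[:1]])  # close the polygon

ax = plt.subplot(projection="polar")
for name, row in scores.iterrows():
    vals = np.concatenate([row.values, row.values[:1]])
    ax.plot(angles, vals, label=name)
    ax.fill(angles, vals, alpha=0.1)
ax.set_xticks(angles[:-1])
ax.set_xticklabels(scores.columns)
ax.set_ylim(0, 1)
ax.legend(loc="lower right")
plt.show()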
@@ -2216,7 +2264,7 @@ def predict(
     metrics: Optional[List[str]] = None,
     stack: bool = True,  # run stacking
     stacking_cv: bool = False,  # stacking cross_validate, default (False), keep it simple
-    vote: bool = True,  # run voting
+    vote: bool = False,  # run voting
     voting: str = "hard",  # only for the classification purpose of voting
     n_top_models: int = 5,  # for stacking models
     n_models_per_category: int = 1,  # for stacking models; up to 2 models from the same category may be allowed
@@ -2229,7 +2277,12 @@ def predict(
     cv_level: str = "l",  # "s"/"l": low, "m": medium, "h": high (see the cv_level branches below)
     class_weight: str = "balanced",
     random_state: int = 1,
+    presets="best_quality",  # specific for autogluon
+    time_limit=600,  # specific for autogluon
+    num_bag_folds=5,  # specific for autogluon
+    num_stack_levels=2,  # specific for autogluon
     verbose: bool = False,
+    **kwargs
 ) -> pd.DataFrame:
     """
     Case 1: internal train/test split; case 2: direct prediction; case 3: external validation.
@@ -2280,28 +2333,20 @@ def predict(
         RandomForestRegressor,
         ExtraTreesClassifier,
         ExtraTreesRegressor,
+        HistGradientBoostingRegressor,
         BaggingClassifier,
         BaggingRegressor,
         AdaBoostClassifier,
         AdaBoostRegressor,
     )
-    from sklearn.svm import SVC, SVR
-    from sklearn.tree import DecisionTreeRegressor
+    from sklearn.svm import SVC, SVR, LinearSVR, NuSVR
+    from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
     from sklearn.linear_model import (
-        LogisticRegression,
-        ElasticNet,
-        ElasticNetCV,
-        LinearRegression,
-        Lasso,
-        RidgeClassifierCV,
-        Perceptron,
-        SGDClassifier,
-        RidgeCV,
-        Ridge,
-        TheilSenRegressor,
-        HuberRegressor,
-        PoissonRegressor,
-
+        LogisticRegression, ElasticNet, ElasticNetCV,
+        LinearRegression, Lasso, RidgeClassifierCV, Perceptron, SGDClassifier,
+        RidgeCV, Ridge, TheilSenRegressor, HuberRegressor, PoissonRegressor, Lars, LassoLars, BayesianRidge,
+        GammaRegressor, TweedieRegressor, LassoCV, LassoLarsCV, LarsCV,
+        OrthogonalMatchingPursuit, OrthogonalMatchingPursuitCV, PassiveAggressiveRegressor
     )
     from sklearn.compose import TransformedTargetRegressor
     from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
@@ -2318,10 +2363,16 @@ def predict(
     )
     from sklearn.preprocessing import PolynomialFeatures
     from sklearn.model_selection import train_test_split
-
+
+    from sklearn.gaussian_process import GaussianProcessRegressor
+    from sklearn.kernel_ridge import KernelRidge
+    from sklearn.dummy import DummyRegressor
+    from autogluon.tabular import TabularPredictor
     # spell-check the requested purpose
     purpose = ips.strcmp(purpose, ["classification", "regression"])[0]
     print(f"{purpose} processing...")
+
+
     # Default models or regressors if not provided
     if purpose == "classification":
         model_ = {
@@ -2374,27 +2425,44 @@ def predict(
         model_ = {
             "Random Forest": RandomForestRegressor(random_state=random_state, n_jobs=n_jobs),
             "SVM": SVR(),  # SVR (Support Vector Regression)
-            # "Lasso": Lasso(random_state=random_state),  # same as LassoCV, except alpha must be supplied explicitly,
-            "LassoCV": LassoCV(
-                cv=cv_folds, random_state=random_state, n_jobs=n_jobs
-            ),  # LassoCV finds the best alpha automatically, preferable to plain Lasso
+            "LassoCV": LassoCV(cv=cv_folds, random_state=random_state, n_jobs=n_jobs),  # LassoCV finds the best alpha automatically, preferable to plain Lasso
             "Gradient Boosting": GradientBoostingRegressor(random_state=random_state),
             "XGBoost": xgb.XGBRegressor(eval_metric="rmse", random_state=random_state, n_jobs=n_jobs),
             "Linear Regression": LinearRegression(n_jobs=n_jobs),
             "AdaBoost": AdaBoostRegressor(random_state=random_state),
-            "LightGBM": lgb.LGBMRegressor(random_state=random_state, n_jobs=n_jobs,
-                force_row_wise=True  # Or use force_col_wise=True if memory is a concern
-            ),
+            "LightGBM": lgb.LGBMRegressor(random_state=random_state, n_jobs=n_jobs, force_row_wise=True),  # Or use force_col_wise=True if memory is a concern
             "CatBoost": cb.CatBoostRegressor(verbose=0, random_state=random_state),
             "Extra Trees": ExtraTreesRegressor(random_state=random_state, n_jobs=n_jobs),
             "Bagging": BaggingRegressor(random_state=random_state, n_jobs=n_jobs),
             "Neural Network": MLPRegressor(max_iter=500, random_state=random_state),
             "ElasticNet": ElasticNet(random_state=random_state),
-            "Ridge": Ridge(),
+            "Ridge": Ridge(random_state=random_state),
             "KNN": KNeighborsRegressor(n_jobs=n_jobs),
             "TheilSen": TheilSenRegressor(n_jobs=n_jobs),
             "Huber": HuberRegressor(),
-            "Poisson": PoissonRegressor()
+            "Poisson": PoissonRegressor(), "LinearRegression": LinearRegression(),
+            "Lasso": Lasso(random_state=random_state),
+            "Lars": Lars(),
+            "LassoLars": LassoLars(),
+            "BayesianRidge": BayesianRidge(),
+            "GammaRegressor": GammaRegressor(),
+            "TweedieRegressor": TweedieRegressor(),
+            "LassoCV": LassoCV(random_state=random_state, n_jobs=n_jobs),  # NB: duplicate key; this overrides the cv=cv_folds entry above
+            "ElasticNetCV": ElasticNetCV(random_state=random_state, n_jobs=n_jobs),
+            "LassoLarsCV": LassoLarsCV(n_jobs=n_jobs),
+            "LarsCV": LarsCV(),
+            "OrthogonalMatchingPursuit": OrthogonalMatchingPursuit(),
+            "OrthogonalMatchingPursuitCV": OrthogonalMatchingPursuitCV(n_jobs=n_jobs),
+            "PassiveAggressiveRegressor": PassiveAggressiveRegressor(random_state=random_state),
+            "LinearSVR": LinearSVR(random_state=random_state),
+            "NuSVR": NuSVR(),
+            "DecisionTreeRegressor": DecisionTreeRegressor(random_state=random_state),
+            "ExtraTreeRegressor": ExtraTreeRegressor(random_state=random_state),
+            "HistGradientBoostingRegressor": HistGradientBoostingRegressor(random_state=random_state),
+            "GaussianProcessRegressor": GaussianProcessRegressor(),
+            "KernelRidge": KernelRidge(),
+            "DummyRegressor": DummyRegressor(),
+            "TransformedTargetRegressor": TransformedTargetRegressor(regressor=LinearRegression())
         }
     if cls is None:
         models = model_
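
This release roughly doubles the regressor zoo (Lars, LassoLars, BayesianRidge, Tweedie, NuSVR, KernelRidge, and a DummyRegressor baseline, among others). A sketch of how such a name-to-estimator dict is commonly screened; this is an illustration, not the function's actual loop (which uses GridSearchCV further down):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import BayesianRidge, LinearRegression
from sklearn.model_selection import cross_val_score

X, y = make_regression(n_samples=200, n_features=10, noise=10, random_state=1)

models = {
    "Linear Regression": LinearRegression(),
    "BayesianRidge": BayesianRidge(),
    "DummyRegressor": DummyRegressor(strategy="mean"),  # sanity baseline
}
for name, est in models.items():
    r2 = cross_val_score(est, X, y, cv=5, scoring="r2")
    print(f"{name:>20}: R2 = {r2.mean():.3f} +/- {r2.std():.3f}")

Keeping a DummyRegressor in the dict is a cheap guard: any model that cannot beat the mean predictor is not learning anything.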
@@ -2411,10 +2479,17 @@ def predict(
         ips.df_special_characters_cleaner(x_true) if x_true is not None else None
     )
 
+    # only keep "autogluon_tab" in models
+    cls = [cls] if isinstance(cls, str) else cls
+
+    if cls is not None:
+        models = {"autogluon_tab": None} if "auto" in cls else models
+
     # indicate cls:
     if ips.run_once_within(30):  # 10 min
         print(f"processing: {list(models.keys())}")
-    print(isinstance(y_train, str) and y_train in x_train.columns)
+    y_train_col_name = None
+    # print(isinstance(y_train, str) and y_train in x_train.columns)
     if isinstance(y_train, str) and y_train in x_train.columns:
         y_train_col_name = y_train
         y_train = x_train[y_train]
@@ -2531,19 +2606,261 @@ def predict(
     if isinstance(y_train, np.ndarray):
         y_true = ips.df_encoder(data=pd.DataFrame(y_true), method="label")
         y_true = np.asarray(y_true)
+    #! so far, got the: x_train, x_true, y_train, y_true
+    # Grid search with KFold or StratifiedKFold (sklearn branch below; AutoGluon does its own tuning)
+    if "autogluon_tab" in models:
+        # load hyper_param
+        f_param = os.path.dirname(os.path.abspath(__file__))
+        f_param = f_param + "/data/hyper_param_autogluon_zeroshot2024.json"
+        with open(f_param, "r") as file:
+            hyper_param_autogluon = json.load(file)
+        # Train the model with AutoGluon
+        features = x_train.columns.tolist()
+        label = y_train_col_name if y_train_col_name is not None else 'target'
+        df_autogluon = x_train.copy()
+        df_autogluon[label] = y_train
+        autogluon_presets = ["best_quality", "good_quality", "fast_train"]
+        best_clf = TabularPredictor(label=label, path=os.path.join(dir_save, "model_autogluon")).fit(
+            train_data=df_autogluon,
+            presets=ips.strcmp(presets, autogluon_presets)[0],  # 'best_quality' or 'good_quality' or 'fast_train'
+            time_limit=time_limit,  # in sec: limit training time, e.g. 3600
+            num_bag_folds=num_bag_folds,
+            num_stack_levels=num_stack_levels,
+            hyperparameters=hyper_param_autogluon,
+            verbosity=1 if verbose else 0,
+            **kwargs
+        )
+        #! Get the leaderboard
+        gs = {}
+        # Display the leaderboard for reference
+        leaderboard = best_clf.leaderboard()
+        gs['info'] = best_clf.info()
+        # gs["res"] = best_clf
+        gs["features"] = features
+        gs["leaderboard"] = leaderboard
+        best_model_name = leaderboard.iloc[0, 0]  # First row, first column contains the model name
+        # Store the best model's details in the gs dictionary
+        gs["best_estimator_"] = best_model_name  # leaderboard name; the fitted models live inside the TabularPredictor
+        gs["best_params_"] = best_model_name  # stand-in: AutoGluon exposes no single hyperparameter dict here
+        # Make predictions if x_true is provided
+        if x_true is not None:
+            x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
+            gs["predictions"] = best_clf.predict(x_true[features], model=None)  # model=None selects the best model
+            gs["predict_proba"] = best_clf.predict_proba(x_true[features]) if purpose == 'classification' else None
+            x_true[label] = gs["predictions"]
+            if gs["predictions"].value_counts().shape[0] > 1:
+                gs['evaluate'] = best_clf.evaluate(x_true[features + [label]])
+            gs["models"] = leaderboard["model"].tolist()  # best_clf.model_names()
+            all_models = gs["models"]
+            model_evaluations = {}
+            for model in all_models:
+                predictions = best_clf.predict(x_true[features], model=model)
+                evaluation = best_clf.evaluate_predictions(
+                    y_true=x_true[label],  # NB: filled above with the best model's predictions
+                    y_pred=predictions,  # Predictions from the specific model
+                    auxiliary_metrics=True,  # Include additional metrics if needed
+                )
+                model_evaluations[model] = evaluation
+            gs["scores"] = pd.DataFrame.from_dict(model_evaluations, orient='index')
+            #! try to keep the same output format as the sklearn branch
+            results = {}
+            for model in all_models:
+                y_pred = best_clf.predict(x_true[features], model=model).tolist()
+                y_pred_proba = best_clf.predict_proba(x_true[features], model=model) if purpose == 'classification' else None
+
+                if isinstance(y_pred_proba, pd.DataFrame):
+                    y_pred_proba = y_pred_proba.iloc[:, 1]
+
+                # try to make the prediction format consistent
+                try:
+                    y_pred = [i[0] for i in y_pred]
+                except:
+                    pass
+                try:
+                    y_true = [i[0] for i in y_true]
+                except:
+                    pass
+                try:
+                    y_train = [i[0] for i in y_train]
+                except:
+                    pass
+                validation_scores = {}
+                if y_true is not None and y_pred_proba is not None:
+                    validation_scores = cal_metrics(
+                        y_true,
+                        y_pred,
+                        y_pred_proba=y_pred_proba,
+                        is_binary=is_binary,
+                        purpose=purpose,
+                        average="weighted",
+                    )
+                    if is_binary:
+                        # Calculate ROC curve
+                        # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
+                        if y_pred_proba is not None:
+                            # fpr, tpr, roc_auc = dict(), dict(), dict()
+                            fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
+                            lower_ci, upper_ci = cal_auc_ci(
+                                y_true, y_pred_proba, verbose=False, is_binary=is_binary
+                            )
+                            roc_auc = auc(fpr, tpr)
+                            roc_info = {
+                                "fpr": fpr.tolist(),
+                                "tpr": tpr.tolist(),
+                                "auc": roc_auc,
+                                "ci95": (lower_ci, upper_ci),
+                            }
+                            # precision-recall curve
+                            precision_, recall_, _ = cal_precision_recall(y_true, y_pred_proba)
+                            avg_precision_ = average_precision_score(y_true, y_pred_proba)
+                            pr_info = {
+                                "precision": precision_,
+                                "recall": recall_,
+                                "avg_precision": avg_precision_,
+                            }
+                        else:
+                            roc_info, pr_info = None, None
+                        if purpose == "classification":
+                            results[model] = {
+                                # "best_clf": gs.best_estimator_,
+                                # "best_params": gs.best_params_,
+                                # "auc_indiv": [
+                                #     gs.cv_results_[f"split{i}_test_score"][gs.best_index_]
+                                #     for i in range(cv_folds)
+                                # ],
+                                "scores": validation_scores,
+                                "roc_curve": roc_info,
+                                "pr_curve": pr_info,
+                                "confusion_matrix": confusion_matrix(y_true, y_pred),
+                                "predictions": y_pred,  # .tolist(),
+                                "predictions_proba": (
+                                    y_pred_proba.tolist() if y_pred_proba is not None else None
+                                ),
+                                "features": features,
+                                # "coef": coef_,
+                                # "alphas": alphas_
+                            }
+                        else:  # "regression"
+                            results[model] = {
+                                # "best_clf": gs.best_estimator_,
+                                # "best_params": gs.best_params_,
+                                "scores": validation_scores,  # e.g., neg_MSE, R², etc.
+                                "predictions": y_pred,  # .tolist(),
+                                "predictions_proba": (
+                                    y_pred_proba.tolist() if y_pred_proba is not None else None
+                                ),
+                                "features": features,
+                                # "coef": coef_,
+                                # "alphas": alphas_
+                            }
+                    else:  # multi-classes
+                        if y_pred_proba is not None:
+                            # fpr, tpr, roc_auc = dict(), dict(), dict()
+                            # fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
+                            confidence_intervals = cal_auc_ci(
+                                y_true, y_pred_proba, verbose=False, is_binary=is_binary
+                            )
+                            roc_info = {
+                                "fpr": validation_scores["fpr"],
+                                "tpr": validation_scores["tpr"],
+                                "auc": validation_scores["roc_auc_by_class"],
+                                "ci95": confidence_intervals,
+                            }
+                            # precision-recall curve
+                            precision_, recall_, avg_precision_ = cal_precision_recall(
+                                y_true, y_pred_proba, is_binary=is_binary
+                            )
+                            pr_info = {
+                                "precision": precision_,
+                                "recall": recall_,
+                                "avg_precision": avg_precision_,
+                            }
+                        else:
+                            roc_info, pr_info = None, None
+
+                        if purpose == "classification":
+                            results[model] = {
+                                # "best_clf": gs.best_estimator_,
+                                # "best_params": gs.best_params_,
+                                # "auc_indiv": [
+                                #     gs.cv_results_[f"split{i}_test_score"][gs.best_index_]
+                                #     for i in range(cv_folds)
+                                # ],
+                                "scores": validation_scores,
+                                "roc_curve": roc_info,
+                                "pr_curve": pr_info,
+                                "confusion_matrix": confusion_matrix(y_true, y_pred),
+                                "predictions": y_pred,  # .tolist(),
+                                "predictions_proba": (
+                                    y_pred_proba.tolist() if y_pred_proba is not None else None
+                                ),
+                                "features": features,
+                                # "coef": coef_,
+                                # "alphas": alphas_
+                            }
+                        else:  # "regression"
+                            results[model] = {
+                                # "best_clf": gs.best_estimator_,
+                                # "best_params": gs.best_params_,
+                                "scores": validation_scores,  # e.g., neg_MSE, R², etc.
+                                "predictions": y_pred,  # .tolist(),
+                                "predictions_proba": (
+                                    y_pred_proba.tolist() if y_pred_proba is not None else None
+                                ),
+                                "features": features,
+                                # "coef": coef_,
+                                # "alphas": alphas_
+                            }
 
-    # Hyperparameter grids for tuning
-    param_grid_common_xgb = {
-        'learning_rate': [0.01, 0.1, 0.2, 0.3],
-        'max_depth': [3, 5, 7, 10],
-        'n_estimators': [50, 100, 200, 300],
-        'subsample': [0.6, 0.8, 1.0],
-        'colsample_bytree': [0.6, 0.8, 1.0],
-        'gamma': [0, 0.1, 0.2, 0.5],
-        'min_child_weight': [1, 5, 10],
-        'reg_alpha': [0, 0.1, 0.5, 1],  # L1 regularization term
-        'reg_lambda': [1, 1.5, 2],  # L2 regularization term
+                else:
+                    if y_true is None:
+                        validation_scores = []
+                    else:
+                        validation_scores = cal_metrics(
+                            y_true,
+                            y_pred,
+                            y_pred_proba=y_pred_proba,
+                            is_binary=is_binary,
+                            purpose=purpose,
+                            average="weighted",
+                        )
+                    results[model] = {
+                        # "best_clf": gs.best_estimator_,
+                        # "best_params": gs.best_params_,
+                        "scores": validation_scores,
+                        "predictions": y_pred,  # .tolist(),
+                        "predictions_proba": (
+                            y_pred_proba.tolist() if y_pred_proba is not None else None
+                        ),
+                        "features": features,
+                        "y_train": y_train if y_train is not None else [],
+                        "y_true": y_true if y_true is not None else [],
+                        # "coef": coef_,
+                        # "alphas": alphas_
     }
+            df_results = pd.DataFrame.from_dict(results, orient="index")
+            gs['res'] = df_results
+
+            if all([plot_, y_true is not None, purpose == "classification"]):
+                from datetime import datetime
+
+                now_ = datetime.now().strftime("%y%m%d_%H%M%S")
+                # try:
+                if df_results.shape[0] > 3:
+                    try:
+                        plot_validate_features(df_results, is_binary=is_binary)
+                    except Exception as e:
+                        print(e)
+                else:
+                    try:
+                        plot_validate_features_single(df_results, is_binary=is_binary)
+                    except Exception as e:
+                        print(e)
+                if dir_save:
+                    ips.figsave(dir_save + f"validate_features{now_}.pdf")
+        return gs
+
+    #! cross_valid
     if cv_level in ["low", "simple", "s", "l"]:
         param_grids = {
             "Random Forest": (
@@ -2712,7 +3029,73 @@ def predict(
                 'alpha': [0.1],
                 'max_iter': [100],},
             "Poisson": {'alpha': [0.1],
-                'max_iter': [100],}
+                'max_iter': [100],},
+            "Lars": {"n_nonzero_coefs": [10, 50, None]},
+            "LassoLars": {"alpha": [0.01, 0.1, 1]},
+            "BayesianRidge": {
+                "alpha_1": [1e-6, 1e-4, 1e-2],
+                "lambda_1": [1e-6, 1e-4, 1e-2]
+            },
+            "GammaRegressor": {"alpha": [0.1, 1, 10]},
+            "TweedieRegressor": {
+                "alpha": [0.1, 1, 10],
+                "power": [1, 1.5, 2]
+            },
+            "LassoCV": {"cv": [5]},
+            "ElasticNetCV": {
+                "l1_ratio": [0.2, 0.5, 0.8],
+                "cv": [5]
+            },
+            "LassoLarsCV": {"cv": [5]},
+            "LarsCV": {"cv": [5]},
+            "OrthogonalMatchingPursuit": {"n_nonzero_coefs": [10, 50, None]},
+            "OrthogonalMatchingPursuitCV": {"cv": [5]},
+            "PassiveAggressiveRegressor": {"C": [0.1, 1, 10]},
+            "LinearSVR": {"C": [0.1, 1, 10]},
+            "NuSVR": {"C": [0.1, 1, 10]},
+            "DecisionTreeRegressor": {"max_depth": [5, 10, None]},
+            "ExtraTreeRegressor": {"max_depth": [5, 10, None]},
+            "HistGradientBoostingRegressor": {
+                "learning_rate": [0.05, 0.1, 0.2],
+                "max_depth": [5, 10, None]
+            },
+            "GaussianProcessRegressor": {"alpha": [1e-5, 1e-2, 0.1]},
+            "KernelRidge": {
+                "alpha": [0.1, 1, 10],
+                "kernel": ["linear", "rbf"]
+            },
+            "DummyRegressor": {"strategy": ["mean", "median"]},
+            "TransformedTargetRegressor": {
+                "regressor__fit_intercept": [True, False]
+            }
         }
     elif cv_level in ["high", "advanced", "h"]:
         param_grids = {
@@ -2917,7 +3300,96 @@ def predict(
                 'alpha': [0.1, 1.0, 10.0],
                 'max_iter': [100, 200, 300],},
             "Poisson": {'alpha': [0.1, 1.0, 10.0],
-                'max_iter': [100, 200, 300],}
+                'max_iter': [100, 200, 300],},
+            "Lars": {"n_nonzero_coefs": [10, 50, 100, 200, None]},
+            "LassoLars": {"alpha": [0.001, 0.01, 0.1, 1, 10]},
+            "BayesianRidge": {
+                "alpha_1": [1e-6, 1e-5, 1e-4],
+                "alpha_2": [1e-6, 1e-5, 1e-4],
+                "lambda_1": [1e-6, 1e-5, 1e-4],
+                "lambda_2": [1e-6, 1e-5, 1e-4]
+            },
+            "GammaRegressor": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "max_iter": [1000, 5000, 10000]
+            },
+            "TweedieRegressor": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "power": [0, 1, 1.5, 2, 3]
+            },
+            "LassoCV": {
+                "alphas": [[0.001, 0.01, 0.1, 1, 10]],
+                "cv": [3, 5, 10]
+            },
+            "ElasticNetCV": {
+                "l1_ratio": [0.1, 0.5, 0.7, 0.9, 1],
+                "alphas": [[0.001, 0.01, 0.1, 1, 10]],
+                "cv": [3, 5, 10]
+            },
+            "LassoLarsCV": {"cv": [3, 5, 10]},
+            "LarsCV": {"cv": [3, 5, 10]},
+            "OrthogonalMatchingPursuit": {"n_nonzero_coefs": [10, 50, 100, 200, None]},
+            "OrthogonalMatchingPursuitCV": {"cv": [3, 5, 10]},
+            "PassiveAggressiveRegressor": {
+                "C": [0.01, 0.1, 1, 10],
+                "max_iter": [1000, 5000, 10000],
+                "early_stopping": [True, False]
+            },
+            "LinearSVR": {
+                "C": [0.01, 0.1, 1, 10],
+                "epsilon": [0.01, 0.1, 1],
+                "max_iter": [1000, 5000, 10000]
+            },
+            "NuSVR": {
+                "C": [0.01, 0.1, 1, 10],
+                "nu": [0.25, 0.5, 0.75],
+                "kernel": ["linear", "poly", "rbf", "sigmoid"]
+            },
+            "DecisionTreeRegressor": {
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_split": [2, 5, 10],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "ExtraTreeRegressor": {
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_split": [2, 5, 10],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "HistGradientBoostingRegressor": {
+                "learning_rate": [0.01, 0.1, 0.2],
+                "max_iter": [100, 500, 1000],
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "GaussianProcessRegressor": {
+                "alpha": [1e-10, 1e-5, 1e-2, 0.1],
+                "n_restarts_optimizer": [0, 1, 5, 10]
+            },
+            "KernelRidge": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "kernel": ["linear", "poly", "rbf", "sigmoid"],
+                "degree": [2, 3, 4]
+            },
+            "DummyRegressor": {
+                "strategy": ["mean", "median", "constant"],
+                "constant": [0]  # Only if strategy is 'constant'
+            },
+            "TransformedTargetRegressor": {
+                # Grid for the underlying regressor, example shown for LinearRegression
+                "regressor__fit_intercept": [True, False]
+            }
         }
     else:  # median level
         param_grids = {
3164
3636
  'alpha': [0.1, 1.0],
3165
3637
  'max_iter': [100, 200],},
3166
3638
  "Poisson":{'alpha': [0.1, 1.0],
3167
- 'max_iter': [100, 200],}
3639
+ 'max_iter': [100, 200],},
3640
+ "Lars": {
3641
+ "n_nonzero_coefs": [10, 50, 100, 200, None]
3642
+ },
3643
+ "LassoLars": {
3644
+ "alpha": [0.001, 0.01, 0.1, 1, 10]
3645
+ },
3646
+ "BayesianRidge": {
3647
+ "alpha_1": [1e-6, 1e-5, 1e-4],
3648
+ "alpha_2": [1e-6, 1e-5, 1e-4],
3649
+ "lambda_1": [1e-6, 1e-5, 1e-4],
3650
+ "lambda_2": [1e-6, 1e-5, 1e-4]
3651
+ },
3652
+ "GammaRegressor": {
3653
+ "alpha": [0.01, 0.1, 1, 10],
3654
+ "max_iter": [1000, 5000, 10000]
3655
+ },
3656
+ "TweedieRegressor": {
3657
+ "alpha": [0.01, 0.1, 1, 10],
3658
+ "power": [0, 1, 1.5, 2, 3]
3659
+ },
3660
+ "LassoCV": {
3661
+ "alphas": [[0.001, 0.01, 0.1, 1, 10]],
3662
+ "cv": [3, 5, 10]
3663
+ },
3664
+ "ElasticNetCV": {
3665
+ "l1_ratio": [0.1, 0.5, 0.7, 0.9, 1],
3666
+ "alphas": [[0.001, 0.01, 0.1, 1, 10]],
3667
+ "cv": [3, 5, 10]
3668
+ },
3669
+ "LassoLarsCV": {
3670
+ "cv": [3, 5, 10]
3671
+ },
3672
+ "LarsCV": {
3673
+ "cv": [3, 5, 10]
3674
+ },
3675
+ "OrthogonalMatchingPursuit": {
3676
+ "n_nonzero_coefs": [10, 50, 100, 200, None]
3677
+ },
3678
+ "OrthogonalMatchingPursuitCV": {
3679
+ "cv": [3, 5, 10]
3680
+ },
3681
+ "PassiveAggressiveRegressor": {
3682
+ "C": [0.01, 0.1, 1, 10],
3683
+ "max_iter": [1000, 5000, 10000],
3684
+ "early_stopping": [True, False]
3685
+ },
3686
+ "LinearSVR": {
3687
+ "C": [0.01, 0.1, 1, 10],
3688
+ "epsilon": [0.01, 0.1, 1],
3689
+ "max_iter": [1000, 5000, 10000]
3690
+ },
3691
+ "NuSVR": {
3692
+ "C": [0.01, 0.1, 1, 10],
3693
+ "nu": [0.25, 0.5, 0.75],
3694
+ "kernel": ["linear", "poly", "rbf", "sigmoid"]
3695
+ },
3696
+ "DecisionTreeRegressor": {
3697
+ "max_depth": [None, 5, 10, 20],
3698
+ "min_samples_split": [2, 5, 10],
3699
+ "min_samples_leaf": [1, 2, 4]
3700
+ },
3701
+ "ExtraTreeRegressor": {
3702
+ "max_depth": [None, 5, 10, 20],
3703
+ "min_samples_split": [2, 5, 10],
3704
+ "min_samples_leaf": [1, 2, 4]
3705
+ },
3706
+ "HistGradientBoostingRegressor": {
3707
+ "learning_rate": [0.01, 0.1, 0.2],
3708
+ "max_iter": [100, 500, 1000],
3709
+ "max_depth": [None, 5, 10, 20],
3710
+ "min_samples_leaf": [1, 2, 4]
3711
+ },
3712
+ "GaussianProcessRegressor": {
3713
+ "alpha": [1e-10, 1e-5, 1e-2, 0.1],
3714
+ "n_restarts_optimizer": [0, 1, 5, 10]
3715
+ },
3716
+ "KernelRidge": {
3717
+ "alpha": [0.01, 0.1, 1, 10],
3718
+ "kernel": ["linear", "poly", "rbf", "sigmoid"],
3719
+ "degree": [2, 3, 4]
3720
+ },
3721
+ "DummyRegressor": {
3722
+ "strategy": ["mean", "median", "constant"],
3723
+ "constant": [0] # Only if strategy is 'constant'
3724
+ },
3725
+ "TransformedTargetRegressor": {
3726
+ # Grid for the underlying regressor, example shown for LinearRegression
3727
+ "regressor__fit_intercept": [True, False]
3728
+ }
3168
3729
  }
3169
3730
 
3170
3731
  results = {}
@@ -3174,7 +3735,7 @@ def predict(
         if purpose == "classification"
         else KFold(n_splits=cv_folds, shuffle=True, random_state=random_state)
     )
-
+
     # Train and validate each model
     for name, clf in tqdm(
         models.items(),
@@ -3185,8 +3746,7 @@ def predict(
         if verbose:
             print(f"\nTraining and validating {name}:")
         try:
-            # Grid search with KFold or StratifiedKFold
-            if is_binary:
+            if is_binary:
                 gs = GridSearchCV(
                     clf,
                     param_grid=param_grids.get(name, {}),
@@ -3202,6 +3762,7 @@ def predict(
 
                 gs.fit(x_train, y_train)
                 best_clf = gs.best_estimator_
+
                 # make sure x_true has the same columns as x_train
                 x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
                 y_pred = best_clf.predict(x_true)
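
Each estimator is tuned with GridSearchCV against its entry in param_grids, falling back to an empty grid when the name is missing. A condensed sketch of that pattern with standard sklearn APIs (the model and grid below are illustrative, not py2ls defaults):

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

X, y = make_classification(n_samples=300, random_state=1)

param_grids = {"Random Forest": {"n_estimators": [50, 100], "max_depth": [None, 10]}}
models = {"Random Forest": RandomForestClassifier(random_state=1)}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
for name, clf in models.items():
    gs = GridSearchCV(clf, param_grid=param_grids.get(name, {}), cv=cv,
                      scoring="roc_auc", n_jobs=-1)
    gs.fit(X, y)
    print(name, gs.best_params_, round(gs.best_score_, 3))

An empty grid is valid: GridSearchCV then simply cross-validates the estimator's defaults, which is why the `.get(name, {})` fallback never crashes.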
@@ -3212,7 +3773,14 @@ def predict(
                         y_pred_proba = np.hstack(
                             [1 - y_pred_proba, y_pred_proba]
                         )  # Add missing class probabilities
-                    y_pred_proba = y_pred_proba[:, 1]
+                    if y_pred_proba.shape[1] == 2:
+                        if isinstance(y_pred_proba, pd.DataFrame):
+                            y_pred_proba = y_pred_proba.iloc[:, 1]
+                        elif isinstance(y_pred_proba, pd.Series):
+                            y_pred_proba = y_pred_proba.values[:, 1]
+                        else:
+                            y_pred_proba = y_pred_proba[:, 1]
+
                 elif hasattr(best_clf, "decision_function"):
                     # If predict_proba is not available, use decision_function (e.g., for SVM)
                     y_pred_proba = best_clf.decision_function(x_true)
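
The branch above narrows predict_proba output (ndarray or DataFrame) down to the positive-class column. Note that a 1-D pd.Series has no second axis, so its `shape[1]` check would already raise and the `.values[:, 1]` arm is effectively unreachable except with malformed input. A hedged sketch of the same normalization as a small helper (positive_proba is a hypothetical name):

import numpy as np
import pandas as pd


def positive_proba(proba) -> np.ndarray:
    """Return the positive-class probability as a flat array,
    accepting the shapes predict_proba commonly produces."""
    if isinstance(proba, pd.Series):  # already a single column
        return proba.to_numpy()
    arr = proba.to_numpy() if isinstance(proba, pd.DataFrame) else np.asarray(proba)
    if arr.ndim == 2 and arr.shape[1] == 2:
        return arr[:, 1]              # standard binary predict_proba
    return arr.ravel()                # single-column or decision_function output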
@@ -3233,7 +3801,7 @@ def predict(
                 else:
                     alphas_ = None
                 coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
-            else:
+            else:
                 gs = GridSearchCV(
                     clf,
                     param_grid=param_grids.get(name, {}),
@@ -3289,6 +3857,7 @@ def predict(
                 alphas_, coef_ = None, None
                 print(f"skipped {clf}: {e}")
                 continue
+
         # try to make the prediction format consistent
         try:
             y_pred = [i[0] for i in y_pred]
@@ -3460,7 +4029,8 @@ def predict(
 
     # Convert results to DataFrame
     df_results = pd.DataFrame.from_dict(results, orient="index")
-    # sort
+    display(df_results)  # NB: display() assumes an IPython/Jupyter environment
+    # sort
     if y_true is not None:
         if purpose == "classification":
             df_scores = pd.DataFrame(
@@ -4028,6 +4598,8 @@ def predict(
         df_res = pd.concat([df_vote, df_results], ignore_index=False, axis=0)
     elif stack:
         df_res = pd.concat([df_pred, df_results], ignore_index=False, axis=0)
+    else:
+        df_res = df_results
 
     if all([plot_, y_true is not None, purpose == "classification"]):
         from datetime import datetime