py2ls 0.2.4.24__py3-none-any.whl → 0.2.4.26__py3-none-any.whl
- py2ls/.DS_Store +0 -0
- py2ls/.git/index +0 -0
- py2ls/corr.py +475 -0
- py2ls/data/.DS_Store +0 -0
- py2ls/data/hyper_param_autogluon_zeroshot2024.json +2383 -0
- py2ls/data/styles/.DS_Store +0 -0
- py2ls/data/styles/example/.DS_Store +0 -0
- py2ls/data/usages_sns.json +6 -1
- py2ls/ec2ls.py +61 -0
- py2ls/ips.py +496 -138
- py2ls/ml2ls.py +994 -288
- py2ls/netfinder.py +16 -20
- py2ls/nl2ls.py +283 -0
- py2ls/plot.py +1244 -158
- {py2ls-0.2.4.24.dist-info → py2ls-0.2.4.26.dist-info}/METADATA +5 -1
- {py2ls-0.2.4.24.dist-info → py2ls-0.2.4.26.dist-info}/RECORD +17 -14
- py2ls/data/usages_pd copy.json +0 -1105
- py2ls/ml2ls copy.py +0 -2906
- {py2ls-0.2.4.24.dist-info → py2ls-0.2.4.26.dist-info}/WHEEL +0 -0
py2ls/ml2ls.py
CHANGED
@@ -31,6 +31,7 @@ from sklearn.metrics import (
     average_precision_score,
 )
 from typing import Dict, Any, Optional, List, Union
+import os, json
 import numpy as np
 import pandas as pd
 from . import ips
@@ -49,7 +50,13 @@ logger = logging.getLogger()
 warnings.filterwarnings("ignore", category=UserWarning)
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.neighbors import KNeighborsClassifier
-
+#* set random_state global
+import torch
+import random
+random_state=1
+random.seed(random_state)
+np.random.seed(random_state)
+torch.manual_seed(random_state)
 
 def features_knn(
     x_train: pd.DataFrame, y_train: pd.Series, knn_params: dict
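The hunk above pins one global seed across Python's `random`, NumPy, and PyTorch at import time. A minimal sketch of the same pattern (the `set_global_seed` helper name is illustrative, not part of the package):

```python
import random

import numpy as np
import torch


def set_global_seed(seed: int = 1) -> None:
    """Seed the stdlib, NumPy, and PyTorch RNGs so repeated runs match."""
    random.seed(seed)        # Python stdlib RNG
    np.random.seed(seed)     # NumPy legacy global RNG
    torch.manual_seed(seed)  # PyTorch CPU (and default CUDA) seed


set_global_seed(1)  # mirrors the module-level random_state = 1 above
```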
@@ -594,7 +601,7 @@ def get_features(
     """
     from sklearn.compose import ColumnTransformer
     from sklearn.preprocessing import StandardScaler, OneHotEncoder
-
+    from sklearn.model_selection import train_test_split
     # Ensure X and y are DataFrames/Series for consistency
     if isinstance(X, np.ndarray):
         X = pd.DataFrame(X)
@@ -922,10 +929,26 @@ def get_features(
         "feature_importances": feature_importances,
     }
     if all([plot_, dir_save]):
+
         from datetime import datetime
-
         now_ = datetime.now().strftime("%y%m%d_%H%M%S")
         ips.figsave(dir_save + f"features{now_}.pdf")
+
+        lists = []
+        for tp in ips.flatten(features_df["type"]):
+            lists.append(
+                features_df
+                .loc[features_df["type"] == tp, "feature"]
+                .tolist()
+            )
+        labels = ips.flatten(features_df["type"])
+        # current_fig = plt.gcf()
+        # # ax = current_fig.add_subplot(3, 2, 6)
+        # gs = current_fig.add_gridspec(3, 2)
+        # ax = current_fig.add_subplot(gs[:, :])
+        plt.figure(figsize=[6,6])
+        plot.venn(lists, labels, cmap="coolwarm")
+        ips.figsave(dir_save + f"features{now_}shared_features.pdf")
     else:
         results = {
             "selected_features": pd.DataFrame(),
@@ -1247,22 +1270,25 @@ def plot_validate_features(res_val, is_binary=True, figsize=None):
     nexttile = plot.subplot(figsize=figsize)
     ax = nexttile(subplot_layout[0], subplot_layout[1])
     for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-… (16 removed lines collapsed in the diff view)
+        try:
+            fpr = res_val["roc_curve"][model_name]["fpr"]
+            tpr = res_val["roc_curve"][model_name]["tpr"]
+            (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
+            mean_auc = res_val["roc_curve"][model_name]["auc"]
+            plot_roc_curve(
+                fpr,
+                tpr,
+                mean_auc,
+                lower_ci,
+                upper_ci,
+                model_name=model_name,
+                lw=1.5,
+                color=colors[i],
+                alpha=alpha,
+                ax=ax,
+            )
+        except Exception as e:
+            print(e)
     plot.figsets(
         sp=2,
         legend=dict(
@@ -1277,16 +1303,19 @@ def plot_validate_features(res_val, is_binary=True, figsize=None):
 
     ax = nexttile(subplot_layout[0], subplot_layout[1])
     for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-… (10 removed lines collapsed in the diff view)
+        try:
+            plot_pr_curve(
+                recall=res_val["pr_curve"][model_name]["recall"],
+                precision=res_val["pr_curve"][model_name]["precision"],
+                avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
+                model_name=model_name,
+                color=colors[i],
+                lw=1.5,
+                alpha=alpha,
+                ax=ax,
+            )
+        except Exception as e:
+            print(e)
     plot.figsets(
         sp=2,
         legend=dict(
@@ -1314,22 +1343,25 @@ def plot_validate_features(res_val, is_binary=True, figsize=None):
     for iclass, class_ in enumerate(classes):
         ax = nexttile(subplot_layout[0], subplot_layout[1])
         for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-… (16 removed lines collapsed in the diff view)
+            try:
+                fpr = res_val["roc_curve"][model_name]["fpr"][class_]
+                tpr = res_val["roc_curve"][model_name]["tpr"][class_]
+                (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"][iclass]
+                mean_auc = res_val["roc_curve"][model_name]["auc"][class_]
+                plot_roc_curve(
+                    fpr,
+                    tpr,
+                    mean_auc,
+                    lower_ci,
+                    upper_ci,
+                    model_name=model_name,
+                    lw=1.5,
+                    color=colors[i],
+                    alpha=alpha,
+                    ax=ax,
+                )
+            except Exception as e:
+                print(e)
         plot.figsets(
             sp=2,
             title=class_,
@@ -1345,18 +1377,21 @@ def plot_validate_features(res_val, is_binary=True, figsize=None):
 
         ax = nexttile(subplot_layout[0], subplot_layout[1])
         for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-… (12 removed lines collapsed in the diff view)
+            try:
+                plot_pr_curve(
+                    recall=res_val["pr_curve"][model_name]["recall"][iclass],
+                    precision=res_val["pr_curve"][model_name]["precision"][iclass],
+                    avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
+                        iclass
+                    ],
+                    model_name=model_name,
+                    color=colors[i],
+                    lw=1.5,
+                    alpha=alpha,
+                    ax=ax,
+                )
+            except Exception as e:
+                print(e)
         plot.figsets(
             sp=2,
             title=class_,
@@ -1379,37 +1414,41 @@ def plot_validate_features_single(res_val, figsize=None, is_binary=True):
         len(ips.flatten(res_val["pr_curve"].index)), 3, figsize=figsize
     )
     for model_name in ips.flatten(res_val["pr_curve"].index):
-… (5 removed lines collapsed in the diff view)
-        # Plotting
-        plot_roc_curve(
-            fpr,
-            tpr,
-            mean_auc,
-            lower_ci,
-            upper_ci,
-            model_name=model_name,
-            ax=nexttile(),
-        )
-        plot.figsets(title=model_name, sp=2)
+        try:
+            fpr = res_val["roc_curve"][model_name]["fpr"]
+            tpr = res_val["roc_curve"][model_name]["tpr"]
+            (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
+            mean_auc = res_val["roc_curve"][model_name]["auc"]
 
-… (8 removed lines collapsed in the diff view)
+            # Plotting
+            plot_roc_curve(
+                fpr,
+                tpr,
+                mean_auc,
+                lower_ci,
+                upper_ci,
+                model_name=model_name,
+                ax=nexttile(),
+            )
+            plot.figsets(title=model_name, sp=2)
 
-… (5 removed lines collapsed in the diff view)
+            plot_pr_binary(
+                recall=res_val["pr_curve"][model_name]["recall"],
+                precision=res_val["pr_curve"][model_name]["precision"],
+                avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
+                model_name=model_name,
+                ax=nexttile(),
+            )
+            plot.figsets(title=model_name, sp=2)
+
+            # plot cm
+            plot_cm(
+                res_val["confusion_matrix"][model_name], ax=nexttile(), normalize=False
+            )
+            plot.figsets(title=model_name, sp=2)
+
+        except Exception as e:
+            print(e)
     else:
 
         modname_tmp = ips.flatten(res_val["roc_curve"].index)[0]
@@ -1424,22 +1463,25 @@ def plot_validate_features_single(res_val, figsize=None, is_binary=True):
         for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
             ax = nexttile()
             for iclass, class_ in enumerate(classes):
-… (16 removed lines collapsed in the diff view)
+                try:
+                    fpr = res_val["roc_curve"][model_name]["fpr"][class_]
+                    tpr = res_val["roc_curve"][model_name]["tpr"][class_]
+                    (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"][iclass]
+                    mean_auc = res_val["roc_curve"][model_name]["auc"][class_]
+                    plot_roc_curve(
+                        fpr,
+                        tpr,
+                        mean_auc,
+                        lower_ci,
+                        upper_ci,
+                        model_name=class_,
+                        lw=1.5,
+                        color=colors[iclass],
+                        alpha=0.03,
+                        ax=ax,
+                    )
+                except Exception as e:
+                    print(e)
             plot.figsets(
                 sp=2,
                 title=model_name,
@@ -1451,18 +1493,21 @@ def plot_validate_features_single(res_val, figsize=None, is_binary=True):
 
             ax = nexttile()
             for iclass, class_ in enumerate(classes):
-… (12 removed lines collapsed in the diff view)
+                try:
+                    plot_pr_curve(
+                        recall=res_val["pr_curve"][model_name]["recall"][iclass],
+                        precision=res_val["pr_curve"][model_name]["precision"][iclass],
+                        avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
+                            iclass
+                        ],
+                        model_name=class_,
+                        color=colors[iclass],
+                        lw=1.5,
+                        alpha=0.03,
+                        ax=ax,
+                    )
+                except Exception as e:
+                    print(e)
             plot.figsets(
                 sp=2,
                 title=class_,
@@ -1543,15 +1588,12 @@ def cal_auc_ci(
     # print("Bootstrap #{} ROC area: {:0.3f}".format(i + 1, score))
     sorted_scores = np.array(bootstrapped_scores)
     sorted_scores.sort()
-
-    # Computing the lower and upper bound of the 90% confidence interval
-    # You can change the bounds percentiles to 0.025 and 0.975 to get
-    # a 95% confidence interval instead.
+
     confidence_lower = sorted_scores[int((1 - ci) * len(sorted_scores))]
     confidence_upper = sorted_scores[int(ci * len(sorted_scores))]
     if verbose:
         print(
-            "Confidence interval for the score: [{:0.3f} - {:0.
+            "Confidence interval for the score: [{:0.3f} - {:0.3f}]".format(
                 confidence_lower, confidence_upper
             )
         )
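Note the percentile lookup `sorted_scores[int((1 - ci) * len(sorted_scores))]`: with `ci=0.95` it takes the 5th and 95th percentiles, i.e. a 90% interval, which is exactly the caveat the deleted comment used to spell out. A self-contained sketch of the same bootstrap (function name and data are illustrative):

```python
import numpy as np
from sklearn.metrics import roc_auc_score


def bootstrap_auc_ci(y_true, y_score, n_bootstraps=1000, ci=0.95, seed=1):
    """Percentile-bootstrap interval for a binary AUROC."""
    rng = np.random.default_rng(seed)
    y_true, y_score = np.asarray(y_true), np.asarray(y_score)
    scores = []
    for _ in range(n_bootstraps):
        idx = rng.integers(0, len(y_true), len(y_true))  # resample with replacement
        if len(np.unique(y_true[idx])) < 2:
            continue  # AUC is undefined when a resample holds a single class
        scores.append(roc_auc_score(y_true[idx], y_score[idx]))
    sorted_scores = np.sort(scores)
    lower = sorted_scores[int((1 - ci) * len(sorted_scores))]
    upper = sorted_scores[int(ci * len(sorted_scores))]
    return lower, upper


y = np.array([0, 1, 0, 1, 1, 0, 1, 0, 1, 1])
p = np.array([0.1, 0.8, 0.3, 0.7, 0.9, 0.4, 0.6, 0.2, 0.75, 0.55])
print(bootstrap_auc_ci(y, p))
```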
@@ -1568,11 +1610,8 @@ def cal_auc_ci(
         y_true, classes=np.unique(y_true)
     )  # One-vs-Rest transformation
     n_classes = y_true_bin.shape[1]  # Number of classes
-
-    bootstrapped_scores = np.
-        (n_classes, n_bootstraps)
-    )  # Store scores for each class
-
+
+    bootstrapped_scores = np.full((n_classes, n_bootstraps), np.nan)
     if verbose:
         print("AUROC scores for each class:")
     for i in range(n_classes):
@@ -1592,15 +1631,24 @@ def cal_auc_ci(
     # Calculating the confidence intervals for each class
     confidence_intervals = []
     for class_idx in range(n_classes):
-… (8 removed lines collapsed in the diff view)
-        )
+        # rm nan
+        valid_scores = bootstrapped_scores[class_idx][
+            ~np.isnan(bootstrapped_scores[class_idx])
+        ]
+        if len(valid_scores) > 0:
+            sorted_scores = np.sort(valid_scores)
+            confidence_lower = sorted_scores[int((1 - ci) * len(sorted_scores))]
+            confidence_upper = sorted_scores[int(ci * len(sorted_scores))]
+            confidence_intervals[class_idx] = (confidence_lower, confidence_upper)
+
+            if verbose:
+                print(
+                    f"Class {class_idx} - Confidence interval: [{confidence_lower:.3f} - {confidence_upper:.3f}]"
+                )
+        else:
+            confidence_intervals[class_idx] = (np.nan, np.nan)
+            if verbose:
+                print(f"Class {class_idx} - Confidence interval: [NaN - NaN]")
 
     return confidence_intervals
 
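For the multiclass branch, `bootstrapped_scores` is now pre-filled with `np.nan` and the NaNs are dropped before taking percentiles, so bootstrap rounds skipped for a degenerate class simply fall out. The filtering step in isolation (the sketch appends to the list where the diff index-assigns, and the AUC values are faked):

```python
import numpy as np

n_classes, n_bootstraps, ci = 3, 1000, 0.95
rng = np.random.default_rng(1)

# Pre-fill with NaN, as the diff does, so skipped rounds stay NaN.
bootstrapped_scores = np.full((n_classes, n_bootstraps), np.nan)
bootstrapped_scores[:, :800] = rng.uniform(0.6, 0.9, (n_classes, 800))

confidence_intervals = []
for class_idx in range(n_classes):
    row = bootstrapped_scores[class_idx]
    valid_scores = row[~np.isnan(row)]  # drop rounds that never produced a score
    if len(valid_scores) > 0:
        s = np.sort(valid_scores)
        confidence_intervals.append((s[int((1 - ci) * len(s))], s[int(ci * len(s))]))
    else:
        confidence_intervals.append((np.nan, np.nan))
print(confidence_intervals)
```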
@@ -2057,20 +2105,20 @@ def rank_models(
 
     def generate_bar_plot(ax, cv_test_scores):
         ax = plot.plotxy(
-            y="Classifier", x="combined_score", data=cv_test_scores,
+            y="Classifier", x="combined_score", data=cv_test_scores, kind_="bar"
         )
         plt.title("Classifier Performance")
         plt.tight_layout()
         return plt
 
-    nexttile = plot.subplot(2, 2, figsize=[10,
+    nexttile = plot.subplot(2, 2, figsize=[10, 10])
     generate_bar_plot(nexttile(), top_models.dropna())
     plot.radar(
         ax=nexttile(projection="polar"),
         data=cv_test_scores.set_index("Classifier"),
-        ylim=[0
-        color=plot.get_color(
-        alpha=0.
+        ylim=[0, 1],
+        color=plot.get_color(cv_test_scores.set_index("Classifier").shape[1]),
+        alpha=0.02,
         circular=1,
     )
     return cv_test_scores
@@ -2206,6 +2254,8 @@ def predict(
     y_train: pd.Series,
     x_true: pd.DataFrame = None,
     y_true: Optional[pd.Series] = None,
+    fill_missing:bool = True,
+    scaler:str='standard',  # ["standard", "minmax", "robust", "maxabs"]
     backward: bool = False,  # backward_regression
     backward_thr:float = 0.05,  # pval thr, only works when backward is True
     common_features: set = None,
@@ -2214,7 +2264,7 @@ def predict(
     metrics: Optional[List[str]] = None,
     stack:bool=True,  # run stacking
     stacking_cv:bool=False,  # stacking cross_validate, default(False), keep it simple
-    vote:bool=
+    vote:bool=False,  # run voting
     voting:str="hard",  # only for classification purpose of voting
     n_top_models:int=5,  # for stacking models
     n_models_per_category:int=1,  # for stacking models; up to 2 model types per category are allowed
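The new `vote`/`voting` parameters gate an ensemble vote over the fitted models; `voting="hard"` takes the majority class instead of averaging probabilities. A hedged sketch with scikit-learn's `VotingClassifier` (the estimator mix is illustrative):

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

X, y = make_classification(n_samples=200, n_features=8, random_state=1)

clf = VotingClassifier(
    estimators=[
        ("lr", LogisticRegression(max_iter=1000)),
        ("rf", RandomForestClassifier(random_state=1)),
        ("nb", GaussianNB()),
    ],
    voting="hard",  # majority vote; "soft" would average predict_proba instead
)
clf.fit(X, y)
print(clf.predict(X[:5]))
```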
@@ -2227,7 +2277,12 @@ def predict(
     cv_level: str = "l",  # "s":'low', "m":'medium', "l":"high"
     class_weight: str = "balanced",
     random_state: int = 1,
+    presets = "best_quality",  # specific for autogluon
+    time_limit=600,  # specific for autogluon
+    num_bag_folds=5,  # specific for autogluon
+    num_stack_levels=2,  # specific for autogluon
     verbose: bool = False,
+    **kwargs
 ) -> pd.DataFrame:
     """
     Case 1: internal train/test split; case 2: direct prediction; case 3: external validation.
@@ -2278,28 +2333,20 @@ def predict(
         RandomForestRegressor,
         ExtraTreesClassifier,
         ExtraTreesRegressor,
+        HistGradientBoostingRegressor,
         BaggingClassifier,
         BaggingRegressor,
         AdaBoostClassifier,
         AdaBoostRegressor,
     )
-    from sklearn.svm import SVC, SVR
-    from sklearn.tree import DecisionTreeRegressor
+    from sklearn.svm import SVC, SVR, LinearSVR, NuSVR
+    from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
     from sklearn.linear_model import (
-        LogisticRegression,
-… (4 removed lines collapsed in the diff view)
-        RidgeClassifierCV,
-        Perceptron,
-        SGDClassifier,
-        RidgeCV,
-        Ridge,
-        TheilSenRegressor,
-        HuberRegressor,
-        PoissonRegressor,
-
+        LogisticRegression, ElasticNet, ElasticNetCV,
+        LinearRegression, Lasso, RidgeClassifierCV, Perceptron, SGDClassifier,
+        RidgeCV, Ridge, TheilSenRegressor, HuberRegressor, PoissonRegressor, Lars, LassoLars, BayesianRidge,
+        GammaRegressor, TweedieRegressor, LassoCV, LassoLarsCV, LarsCV,
+        OrthogonalMatchingPursuit, OrthogonalMatchingPursuitCV, PassiveAggressiveRegressor
     )
     from sklearn.compose import TransformedTargetRegressor
     from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
@@ -2316,15 +2363,21 @@ def predict(
     )
     from sklearn.preprocessing import PolynomialFeatures
     from sklearn.model_selection import train_test_split
-
+
+    from sklearn.gaussian_process import GaussianProcessRegressor
+    from sklearn.kernel_ridge import KernelRidge
+    from sklearn.dummy import DummyRegressor
+    from autogluon.tabular import TabularPredictor
     # spelling check
     purpose = ips.strcmp(purpose, ["classification", "regression"])[0]
     print(f"{purpose} processing...")
+
+
     # Default models or regressors if not provided
     if purpose == "classification":
         model_ = {
             "Random Forest": RandomForestClassifier(
-                random_state=random_state, class_weight=class_weight
+                random_state=random_state, class_weight=class_weight, n_jobs=n_jobs
             ),
             # SVC (Support Vector Classification)
             "SVM": SVC(
@@ -2335,7 +2388,7 @@ def predict(
             ),
             # fit the best model without enforcing sparsity, which means it does not directly perform feature selection.
             "Logistic Regression": LogisticRegression(
-                class_weight=class_weight, random_state=random_state
+                class_weight=class_weight, random_state=random_state, n_jobs=n_jobs
             ),
             # Logistic Regression with L1 Regularization (Lasso)
             "Lasso Logistic Regression": LogisticRegression(
@@ -2346,51 +2399,70 @@ def predict(
                 eval_metric="logloss",
                 random_state=random_state,
             ),
-            "KNN": KNeighborsClassifier(n_neighbors=5),
+            "KNN": KNeighborsClassifier(n_neighbors=5, n_jobs=n_jobs),
             "Naive Bayes": GaussianNB(),
             "Linear Discriminant Analysis": LinearDiscriminantAnalysis(),
             "AdaBoost": AdaBoostClassifier(
                 algorithm="SAMME", random_state=random_state
             ),
-
+            "LightGBM": lgb.LGBMClassifier(random_state=random_state, class_weight=class_weight, n_jobs=n_jobs),
             "CatBoost": cb.CatBoostClassifier(verbose=0, random_state=random_state),
             "Extra Trees": ExtraTreesClassifier(
-                random_state=random_state, class_weight=class_weight
+                random_state=random_state, class_weight=class_weight, n_jobs=n_jobs
             ),
-            "Bagging": BaggingClassifier(random_state=random_state),
+            "Bagging": BaggingClassifier(random_state=random_state, n_jobs=n_jobs),
             "Neural Network": MLPClassifier(max_iter=500, random_state=random_state),
             "DecisionTree": DecisionTreeClassifier(),
             "Quadratic Discriminant Analysis": QuadraticDiscriminantAnalysis(),
             "Ridge": RidgeClassifierCV(
                 class_weight=class_weight, store_cv_results=True
             ),
-            "Perceptron": Perceptron(random_state=random_state),
+            "Perceptron": Perceptron(random_state=random_state, n_jobs=n_jobs),
             "Bernoulli Naive Bayes": BernoulliNB(),
-            "SGDClassifier": SGDClassifier(random_state=random_state),
+            "SGDClassifier": SGDClassifier(random_state=random_state, n_jobs=n_jobs),
         }
     elif purpose == "regression":
         model_ = {
-            "Random Forest": RandomForestRegressor(random_state=random_state),
+            "Random Forest": RandomForestRegressor(random_state=random_state, n_jobs=n_jobs),
             "SVM": SVR(),  # SVR (Support Vector Regression)
-
-            "LassoCV": LassoCV(
-                cv=cv_folds, random_state=random_state
-            ),  # LassoCV automatically finds the best alpha; better than plain Lasso
+            "LassoCV": LassoCV(cv=cv_folds, random_state=random_state, n_jobs=n_jobs),  # LassoCV automatically finds the best alpha; better than plain Lasso
             "Gradient Boosting": GradientBoostingRegressor(random_state=random_state),
-            "XGBoost": xgb.XGBRegressor(eval_metric="rmse", random_state=random_state),
-            "Linear Regression": LinearRegression(),
+            "XGBoost": xgb.XGBRegressor(eval_metric="rmse", random_state=random_state, n_jobs=n_jobs),
+            "Linear Regression": LinearRegression(n_jobs=n_jobs),
             "AdaBoost": AdaBoostRegressor(random_state=random_state),
-
+            "LightGBM": lgb.LGBMRegressor(random_state=random_state, n_jobs=n_jobs, force_row_wise=True),  # Or use force_col_wise=True if memory is a concern
             "CatBoost": cb.CatBoostRegressor(verbose=0, random_state=random_state),
-            "Extra Trees": ExtraTreesRegressor(random_state=random_state),
-            "Bagging": BaggingRegressor(random_state=random_state),
+            "Extra Trees": ExtraTreesRegressor(random_state=random_state, n_jobs=n_jobs),
+            "Bagging": BaggingRegressor(random_state=random_state, n_jobs=n_jobs),
             "Neural Network": MLPRegressor(max_iter=500, random_state=random_state),
             "ElasticNet": ElasticNet(random_state=random_state),
-            "Ridge": Ridge(),
-            "KNN": KNeighborsRegressor(),
-            "TheilSen": TheilSenRegressor(),
+            "Ridge": Ridge(random_state=random_state),
+            "KNN": KNeighborsRegressor(n_jobs=n_jobs),
+            "TheilSen": TheilSenRegressor(n_jobs=n_jobs),
             "Huber": HuberRegressor(),
-            "Poisson": PoissonRegressor()
+            "Poisson": PoissonRegressor(), "LinearRegression": LinearRegression(),
+            "Lasso": Lasso(random_state=random_state),
+            "Lars": Lars(),
+            "LassoLars": LassoLars(),
+            "BayesianRidge": BayesianRidge(),
+            "GammaRegressor": GammaRegressor(),
+            "TweedieRegressor": TweedieRegressor(),
+            "LassoCV": LassoCV(random_state=random_state, n_jobs=n_jobs),
+            "ElasticNetCV": ElasticNetCV(random_state=random_state, n_jobs=n_jobs),
+            "LassoLarsCV": LassoLarsCV(n_jobs=n_jobs),
+            "LarsCV": LarsCV(),
+            "OrthogonalMatchingPursuit": OrthogonalMatchingPursuit(),
+            "OrthogonalMatchingPursuitCV": OrthogonalMatchingPursuitCV(n_jobs=n_jobs),
+            "PassiveAggressiveRegressor": PassiveAggressiveRegressor(random_state=random_state),
+            "LinearSVR": LinearSVR(random_state=random_state),
+            "NuSVR": NuSVR(),
+            "DecisionTreeRegressor": DecisionTreeRegressor(random_state=random_state),
+            "ExtraTreeRegressor": ExtraTreeRegressor(random_state=random_state),
+            "HistGradientBoostingRegressor": HistGradientBoostingRegressor(random_state=random_state),
+            "GaussianProcessRegressor": GaussianProcessRegressor(),
+            "KernelRidge": KernelRidge(),
+            "DummyRegressor": DummyRegressor(),
+            "TransformedTargetRegressor": TransformedTargetRegressor(regressor=LinearRegression())
         }
     if cls is None:
         models = model_
@@ -2407,10 +2479,17 @@ def predict(
         ips.df_special_characters_cleaner(x_true) if x_true is not None else None
     )
 
+    # only keep "autogluon_tab" in models
+    cls = [cls] if isinstance(cls, str) else cls
+
+    if cls is not None:
+        models = {"autogluon_tab": None} if "auto" in cls else models
+
     # indicate cls:
     if ips.run_once_within(30):  # 10 min
         print(f"processing: {list(models.keys())}")
-
+    y_train_col_name = None
+    # print(isinstance(y_train, str) and y_train in x_train.columns)
     if isinstance(y_train, str) and y_train in x_train.columns:
         y_train_col_name = y_train
         y_train = x_train[y_train]
@@ -2418,6 +2497,7 @@ def predict(
         x_train = x_train.drop(y_train_col_name, axis=1)
     # else:
     #     y_train = ips.df_encoder(pd.DataFrame(y_train), method="dummy").values.ravel()
+
     y_train = pd.DataFrame(y_train)
     if y_train.select_dtypes(include=np.number).empty:
         y_train_ = ips.df_encoder(y_train, method="dummy", drop=None)
@@ -2430,6 +2510,9 @@ def predict(
         y_train = ips.df_encoder(pd.DataFrame(y_train), method="label")
     print("is_binary:", is_binary)
 
+    if fill_missing:
+        ips.df_fillna(data=x_train, method="knn", inplace=True, axis=0)
+        ips.df_fillna(data=y_train, method="knn", inplace=True, axis=0)
     # Perform backward feature selection
     if backward:
         selected_features = backward_regression(x_train, y_train, thr=backward_thr)
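`fill_missing=True` now routes the training matrix through KNN-based imputation (`ips.df_fillna(method="knn")`). A sketch of the scikit-learn building block that implements the same idea:

```python
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer

df = pd.DataFrame({"a": [1.0, 2.0, np.nan, 4.0], "b": [10.0, np.nan, 30.0, 40.0]})

# Each missing value is replaced by the mean of that column over the
# k nearest rows, with distances computed on the observed columns.
imputer = KNNImputer(n_neighbors=2)
filled = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
print(filled)
```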
@@ -2458,6 +2541,8 @@ def predict(
             pd.DataFrame(y_train), method="label"
         ).values.ravel()
 
+        if fill_missing:
+            ips.df_fillna(data=x_true, method="knn", inplace=True, axis=0)
         if y_true is not None:
             if isinstance(y_true, str) and y_true in x_true.columns:
                 y_true_col_name = y_true
@@ -2490,11 +2575,16 @@ def predict(
     # Ensure common features are selected
     if common_features is not None:
         x_train, x_true = x_train[common_features], x_true[common_features]
+        share_col_names = common_features
     else:
         share_col_names = ips.shared(x_train.columns, x_true.columns, verbose=verbose)
         x_train, x_true = x_train[share_col_names], x_true[share_col_names]
 
-
+    #! scaler
+    # fit the scaler on x_train and export it to transform x_true
+    x_train, scaler_ = ips.df_scaler(x_train, method=scaler, return_scaler=True)
+    #
+    x_true = ips.df_scaler(x_true, scaler=scaler_)  # make sure the same scaler is used
     x_train, x_true = ips.df_encoder(x_train, method="dummy"), ips.df_encoder(
         x_true, method="dummy"
     )
@@ -2516,18 +2606,261 @@ def predict(
     if isinstance(y_train, np.ndarray):
         y_true = ips.df_encoder(data=pd.DataFrame(y_true), method="label")
         y_true = np.asarray(y_true)
-… (11 removed lines collapsed in the diff view)
+    #! so far, got the: x_train, x_true, y_train, y_true
+    # Grid search with KFold or StratifiedKFold
+    if "autogluon_tab" in models:
+        # load hyper_param
+        f_param = os.path.dirname(os.path.abspath(__file__))
+        f_param = f_param + "/data/hyper_param_autogluon_zeroshot2024.json"
+        with open(f_param, "r") as file:
+            hyper_param_autogluon = json.load(file)
+        # Train the model with AutoGluon
+        features = x_train.columns.tolist()
+        label = y_train_col_name if y_train_col_name is not None else 'target'
+        df_autogluon = x_train.copy()
+        df_autogluon[label] = y_train
+        autogluon_presets = ["best_quality", "good_quality", "fast_train"]
+        best_clf = TabularPredictor(label=label, path=os.path.join(dir_save, "model_autogluon")).fit(
+            train_data=df_autogluon,
+            presets=ips.strcmp(presets, autogluon_presets)[0],  # 'best_quality' or 'good_quality' or 'fast_train'
+            time_limit=time_limit,  # 3600, in sec: limit training time
+            num_bag_folds=num_bag_folds,
+            num_stack_levels=num_stack_levels,
+            hyperparameters=hyper_param_autogluon,
+            verbosity=1 if verbose else 0,
+            **kwargs
+        )
+        #! Get the leaderboard
+        gs = {}
+        # Display the leaderboard for reference
+        leaderboard = best_clf.leaderboard()
+        gs['info'] = best_clf.info()
+        # gs["res"] = best_clf
+        gs["features"] = features
+        gs["leaderboard"] = leaderboard
+        best_model_name = leaderboard.iloc[0, 0]  # First row, first column contains the model name
+        # Store the best model and its details in the gs dictionary
+        gs["best_estimator_"] = best_model_name  # Store the trained model, not just the name
+        gs["best_params_"] = best_model_name  # Hyperparameters
+        # Make predictions if x_true is provided
+        if x_true is not None:
+            x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
+            gs["predictions"] = best_clf.predict(x_true[features], model=None)  # model=None selects the best
+            gs["predict_proba"] = best_clf.predict_proba(x_true[features]) if purpose == 'classification' else None
+            x_true[label] = gs["predictions"]
+            if gs["predictions"].value_counts().shape[0] > 1:
+                gs['evaluate'] = best_clf.evaluate(x_true[features + [label]])
+            gs["models"] = leaderboard["model"].tolist()  # best_clf.model_names()
+            all_models = gs["models"]
+            model_evaluations = {}
+            for model in all_models:
+                predictions = best_clf.predict(x_true[features], model=model)
+                evaluation = best_clf.evaluate_predictions(
+                    y_true=x_true[label],  # True labels
+                    y_pred=predictions,  # Predictions from the specific model
+                    auxiliary_metrics=True,  # Include additional metrics if needed
+                )
+                model_evaluations[model] = evaluation
+            gs["scores"] = pd.DataFrame.from_dict(model_evaluations, orient='index')
+            #! try to keep the same result format
+            results = {}
+            for model in all_models:
+                y_pred = best_clf.predict(x_true[features], model=model).tolist()
+                y_pred_proba = best_clf.predict_proba(x_true[features], model=model) if purpose == 'classification' else None
+
+                if isinstance(y_pred_proba, pd.DataFrame):
+                    y_pred_proba = y_pred_proba.iloc[:, 1]
+
+                # try to make the prediction format consistent
+                try:
+                    y_pred = [i[0] for i in y_pred]
+                except:
+                    pass
+                try:
+                    y_true = [i[0] for i in y_true]
+                except:
+                    pass
+                try:
+                    y_train = [i[0] for i in y_train]
+                except:
+                    pass
+                validation_scores = {}
+                if y_true is not None and y_pred_proba is not None:
+                    validation_scores = cal_metrics(
+                        y_true,
+                        y_pred,
+                        y_pred_proba=y_pred_proba,
+                        is_binary=is_binary,
+                        purpose=purpose,
+                        average="weighted",
+                    )
+                    if is_binary:
+                        # Calculate ROC curve
+                        # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
+                        if y_pred_proba is not None:
+                            # fpr, tpr, roc_auc = dict(), dict(), dict()
+                            fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
+                            lower_ci, upper_ci = cal_auc_ci(
+                                y_true, y_pred_proba, verbose=False, is_binary=is_binary
+                            )
+                            roc_auc = auc(fpr, tpr)
+                            roc_info = {
+                                "fpr": fpr.tolist(),
+                                "tpr": tpr.tolist(),
+                                "auc": roc_auc,
+                                "ci95": (lower_ci, upper_ci),
+                            }
+                            # precision-recall curve
+                            precision_, recall_, _ = cal_precision_recall(y_true, y_pred_proba)
+                            avg_precision_ = average_precision_score(y_true, y_pred_proba)
+                            pr_info = {
+                                "precision": precision_,
+                                "recall": recall_,
+                                "avg_precision": avg_precision_,
+                            }
+                        else:
+                            roc_info, pr_info = None, None
+                        if purpose == "classification":
+                            results[model] = {
+                                # "best_clf": gs.best_estimator_,
+                                # "best_params": gs.best_params_,
+                                # "auc_indiv": [
+                                #     gs.cv_results_[f"split{i}_test_score"][gs.best_index_]
+                                #     for i in range(cv_folds)
+                                # ],
+                                "scores": validation_scores,
+                                "roc_curve": roc_info,
+                                "pr_curve": pr_info,
+                                "confusion_matrix": confusion_matrix(y_true, y_pred),
+                                "predictions": y_pred,  # .tolist(),
+                                "predictions_proba": (
+                                    y_pred_proba.tolist() if y_pred_proba is not None else None
+                                ),
+                                "features": features,
+                                # "coef": coef_,
+                                # "alphas": alphas_
+                            }
+                        else:  # "regression"
+                            results[model] = {
+                                # "best_clf": gs.best_estimator_,
+                                # "best_params": gs.best_params_,
+                                "scores": validation_scores,  # e.g., neg_MSE, R², etc.
+                                "predictions": y_pred,  # .tolist(),
+                                "predictions_proba": (
+                                    y_pred_proba.tolist() if y_pred_proba is not None else None
+                                ),
+                                "features": features,
+                                # "coef": coef_,
+                                # "alphas": alphas_
+                            }
+                    else:  # multi-classes
+                        if y_pred_proba is not None:
+                            # fpr, tpr, roc_auc = dict(), dict(), dict()
+                            # fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
+                            confidence_intervals = cal_auc_ci(
+                                y_true, y_pred_proba, verbose=False, is_binary=is_binary
+                            )
+                            roc_info = {
+                                "fpr": validation_scores["fpr"],
+                                "tpr": validation_scores["tpr"],
+                                "auc": validation_scores["roc_auc_by_class"],
+                                "ci95": confidence_intervals,
+                            }
+                            # precision-recall curve
+                            precision_, recall_, avg_precision_ = cal_precision_recall(
+                                y_true, y_pred_proba, is_binary=is_binary
+                            )
+                            pr_info = {
+                                "precision": precision_,
+                                "recall": recall_,
+                                "avg_precision": avg_precision_,
+                            }
+                        else:
+                            roc_info, pr_info = None, None
+
+                        if purpose == "classification":
+                            results[model] = {
+                                # "best_clf": gs.best_estimator_,
+                                # "best_params": gs.best_params_,
+                                # "auc_indiv": [
+                                #     gs.cv_results_[f"split{i}_test_score"][gs.best_index_]
+                                #     for i in range(cv_folds)
+                                # ],
+                                "scores": validation_scores,
+                                "roc_curve": roc_info,
+                                "pr_curve": pr_info,
+                                "confusion_matrix": confusion_matrix(y_true, y_pred),
+                                "predictions": y_pred,  # .tolist(),
+                                "predictions_proba": (
+                                    y_pred_proba.tolist() if y_pred_proba is not None else None
+                                ),
+                                "features": features,
+                                # "coef": coef_,
+                                # "alphas": alphas_
+                            }
+                        else:  # "regression"
+                            results[model] = {
+                                # "best_clf": gs.best_estimator_,
+                                # "best_params": gs.best_params_,
+                                "scores": validation_scores,  # e.g., neg_MSE, R², etc.
+                                "predictions": y_pred,  # .tolist(),
+                                "predictions_proba": (
+                                    y_pred_proba.tolist() if y_pred_proba is not None else None
+                                ),
+                                "features": features,
+                                # "coef": coef_,
+                                # "alphas": alphas_
+                            }
+
+                else:
+                    if y_true is None:
+                        validation_scores = []
+                    else:
+                        validation_scores = cal_metrics(
+                            y_true,
+                            y_pred,
+                            y_pred_proba=y_pred_proba,
+                            is_binary=is_binary,
+                            purpose=purpose,
+                            average="weighted",
+                        )
+                    results[model] = {
+                        # "best_clf": gs.best_estimator_,
+                        # "best_params": gs.best_params_,
+                        "scores": validation_scores,
+                        "predictions": y_pred,  # .tolist(),
+                        "predictions_proba": (
+                            y_pred_proba.tolist() if y_pred_proba is not None else None
+                        ),
+                        "features": features,
+                        "y_train": y_train if y_train is not None else [],
+                        "y_true": y_true if y_true is not None else [],
+                        # "coef": coef_,
+                        # "alphas": alphas_
                     }
+            df_results = pd.DataFrame.from_dict(results, orient="index")
+            gs['res'] = df_results
+
+            if all([plot_, y_true is not None, purpose == "classification"]):
+                from datetime import datetime
+
+                now_ = datetime.now().strftime("%y%m%d_%H%M%S")
+                # try:
+                if df_results.shape[0] > 3:
+                    try:
+                        plot_validate_features(df_results, is_binary=is_binary)
+                    except Exception as e:
+                        print(e)
+                else:
+                    try:
+                        plot_validate_features_single(df_results, is_binary=is_binary)
+                    except Exception as e:
+                        print(e)
+                if dir_save:
+                    ips.figsave(dir_save + f"validate_features{now_}.pdf")
+        return gs
+
+    #! cross_valid
     if cv_level in ["low", "simple", "s", "l"]:
         param_grids = {
             "Random Forest": (
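The `autogluon_tab` branch wraps AutoGluon's `TabularPredictor`: fit on a frame that contains the label column, rank models via `leaderboard()`, then predict and evaluate per model. A minimal hedged sketch of that flow (file names and the `target` column are illustrative):

```python
import pandas as pd
from autogluon.tabular import TabularPredictor

train = pd.read_csv("train.csv")  # must contain the label column
test = pd.read_csv("test.csv")

predictor = TabularPredictor(label="target", path="model_autogluon").fit(
    train_data=train,
    presets="best_quality",  # or "good_quality" for a faster run
    time_limit=600,          # seconds
)

leaderboard = predictor.leaderboard()
best_model = leaderboard.iloc[0, 0]          # top row holds the best model's name
preds = predictor.predict(test, model=None)  # model=None -> use the best model
print(best_model, preds.head())
```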
@@ -2696,7 +3029,73 @@ def predict(
                 'alpha': [0.1],
                 'max_iter': [100],},
             "Poisson":{'alpha': [0.1],
-                'max_iter': [100],}
+                'max_iter': [100],},
+            "Lars": {"n_nonzero_coefs": [10, 50, None]},
+            "LassoLars": {
+                "alpha": [0.01, 0.1, 1]
+            },
+            "BayesianRidge": {
+                "alpha_1": [1e-6, 1e-4, 1e-2],
+                "lambda_1": [1e-6, 1e-4, 1e-2]
+            },
+            "GammaRegressor": {
+                "alpha": [0.1, 1, 10]
+            },
+            "TweedieRegressor": {
+                "alpha": [0.1, 1, 10],
+                "power": [1, 1.5, 2]
+            },
+            "LassoCV": {
+                "cv": [5]
+            },
+            "ElasticNetCV": {
+                "l1_ratio": [0.2, 0.5, 0.8],
+                "cv": [5]
+            },
+            "LassoLarsCV": {
+                "cv": [5]
+            },
+            "LarsCV": {
+                "cv": [5]
+            },
+            "OrthogonalMatchingPursuit": {
+                "n_nonzero_coefs": [10, 50, None]
+            },
+            "OrthogonalMatchingPursuitCV": {
+                "cv": [5]
+            },
+            "PassiveAggressiveRegressor": {
+                "C": [0.1, 1, 10]
+            },
+            "LinearSVR": {
+                "C": [0.1, 1, 10]
+            },
+            "NuSVR": {
+                "C": [0.1, 1, 10]
+            },
+            "DecisionTreeRegressor": {
+                "max_depth": [5, 10, None]
+            },
+            "ExtraTreeRegressor": {
+                "max_depth": [5, 10, None]
+            },
+            "HistGradientBoostingRegressor": {
+                "learning_rate": [0.05, 0.1, 0.2],
+                "max_depth": [5, 10, None]
+            },
+            "GaussianProcessRegressor": {
+                "alpha": [1e-5, 1e-2, 0.1]
+            },
+            "KernelRidge": {
+                "alpha": [0.1, 1, 10],
+                "kernel": ["linear", "rbf"]
+            },
+            "DummyRegressor": {
+                "strategy": ["mean", "median"]
+            },
+            "TransformedTargetRegressor": {
+                "regressor__fit_intercept": [True, False]
+            }
         }
     elif cv_level in ["high", "advanced", "h"]:
         param_grids = {
@@ -2901,7 +3300,96 @@ def predict(
                 'alpha': [0.1, 1.0, 10.0],
                 'max_iter': [100, 200, 300],},
             "Poisson":{'alpha': [0.1, 1.0, 10.0],
-                'max_iter': [100, 200, 300],}
+                'max_iter': [100, 200, 300],},
+            "Lars": {
+                "n_nonzero_coefs": [10, 50, 100, 200, None]
+            },
+            "LassoLars": {
+                "alpha": [0.001, 0.01, 0.1, 1, 10]
+            },
+            "BayesianRidge": {
+                "alpha_1": [1e-6, 1e-5, 1e-4],
+                "alpha_2": [1e-6, 1e-5, 1e-4],
+                "lambda_1": [1e-6, 1e-5, 1e-4],
+                "lambda_2": [1e-6, 1e-5, 1e-4]
+            },
+            "GammaRegressor": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "max_iter": [1000, 5000, 10000]
+            },
+            "TweedieRegressor": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "power": [0, 1, 1.5, 2, 3]
+            },
+            "LassoCV": {
+                "alphas": [[0.001, 0.01, 0.1, 1, 10]],
+                "cv": [3, 5, 10]
+            },
+            "ElasticNetCV": {
+                "l1_ratio": [0.1, 0.5, 0.7, 0.9, 1],
+                "alphas": [[0.001, 0.01, 0.1, 1, 10]],
+                "cv": [3, 5, 10]
+            },
+            "LassoLarsCV": {
+                "cv": [3, 5, 10]
+            },
+            "LarsCV": {
+                "cv": [3, 5, 10]
+            },
+            "OrthogonalMatchingPursuit": {
+                "n_nonzero_coefs": [10, 50, 100, 200, None]
+            },
+            "OrthogonalMatchingPursuitCV": {
+                "cv": [3, 5, 10]
+            },
+            "PassiveAggressiveRegressor": {
+                "C": [0.01, 0.1, 1, 10],
+                "max_iter": [1000, 5000, 10000],
+                "early_stopping": [True, False]
+            },
+            "LinearSVR": {
+                "C": [0.01, 0.1, 1, 10],
+                "epsilon": [0.01, 0.1, 1],
+                "max_iter": [1000, 5000, 10000]
+            },
+            "NuSVR": {
+                "C": [0.01, 0.1, 1, 10],
+                "nu": [0.25, 0.5, 0.75],
+                "kernel": ["linear", "poly", "rbf", "sigmoid"]
+            },
+            "DecisionTreeRegressor": {
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_split": [2, 5, 10],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "ExtraTreeRegressor": {
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_split": [2, 5, 10],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "HistGradientBoostingRegressor": {
+                "learning_rate": [0.01, 0.1, 0.2],
+                "max_iter": [100, 500, 1000],
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "GaussianProcessRegressor": {
+                "alpha": [1e-10, 1e-5, 1e-2, 0.1],
+                "n_restarts_optimizer": [0, 1, 5, 10]
+            },
+            "KernelRidge": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "kernel": ["linear", "poly", "rbf", "sigmoid"],
+                "degree": [2, 3, 4]
+            },
+            "DummyRegressor": {
+                "strategy": ["mean", "median", "constant"],
+                "constant": [0]  # Only if strategy is 'constant'
+            },
+            "TransformedTargetRegressor": {
+                # Grid for the underlying regressor, example shown for LinearRegression
+                "regressor__fit_intercept": [True, False]
+            }
         }
     else:  # median level
         param_grids = {
@@ -3148,7 +3636,96 @@ def predict(
                 'alpha': [0.1, 1.0],
                 'max_iter': [100, 200],},
             "Poisson":{'alpha': [0.1, 1.0],
-                'max_iter': [100, 200],}
+                'max_iter': [100, 200],},
+            "Lars": {
+                "n_nonzero_coefs": [10, 50, 100, 200, None]
+            },
+            "LassoLars": {
+                "alpha": [0.001, 0.01, 0.1, 1, 10]
+            },
+            "BayesianRidge": {
+                "alpha_1": [1e-6, 1e-5, 1e-4],
+                "alpha_2": [1e-6, 1e-5, 1e-4],
+                "lambda_1": [1e-6, 1e-5, 1e-4],
+                "lambda_2": [1e-6, 1e-5, 1e-4]
+            },
+            "GammaRegressor": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "max_iter": [1000, 5000, 10000]
+            },
+            "TweedieRegressor": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "power": [0, 1, 1.5, 2, 3]
+            },
+            "LassoCV": {
+                "alphas": [[0.001, 0.01, 0.1, 1, 10]],
+                "cv": [3, 5, 10]
+            },
+            "ElasticNetCV": {
+                "l1_ratio": [0.1, 0.5, 0.7, 0.9, 1],
+                "alphas": [[0.001, 0.01, 0.1, 1, 10]],
+                "cv": [3, 5, 10]
+            },
+            "LassoLarsCV": {
+                "cv": [3, 5, 10]
+            },
+            "LarsCV": {
+                "cv": [3, 5, 10]
+            },
+            "OrthogonalMatchingPursuit": {
+                "n_nonzero_coefs": [10, 50, 100, 200, None]
+            },
+            "OrthogonalMatchingPursuitCV": {
+                "cv": [3, 5, 10]
+            },
+            "PassiveAggressiveRegressor": {
+                "C": [0.01, 0.1, 1, 10],
+                "max_iter": [1000, 5000, 10000],
+                "early_stopping": [True, False]
+            },
+            "LinearSVR": {
+                "C": [0.01, 0.1, 1, 10],
+                "epsilon": [0.01, 0.1, 1],
+                "max_iter": [1000, 5000, 10000]
+            },
+            "NuSVR": {
+                "C": [0.01, 0.1, 1, 10],
+                "nu": [0.25, 0.5, 0.75],
+                "kernel": ["linear", "poly", "rbf", "sigmoid"]
+            },
+            "DecisionTreeRegressor": {
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_split": [2, 5, 10],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "ExtraTreeRegressor": {
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_split": [2, 5, 10],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "HistGradientBoostingRegressor": {
+                "learning_rate": [0.01, 0.1, 0.2],
+                "max_iter": [100, 500, 1000],
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "GaussianProcessRegressor": {
+                "alpha": [1e-10, 1e-5, 1e-2, 0.1],
+                "n_restarts_optimizer": [0, 1, 5, 10]
+            },
+            "KernelRidge": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "kernel": ["linear", "poly", "rbf", "sigmoid"],
+                "degree": [2, 3, 4]
+            },
+            "DummyRegressor": {
+                "strategy": ["mean", "median", "constant"],
+                "constant": [0]  # Only if strategy is 'constant'
+            },
+            "TransformedTargetRegressor": {
+                # Grid for the underlying regressor, example shown for LinearRegression
+                "regressor__fit_intercept": [True, False]
+            }
         }
 
     results = {}
@@ -3158,7 +3735,7 @@ def predict(
         if purpose == "classification"
         else KFold(n_splits=cv_folds, shuffle=True, random_state=random_state)
     )
-
+
     # Train and validate each model
     for name, clf in tqdm(
         models.items(),
@@ -3168,83 +3745,132 @@ def predict(
     ):
         if verbose:
             print(f"\nTraining and validating {name}:")
-… (old lines 3171-3183 collapsed in the diff view)
-            verbose=verbose,
-        )
-
-        gs.fit(x_train, y_train)
-        best_clf = gs.best_estimator_
-        # make sure x_train and x_test have the same columns
-        x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
-        y_pred = best_clf.predict(x_true)
-        if hasattr(best_clf, "predict_proba"):
-            y_pred_proba = best_clf.predict_proba(x_true)
-            print("Shape of predicted probabilities:", y_pred_proba.shape)
-            if y_pred_proba.shape[1] == 1:
-                y_pred_proba = np.hstack(
-                    [1 - y_pred_proba, y_pred_proba]
-                )  # Add missing class probabilities
-            y_pred_proba = y_pred_proba[:, 1]
-        elif hasattr(best_clf, "decision_function"):
-            # If predict_proba is not available, use decision_function (e.g., for SVM)
-            y_pred_proba = best_clf.decision_function(x_true)
-            # Ensure y_pred_proba is within 0 and 1 bounds
-            y_pred_proba = (y_pred_proba - y_pred_proba.min()) / (
-                y_pred_proba.max() - y_pred_proba.min()
-            )
-        else:
-            y_pred_proba = None  # No probability output for certain models
-        else:
-            gs = GridSearchCV(
-                clf,
-                param_grid=param_grids.get(name, {}),
-                scoring=(
-                    "roc_auc_ovr"
-                    if purpose == "classification"
-                    else "neg_mean_squared_error"
-                ),
-                cv=cv,
-                n_jobs=n_jobs,
-                verbose=verbose,
-            )
-… (old lines 3223-3246 collapsed in the diff view)
+        try:
+            if is_binary:
+                gs = GridSearchCV(
+                    clf,
+                    param_grid=param_grids.get(name, {}),
+                    scoring=(
+                        "roc_auc"
+                        if purpose == "classification"
+                        else "neg_mean_squared_error"
+                    ),
+                    cv=cv,
+                    n_jobs=n_jobs,
+                    verbose=verbose,
+                )
+
+                gs.fit(x_train, y_train)
+                best_clf = gs.best_estimator_
+
+                # make sure x_train and x_test have the same columns
+                x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
+                y_pred = best_clf.predict(x_true)
+                if hasattr(best_clf, "predict_proba"):
+                    y_pred_proba = best_clf.predict_proba(x_true)
+                    print("Shape of predicted probabilities:", y_pred_proba.shape)
+                    if y_pred_proba.shape[1] == 1:
+                        y_pred_proba = np.hstack(
+                            [1 - y_pred_proba, y_pred_proba]
+                        )  # Add missing class probabilities
+                    if y_pred_proba.shape[1] == 2:
+                        if isinstance(y_pred_proba, pd.DataFrame):
+                            y_pred_proba = y_pred_proba.iloc[:, 1]
+                        elif isinstance(y_pred_proba, pd.Series):
+                            y_pred_proba = y_pred_proba.values[:, 1]
+                        else:
+                            y_pred_proba = y_pred_proba[:, 1]
+
+                elif hasattr(best_clf, "decision_function"):
+                    # If predict_proba is not available, use decision_function (e.g., for SVM)
+                    y_pred_proba = best_clf.decision_function(x_true)
+                    # Ensure y_pred_proba is within 0 and 1 bounds
+                    y_pred_proba = (y_pred_proba - y_pred_proba.min()) / (
+                        y_pred_proba.max() - y_pred_proba.min()
+                    )
+                else:
+                    y_pred_proba = None  # No probability output for certain models
+                # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+                if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                    if hasattr(best_clf, "alphas_"):
+                        alphas_ = best_clf.alphas_
+                    elif hasattr(best_clf, "alpha_"):
+                        alphas_ = best_clf.alpha_
+                    elif hasattr(best_clf, "Cs_"):
+                        alphas_ = best_clf.Cs_
+                else:
+                    alphas_ = None
+                coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
+            else:
+                gs = GridSearchCV(
+                    clf,
+                    param_grid=param_grids.get(name, {}),
+                    scoring=(
+                        "roc_auc_ovr"
+                        if purpose == "classification"
+                        else "neg_mean_squared_error"
+                    ),
+                    cv=cv,
+                    n_jobs=n_jobs,
+                    verbose=verbose,
+                )
+
+                # Fit GridSearchCV
+                gs.fit(x_train, y_train)
+                best_clf = gs.best_estimator_
+
+                # Ensure x_true aligns with x_train columns
+                x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
+
+                # do i need to fit the x_train, y_train again?
+                best_clf = best_clf.fit(x_train, y_train)
+                y_pred = best_clf.predict(x_true)
+
+                # Handle prediction probabilities for multiclass
+                if hasattr(best_clf, "predict_proba"):
+                    y_pred_proba = best_clf.predict_proba(x_true)
+                elif hasattr(best_clf, "decision_function"):
+                    y_pred_proba = best_clf.decision_function(x_true)
+
+                    # Normalize for multiclass if necessary
+                    if y_pred_proba.ndim == 2:
+                        y_pred_proba = (
+                            y_pred_proba - y_pred_proba.min(axis=1, keepdims=True)
+                        ) / (
+                            y_pred_proba.max(axis=1, keepdims=True)
+                            - y_pred_proba.min(axis=1, keepdims=True)
+                        )
+                else:
+                    y_pred_proba = None  # No probability output for certain models
+                # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+                if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                    if hasattr(best_clf, "alphas_"):
+                        alphas_ = best_clf.alphas_
+                    elif hasattr(best_clf, "alpha_"):
+                        alphas_ = best_clf.alpha_
+                    elif hasattr(best_clf, "Cs_"):
+                        alphas_ = best_clf.Cs_
+                else:
+                    alphas_ = None
+                coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
+        except Exception as e:
+            alphas_, coef_ = None, None
+            print(f"skipped {clf}: {e}")
+            continue
+
+        # try to make the prediction format consistent
+        try:
+            y_pred = [i[0] for i in y_pred]
+        except:
+            pass
+        try:
+            y_true = [i[0] for i in y_true]
+        except:
+            pass
+        try:
+            y_train = [i[0] for i in y_train]
+        except:
+            pass
         validation_scores = {}
 
         if y_true is not None and y_pred_proba is not None:
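The reworked loop wraps each estimator in `GridSearchCV` and, when `predict_proba` is unavailable, falls back to `decision_function` min-max-rescaled into [0, 1], the same normalization the hunk shows. A compact sketch:

```python
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

X, y = make_classification(n_samples=200, n_features=8, random_state=1)

gs = GridSearchCV(
    SVC(),  # no predict_proba unless constructed with probability=True
    param_grid={"C": [0.1, 1, 10]},
    scoring="roc_auc",
    cv=5,
)
gs.fit(X, y)
best_clf = gs.best_estimator_

if hasattr(best_clf, "predict_proba"):
    scores = best_clf.predict_proba(X)[:, 1]
else:
    d = best_clf.decision_function(X)             # unbounded margins
    scores = (d - d.min()) / (d.max() - d.min())  # squash into [0, 1]
print(scores[:5])
```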
@@ -3294,20 +3920,26 @@ def predict(
                     "roc_curve": roc_info,
                     "pr_curve": pr_info,
                     "confusion_matrix": confusion_matrix(y_true, y_pred),
-                    "predictions": y_pred
+                    "predictions": y_pred,  # .tolist(),
                     "predictions_proba": (
                         y_pred_proba.tolist() if y_pred_proba is not None else None
                     ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
                 }
             else:  # "regression"
                 results[name] = {
                     "best_clf": gs.best_estimator_,
                     "best_params": gs.best_params_,
                     "scores": validation_scores,  # e.g., neg_MSE, R², etc.
-                    "predictions": y_pred
+                    "predictions": y_pred,  # .tolist(),
                     "predictions_proba": (
                         y_pred_proba.tolist() if y_pred_proba is not None else None
                     ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
                 }
         else:  # multi-classes
             if y_pred_proba is not None:
@@ -3346,20 +3978,26 @@ def predict(
                     "roc_curve": roc_info,
                     "pr_curve": pr_info,
                     "confusion_matrix": confusion_matrix(y_true, y_pred),
-                    "predictions": y_pred
+                    "predictions": y_pred,  # .tolist(),
                     "predictions_proba": (
                         y_pred_proba.tolist() if y_pred_proba is not None else None
                     ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
                 }
             else:  # "regression"
                 results[name] = {
                     "best_clf": gs.best_estimator_,
                     "best_params": gs.best_params_,
                     "scores": validation_scores,  # e.g., neg_MSE, R², etc.
-                    "predictions": y_pred
+                    "predictions": y_pred,  # .tolist(),
                     "predictions_proba": (
                         y_pred_proba.tolist() if y_pred_proba is not None else None
                     ),
+                    "features": share_col_names,
+                    "coef": coef_,
+                    "alphas": alphas_
                 }
 
     else:
@@ -3378,17 +4016,21 @@ def predict(
             "best_clf": gs.best_estimator_,
             "best_params": gs.best_params_,
             "scores": validation_scores,
-            "predictions": y_pred
+            "predictions": y_pred,  # .tolist()
             "predictions_proba": (
                 y_pred_proba.tolist() if y_pred_proba is not None else None
             ),
+            "features": share_col_names,
             "y_train": y_train if y_train is not None else [],
             "y_true": y_true if y_true is not None else [],
+            "coef": coef_,
+            "alphas": alphas_
         }

     # Convert results to DataFrame
     df_results = pd.DataFrame.from_dict(results, orient="index")
-
+    display(df_results)
+    # sort
     if y_true is not None:
         if purpose == "classification":
             df_scores = pd.DataFrame(
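`pd.DataFrame.from_dict(results, orient="index")` turns the per-model dicts into a table with one row per model, which is what `display(df_results)` and the later sorting operate on. A toy sketch (field names here are hypothetical stand-ins with the same shape as the real dict):

    import pandas as pd

    results = {
        "RandomForest": {"best_params": {"n_estimators": 100}, "scores": {"roc_auc": 0.91}},
        "LogisticRegression": {"best_params": {"C": 1.0}, "scores": {"roc_auc": 0.88}},
    }
    df_results = pd.DataFrame.from_dict(results, orient="index")
    print(df_results)  # one row per model, one column per result field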
@@ -3446,7 +4088,7 @@ def predict(
         for i, j in top_models.to_dict().items():
             base_estimators.append((i, j))
         if stacking_cv:
-            print(f"
+            print(f"⤵ stacking_cv is processing...")
             #* define a few representative final_estimator options
             # several candidates to choose from
             if purpose == "classification":
@@ -3520,7 +4162,7 @@ def predict(
             best_final_estimator = cv_results_df.iloc[0]['final_estimator']
             print(f"Best final estimator based on cross-validation: {best_final_estimator}")
         else:
-            print(f"
+            print(f"⤵ trying to find the best_final_estimator for stacking...")
             if purpose == "classification":
                 best_final_estimator = LogisticRegression(class_weight=class_weight,
                                                           random_state=random_state,
@@ -3530,26 +4172,25 @@ def predict(
         print(f"⤵ the best final_estimator: {best_final_estimator}")
         #! apply stacking
         if purpose == "classification":
-            print(f"
+            print(f"⤵ StackingClassifier...")
             stacking_model = StackingClassifier(estimators=base_estimators,
                                                 final_estimator=best_final_estimator,
                                                 cv=cv)
         else:
-            print(f"
+            print(f"⤵ StackingRegressor...")
             stacking_model = StackingRegressor(estimators=base_estimators,
                                                final_estimator=best_final_estimator,
                                                cv=cv)

         # Train the stacking classifier/regressor
-        print(f"
+        print(f"⤵ fit & predict...")
         stacking_model.fit(x_train, y_train)
         y_pred_final = stacking_model.predict(x_true)
-        print(f"
+        print(f"⤵ collecting results...")
         # pred_proba
         if is_binary:
             if hasattr(stacking_model, "predict_proba"):
                 y_pred_proba_final = stacking_model.predict_proba(x_true)
-                print("Shape of predicted probabilities:", y_pred_proba_final.shape)
                 if y_pred_proba_final.shape[1] == 1:
                     y_pred_proba_final = np.hstack(
                         [1 - y_pred_proba_final, y_pred_proba_final]
@@ -3564,6 +4205,17 @@ def predict(
                     )
             else:
                 y_pred_proba_final = None  # No probability output for certain models
+            # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+            if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                if hasattr(best_clf, "alphas_"):
+                    alphas_ = best_clf.alphas_
+                elif hasattr(best_clf, "alpha_"):
+                    alphas_ = best_clf.alpha_
+                elif hasattr(best_clf, "Cs_"):
+                    alphas_ = best_clf.Cs_
+            else:
+                alphas_ = None
+            coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
         if not is_binary:
             # Handle prediction probabilities for multiclass
             if hasattr(stacking_model, "predict_proba"):
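When `predict_proba` returns a single column, the code above pads it to the usual two-column layout with `np.hstack`. The same trick in isolation:

    import numpy as np

    p = np.array([[0.8], [0.3], [0.5]])  # P(class 1) only, shape (n, 1)
    proba = np.hstack([1 - p, p])        # columns become [P(class 0), P(class 1)]
    print(proba.shape)                   # (3, 2)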
@@ -3581,6 +4233,17 @@ def predict(
                     )
             else:
                 y_pred_proba_final = None  # No probability output for certain models
+            # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+            if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                if hasattr(best_clf, "alphas_"):
+                    alphas_ = best_clf.alphas_
+                elif hasattr(best_clf, "alpha_"):
+                    alphas_ = best_clf.alpha_
+                elif hasattr(best_clf, "Cs_"):
+                    alphas_ = best_clf.Cs_
+            else:
+                alphas_ = None
+            coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
         #! dict_pred_stack
         dict_pred_stack = {}
         validation_scores_final = {}
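For reference, the stacking step boils down to the standard scikit-learn pattern: base estimators are fit on cross-validated folds and a final estimator learns from their out-of-fold predictions. A self-contained sketch on toy data (the estimator choices here are illustrative, not the ones py2ls selects):

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier, StackingClassifier
    from sklearn.linear_model import LogisticRegression

    X, y = make_classification(n_samples=200, n_features=8, random_state=1)
    base_estimators = [
        ("rf", RandomForestClassifier(random_state=1)),
        ("lr", LogisticRegression(max_iter=1000)),
    ]
    stacking_model = StackingClassifier(
        estimators=base_estimators,
        final_estimator=LogisticRegression(),  # plays the role of best_final_estimator
        cv=5,
    )
    stacking_model.fit(X, y)
    print(stacking_model.predict_proba(X[:3]))  # per-class probabilities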
@@ -3631,6 +4294,9 @@ def predict(
                 "predictions_proba": (
                     y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
                 ),
+                "features": share_col_names,
+                "coef": coef_,
+                "alphas": alphas_
             }
         else:  # "regression"
             dict_pred_stack = {
@@ -3641,6 +4307,9 @@ def predict(
                 "predictions_proba": (
                     y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
                 ),
+                "features": share_col_names,
+                "coef": coef_,
+                "alphas": alphas_
             }
     else:  # multi-classes
         if y_pred_proba_final is not None:
@@ -3680,6 +4349,9 @@ def predict(
                 "predictions_proba": (
                     y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
                 ),
+                "features": share_col_names,
+                "coef": coef_,
+                "alphas": alphas_
             }
         else:  # "regression"
             dict_pred_stack = {
@@ -3690,6 +4362,9 @@ def predict(
                 "predictions_proba": (
                     y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
                 ),
+                "features": share_col_names,
+                "coef": coef_,
+                "alphas": alphas_
             }

     else:
@@ -3712,8 +4387,11 @@ def predict(
             "predictions_proba": (
                 y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
             ),
+            "features": share_col_names,
             "y_train": y_train if y_train is not None else [],
             "y_true": y_true if y_true is not None else [],
+            "coef": coef_,
+            "alphas": alphas_
         }
     # merge together
     df_pred = pd.DataFrame(
@@ -3728,16 +4406,16 @@ def predict(
     # if dir_save:
     #     ips.figsave(dir_save + f"validate_features_stacking_{now_}.pdf")
     if vote:
-        print(f"
+        print(f"⤵ voting...")
         from sklearn.ensemble import VotingClassifier, VotingRegressor
-        #!
+        #! voting
         n_top_models = min(n_top_models, df_results.shape[0])
         base_estimators = []
         for name, cls in zip(list(df_results.iloc[:n_top_models, :].index), df_results.iloc[:n_top_models, :]["best_clf"].tolist()):
             base_estimators.append((name, cls))
         # Apply Voting Classifier/Regressor
         if purpose == "classification":
-            print(f"
+            print(f"⤵ VotingClassifier... via {voting}")
             if voting == 'hard':
                 # Hard voting does not support `predict_proba`
                 voting_model = VotingClassifier(estimators=base_estimators)
@@ -3745,7 +4423,7 @@ def predict(
                 # Soft voting supports `predict_proba`
                 voting_model = VotingClassifier(estimators=base_estimators, voting="soft")
         else:
-            print(f"
+            print(f"⤵ VotingRegressor...")
             voting_model = VotingRegressor(estimators=base_estimators)

         # Train the Voting Classifier/Regressor
@@ -3770,10 +4448,23 @@ def predict(
                     y_pred_proba_vote = y_pred_proba_vote[:, 1]
             else:
                 y_pred_proba_vote = None
+
+            # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+            if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                if hasattr(best_clf, "alphas_"):
+                    alphas_ = best_clf.alphas_
+                elif hasattr(best_clf, "alpha_"):
+                    alphas_ = best_clf.alpha_
+                elif hasattr(best_clf, "Cs_"):
+                    alphas_ = best_clf.Cs_
+            else:
+                alphas_ = None
+            coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
         else:  # Regression
             y_pred_proba_vote = None
+            coef_, alphas_ = None, None

-        print(f"
+        print(f"⤵ collecting voting results...")
         #! dict_pred_vote
         dict_pred_vote = {}
         validation_scores_vote = {}
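The hard/soft distinction above follows scikit-learn's semantics: hard voting takes the majority label and offers no `predict_proba`, while soft voting averages the base models' probabilities. A toy illustration:

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier, VotingClassifier
    from sklearn.linear_model import LogisticRegression

    X, y = make_classification(n_samples=200, n_features=8, random_state=1)
    estimators = [("lr", LogisticRegression(max_iter=1000)),
                  ("rf", RandomForestClassifier(random_state=1))]

    hard = VotingClassifier(estimators=estimators).fit(X, y)  # voting="hard" is the default
    soft = VotingClassifier(estimators=estimators, voting="soft").fit(X, y)
    print(hard.predict(X[:3]))        # majority labels; hard voting has no predict_proba
    print(soft.predict_proba(X[:3]))  # averaged class probabilities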
@@ -3822,6 +4513,9 @@ def predict(
                 "predictions_proba": (
                     y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
                 ),
+                "features": share_col_names,
+                "coef": coef_,
+                "alphas": alphas_
             }
         else:  # Multi-class
             if y_pred_proba_vote is not None:
@@ -3856,6 +4550,9 @@ def predict(
                 "predictions_proba": (
                     y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
                 ),
+                "features": share_col_names,
+                "coef": coef_,
+                "alphas": alphas_
             }
     else:
         if y_true is None:
@@ -3877,6 +4574,7 @@ def predict(
             "predictions_proba": (
                 y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
             ),
+            "features": share_col_names,
             "y_train": y_train if y_train is not None else [],
             "y_true": y_true if y_true is not None else [],
         }
@@ -3900,6 +4598,8 @@ def predict(
         df_res = pd.concat([df_vote, df_results], ignore_index=False, axis=0)
     elif stack:
         df_res = pd.concat([df_pred, df_results], ignore_index=False, axis=0)
+    else:
+        df_res = df_results

     if all([plot_, y_true is not None, purpose == "classification"]):
         from datetime import datetime
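The new `else` branch guarantees `df_res` exists even when neither voting nor stacking ran; otherwise the ensemble rows are simply stacked on top of the per-model table with `pd.concat`. A toy stand-in:

    import pandas as pd

    df_vote = pd.DataFrame({"scores": [0.92]}, index=["vote"])
    df_results = pd.DataFrame({"scores": [0.91, 0.88]},
                              index=["RandomForest", "LogisticRegression"])
    df_res = pd.concat([df_vote, df_results], ignore_index=False, axis=0)
    print(df_res)  # ensemble row first, then the individual models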
@@ -3907,9 +4607,15 @@ def predict(
         now_ = datetime.now().strftime("%y%m%d_%H%M%S")
         # try:
         if df_res.shape[0] > 3:
-            plot_validate_features(df_res, is_binary=is_binary)
+            try:
+                plot_validate_features(df_res, is_binary=is_binary)
+            except Exception as e:
+                print(e)
         else:
-            plot_validate_features_single(df_res, is_binary=is_binary)
+            try:
+                plot_validate_features_single(df_res, is_binary=is_binary)
+            except Exception as e:
+                print(e)
         if dir_save:
             ips.figsave(dir_save + f"validate_features{now_}.pdf")
         # except Exception as e:
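The try/except wrappers added above make figure generation best-effort: a plotting failure is printed and the run continues on to `figsave`. The guard pattern in isolation (with a hypothetical callable standing in for plot_validate_features):

    def safe_plot(plot_fn, *args, **kwargs):
        # best-effort plotting: report errors instead of aborting the pipeline
        try:
            plot_fn(*args, **kwargs)
        except Exception as e:
            print(e)

    safe_plot(print, "stand-in for plot_validate_features(df_res, ...)")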