py2ls 0.2.4.24__py3-none-any.whl → 0.2.4.26__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- py2ls/.DS_Store +0 -0
- py2ls/.git/index +0 -0
- py2ls/corr.py +475 -0
- py2ls/data/.DS_Store +0 -0
- py2ls/data/hyper_param_autogluon_zeroshot2024.json +2383 -0
- py2ls/data/styles/.DS_Store +0 -0
- py2ls/data/styles/example/.DS_Store +0 -0
- py2ls/data/usages_sns.json +6 -1
- py2ls/ec2ls.py +61 -0
- py2ls/ips.py +496 -138
- py2ls/ml2ls.py +994 -288
- py2ls/netfinder.py +16 -20
- py2ls/nl2ls.py +283 -0
- py2ls/plot.py +1244 -158
- {py2ls-0.2.4.24.dist-info → py2ls-0.2.4.26.dist-info}/METADATA +5 -1
- {py2ls-0.2.4.24.dist-info → py2ls-0.2.4.26.dist-info}/RECORD +17 -14
- py2ls/data/usages_pd copy.json +0 -1105
- py2ls/ml2ls copy.py +0 -2906
- {py2ls-0.2.4.24.dist-info → py2ls-0.2.4.26.dist-info}/WHEEL +0 -0
py2ls/ml2ls.py
CHANGED
@@ -31,6 +31,7 @@ from sklearn.metrics import (
     average_precision_score,
 )
 from typing import Dict, Any, Optional, List, Union
+import os, json
 import numpy as np
 import pandas as pd
 from . import ips
@@ -49,7 +50,13 @@ logger = logging.getLogger()
 warnings.filterwarnings("ignore", category=UserWarning)
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.neighbors import KNeighborsClassifier
-
+#* set random_state global
+import torch
+import random
+random_state=1
+random.seed(random_state)
+np.random.seed(random_state)
+torch.manual_seed(random_state)
 
 def features_knn(
     x_train: pd.DataFrame, y_train: pd.Series, knn_params: dict
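The hunk above pins `random`, NumPy, and torch to one module-level seed at import time. A minimal sketch of the same idea, assuming torch is installed; the `set_seed` helper name is illustrative and not part of py2ls:

import random

import numpy as np
import torch


def set_seed(seed: int = 1) -> None:
    # Seed all three RNG sources the module draws from, so feature
    # selection and model fitting are repeatable across runs.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)


set_seed(1)
print(np.random.rand(3))  # identical triple on every fresh interpreter run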
@@ -594,7 +601,7 @@ def get_features(
     """
     from sklearn.compose import ColumnTransformer
     from sklearn.preprocessing import StandardScaler, OneHotEncoder
-
+    from sklearn.model_selection import train_test_split
     # Ensure X and y are DataFrames/Series for consistency
     if isinstance(X, np.ndarray):
         X = pd.DataFrame(X)
@@ -922,10 +929,26 @@ def get_features(
         "feature_importances": feature_importances,
     }
     if all([plot_, dir_save]):
+
         from datetime import datetime
-
         now_ = datetime.now().strftime("%y%m%d_%H%M%S")
         ips.figsave(dir_save + f"features{now_}.pdf")
+
+        lists = []
+        for tp in ips.flatten(features_df["type"]):
+            lists.append(
+                features_df
+                .loc[features_df["type"] == tp, "feature"]
+                .tolist()
+            )
+        labels = ips.flatten(features_df["type"])
+        # current_fig = plt.gcf()
+        # # ax = current_fig.add_subplot(3, 2, 6)
+        # gs = current_fig.add_gridspec(3, 2)
+        # ax = current_fig.add_subplot(gs[:, :])
+        plt.figure(figsize=[6,6])
+        plot.venn(lists, labels, cmap="coolwarm")
+        ips.figsave(dir_save + f"features{now_}shared_features.pdf")
    else:
         results = {
             "selected_features": pd.DataFrame(),
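The new block groups the selected features by selector type and saves a Venn diagram of their overlap through py2ls's own `plot.venn`. A library-agnostic sketch of the underlying set logic; the `features_df` here is a stand-in with the same two columns the function builds:

import pandas as pd

# Stand-in for the features_df built by get_features: one row per
# (selector type, feature) pair.
features_df = pd.DataFrame({
    "type": ["knn", "knn", "lasso", "lasso", "tree"],
    "feature": ["age", "bmi", "bmi", "glucose", "bmi"],
})

# One feature set per selector type, mirroring the lists/labels that
# the diff feeds into plot.venn.
sets = {tp: set(g["feature"]) for tp, g in features_df.groupby("type")}
shared = set.intersection(*sets.values())
print(sets)    # {'knn': {'age', 'bmi'}, 'lasso': {'bmi', 'glucose'}, 'tree': {'bmi'}}
print(shared)  # {'bmi'} — the features every selector agrees on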
@@ -1247,22 +1270,25 @@ def plot_validate_features(res_val, is_binary=True, figsize=None):
     nexttile = plot.subplot(figsize=figsize)
     ax = nexttile(subplot_layout[0], subplot_layout[1])
     for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-        fpr = res_val["roc_curve"][model_name]["fpr"]
-        tpr = res_val["roc_curve"][model_name]["tpr"]
-        (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
-        mean_auc = res_val["roc_curve"][model_name]["auc"]
-        plot_roc_curve(
-            fpr,
-            tpr,
-            mean_auc,
-            lower_ci,
-            upper_ci,
-            model_name=model_name,
-            lw=1.5,
-            color=colors[i],
-            alpha=alpha,
-            ax=ax,
-        )
+        try:
+            fpr = res_val["roc_curve"][model_name]["fpr"]
+            tpr = res_val["roc_curve"][model_name]["tpr"]
+            (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
+            mean_auc = res_val["roc_curve"][model_name]["auc"]
+            plot_roc_curve(
+                fpr,
+                tpr,
+                mean_auc,
+                lower_ci,
+                upper_ci,
+                model_name=model_name,
+                lw=1.5,
+                color=colors[i],
+                alpha=alpha,
+                ax=ax,
+            )
+        except Exception as e:
+            print(e)
     plot.figsets(
         sp=2,
         legend=dict(
@@ -1277,16 +1303,19 @@ def plot_validate_features(res_val, is_binary=True, figsize=None):
 
     ax = nexttile(subplot_layout[0], subplot_layout[1])
     for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-        plot_pr_curve(
-            recall=res_val["pr_curve"][model_name]["recall"],
-            precision=res_val["pr_curve"][model_name]["precision"],
-            avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
-            model_name=model_name,
-            color=colors[i],
-            lw=1.5,
-            alpha=alpha,
-            ax=ax,
-        )
+        try:
+            plot_pr_curve(
+                recall=res_val["pr_curve"][model_name]["recall"],
+                precision=res_val["pr_curve"][model_name]["precision"],
+                avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
+                model_name=model_name,
+                color=colors[i],
+                lw=1.5,
+                alpha=alpha,
+                ax=ax,
+            )
+        except Exception as e:
+            print(e)
     plot.figsets(
         sp=2,
         legend=dict(
@@ -1314,22 +1343,25 @@ def plot_validate_features(res_val, is_binary=True, figsize=None):
     for iclass, class_ in enumerate(classes):
         ax = nexttile(subplot_layout[0], subplot_layout[1])
         for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-            fpr = res_val["roc_curve"][model_name]["fpr"][class_]
-            tpr = res_val["roc_curve"][model_name]["tpr"][class_]
-            (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"][iclass]
-            mean_auc = res_val["roc_curve"][model_name]["auc"][class_]
-            plot_roc_curve(
-                fpr,
-                tpr,
-                mean_auc,
-                lower_ci,
-                upper_ci,
-                model_name=model_name,
-                lw=1.5,
-                color=colors[i],
-                alpha=alpha,
-                ax=ax,
-            )
+            try:
+                fpr = res_val["roc_curve"][model_name]["fpr"][class_]
+                tpr = res_val["roc_curve"][model_name]["tpr"][class_]
+                (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"][iclass]
+                mean_auc = res_val["roc_curve"][model_name]["auc"][class_]
+                plot_roc_curve(
+                    fpr,
+                    tpr,
+                    mean_auc,
+                    lower_ci,
+                    upper_ci,
+                    model_name=model_name,
+                    lw=1.5,
+                    color=colors[i],
+                    alpha=alpha,
+                    ax=ax,
+                )
+            except Exception as e:
+                print(e)
     plot.figsets(
         sp=2,
         title=class_,
@@ -1345,18 +1377,21 @@ def plot_validate_features(res_val, is_binary=True, figsize=None):
 
         ax = nexttile(subplot_layout[0], subplot_layout[1])
         for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-            plot_pr_curve(
-                recall=res_val["pr_curve"][model_name]["recall"][iclass],
-                precision=res_val["pr_curve"][model_name]["precision"][iclass],
-                avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
-                    iclass
-                ],
-                model_name=model_name,
-                color=colors[i],
-                lw=1.5,
-                alpha=alpha,
-                ax=ax,
-            )
+            try:
+                plot_pr_curve(
+                    recall=res_val["pr_curve"][model_name]["recall"][iclass],
+                    precision=res_val["pr_curve"][model_name]["precision"][iclass],
+                    avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
+                        iclass
+                    ],
+                    model_name=model_name,
+                    color=colors[i],
+                    lw=1.5,
+                    alpha=alpha,
+                    ax=ax,
+                )
+            except Exception as e:
+                print(e)
     plot.figsets(
         sp=2,
         title=class_,
@@ -1379,37 +1414,41 @@ def plot_validate_features_single(res_val, figsize=None, is_binary=True):
         len(ips.flatten(res_val["pr_curve"].index)), 3, figsize=figsize
     )
     for model_name in ips.flatten(res_val["pr_curve"].index):
-        fpr = res_val["roc_curve"][model_name]["fpr"]
-        tpr = res_val["roc_curve"][model_name]["tpr"]
-        (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
-        mean_auc = res_val["roc_curve"][model_name]["auc"]
-
-        # Plotting
-        plot_roc_curve(
-            fpr,
-            tpr,
-            mean_auc,
-            lower_ci,
-            upper_ci,
-            model_name=model_name,
-            ax=nexttile(),
-        )
-        plot.figsets(title=model_name, sp=2)
+        try:
+            fpr = res_val["roc_curve"][model_name]["fpr"]
+            tpr = res_val["roc_curve"][model_name]["tpr"]
+            (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
+            mean_auc = res_val["roc_curve"][model_name]["auc"]
 
-        plot_pr_binary(
-            recall=res_val["pr_curve"][model_name]["recall"],
-            precision=res_val["pr_curve"][model_name]["precision"],
-            avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
-            model_name=model_name,
-            ax=nexttile(),
-        )
-        plot.figsets(title=model_name, sp=2)
+            # Plotting
+            plot_roc_curve(
+                fpr,
+                tpr,
+                mean_auc,
+                lower_ci,
+                upper_ci,
+                model_name=model_name,
+                ax=nexttile(),
+            )
+            plot.figsets(title=model_name, sp=2)
 
-        # plot cm
-        plot_cm(
-            res_val["confusion_matrix"][model_name], ax=nexttile(), normalize=False
-        )
-        plot.figsets(title=model_name, sp=2)
+            plot_pr_binary(
+                recall=res_val["pr_curve"][model_name]["recall"],
+                precision=res_val["pr_curve"][model_name]["precision"],
+                avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
+                model_name=model_name,
+                ax=nexttile(),
+            )
+            plot.figsets(title=model_name, sp=2)
+
+            # plot cm
+            plot_cm(
+                res_val["confusion_matrix"][model_name], ax=nexttile(), normalize=False
+            )
+            plot.figsets(title=model_name, sp=2)
+
+        except Exception as e:
+            print(e)
     else:
 
         modname_tmp = ips.flatten(res_val["roc_curve"].index)[0]
@@ -1424,22 +1463,25 @@ def plot_validate_features_single(res_val, figsize=None, is_binary=True):
     for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
         ax = nexttile()
         for iclass, class_ in enumerate(classes):
-            fpr = res_val["roc_curve"][model_name]["fpr"][class_]
-            tpr = res_val["roc_curve"][model_name]["tpr"][class_]
-            (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"][iclass]
-            mean_auc = res_val["roc_curve"][model_name]["auc"][class_]
-            plot_roc_curve(
-                fpr,
-                tpr,
-                mean_auc,
-                lower_ci,
-                upper_ci,
-                model_name=class_,
-                lw=1.5,
-                color=colors[iclass],
-                alpha=0.03,
-                ax=ax,
-            )
+            try:
+                fpr = res_val["roc_curve"][model_name]["fpr"][class_]
+                tpr = res_val["roc_curve"][model_name]["tpr"][class_]
+                (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"][iclass]
+                mean_auc = res_val["roc_curve"][model_name]["auc"][class_]
+                plot_roc_curve(
+                    fpr,
+                    tpr,
+                    mean_auc,
+                    lower_ci,
+                    upper_ci,
+                    model_name=class_,
+                    lw=1.5,
+                    color=colors[iclass],
+                    alpha=0.03,
+                    ax=ax,
+                )
+            except Exception as e:
+                print(e)
         plot.figsets(
             sp=2,
             title=model_name,
@@ -1451,18 +1493,21 @@ def plot_validate_features_single(res_val, figsize=None, is_binary=True):
 
         ax = nexttile()
         for iclass, class_ in enumerate(classes):
-            plot_pr_curve(
-                recall=res_val["pr_curve"][model_name]["recall"][iclass],
-                precision=res_val["pr_curve"][model_name]["precision"][iclass],
-                avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
-                    iclass
-                ],
-                model_name=class_,
-                color=colors[iclass],
-                lw=1.5,
-                alpha=0.03,
-                ax=ax,
-            )
+            try:
+                plot_pr_curve(
+                    recall=res_val["pr_curve"][model_name]["recall"][iclass],
+                    precision=res_val["pr_curve"][model_name]["precision"][iclass],
+                    avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
+                        iclass
+                    ],
+                    model_name=class_,
+                    color=colors[iclass],
+                    lw=1.5,
+                    alpha=0.03,
+                    ax=ax,
+                )
+            except Exception as e:
+                print(e)
         plot.figsets(
             sp=2,
             title=class_,
@@ -1543,15 +1588,12 @@ def cal_auc_ci(
         # print("Bootstrap #{} ROC area: {:0.3f}".format(i + 1, score))
     sorted_scores = np.array(bootstrapped_scores)
     sorted_scores.sort()
-
-    # Computing the lower and upper bound of the 90% confidence interval
-    # You can change the bounds percentiles to 0.025 and 0.975 to get
-    # a 95% confidence interval instead.
+
     confidence_lower = sorted_scores[int((1 - ci) * len(sorted_scores))]
     confidence_upper = sorted_scores[int(ci * len(sorted_scores))]
     if verbose:
         print(
-            "Confidence interval for the score: [{:0.
+            "Confidence interval for the score: [{:0.3f} - {:0.3f}]".format(
                 confidence_lower, confidence_upper
             )
         )
@@ -1568,11 +1610,8 @@ def cal_auc_ci(
             y_true, classes=np.unique(y_true)
         )  # One-vs-Rest transformation
         n_classes = y_true_bin.shape[1]  # Number of classes
-
-        bootstrapped_scores = np.
-            (n_classes, n_bootstraps)
-        )  # Store scores for each class
-
+
+        bootstrapped_scores = np.full((n_classes, n_bootstraps), np.nan)
         if verbose:
             print("AUROC scores for each class:")
         for i in range(n_classes):
@@ -1592,15 +1631,24 @@ def cal_auc_ci(
         # Calculating the confidence intervals for each class
         confidence_intervals = []
         for class_idx in range(n_classes):
-            sorted_scores = np.sort(bootstrapped_scores[class_idx])
-            confidence_lower = sorted_scores[int((1 - ci) * len(sorted_scores))]
-            confidence_upper = sorted_scores[int(ci * len(sorted_scores))]
-            confidence_intervals.append((confidence_lower, confidence_upper))
-            if verbose:
-                print(
-                    f"Class {class_idx} - Confidence interval: "
-                    f"[{confidence_lower:.3f} - {confidence_upper:.3f}]"
-                )
+            # rm nan
+            valid_scores = bootstrapped_scores[class_idx][
+                ~np.isnan(bootstrapped_scores[class_idx])
+            ]
+            if len(valid_scores) > 0:
+                sorted_scores = np.sort(valid_scores)
+                confidence_lower = sorted_scores[int((1 - ci) * len(sorted_scores))]
+                confidence_upper = sorted_scores[int(ci * len(sorted_scores))]
+                confidence_intervals[class_idx] = (confidence_lower, confidence_upper)
+
+                if verbose:
+                    print(
+                        f"Class {class_idx} - Confidence interval: [{confidence_lower:.3f} - {confidence_upper:.3f}]"
+                    )
+            else:
+                confidence_intervals[class_idx] = (np.nan, np.nan)
+                if verbose:
+                    print(f"Class {class_idx} - Confidence interval: [NaN - NaN]")
 
         return confidence_intervals
 
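The `cal_auc_ci` changes pre-allocate the per-class score matrix with `np.full(..., np.nan)` and drop NaNs before taking the percentile bounds, so bootstrap iterations that fail (for example, a resample containing only one class) no longer drag the interval toward zero. A self-contained sketch of the same bootstrap-percentile idea for binary AUC, using scikit-learn and NumPy directly rather than py2ls's exact function:

import numpy as np
from sklearn.metrics import roc_auc_score

rng = np.random.RandomState(1)
y_true = rng.randint(0, 2, 200)
y_score = y_true * 0.6 + rng.rand(200) * 0.7  # informative but noisy scores

n_bootstraps, ci = 1000, 0.95
scores = np.full(n_bootstraps, np.nan)  # NaN marks skipped iterations
for i in range(n_bootstraps):
    idx = rng.randint(0, len(y_true), len(y_true))
    if len(np.unique(y_true[idx])) < 2:
        continue  # AUC is undefined on a single-class resample; leave NaN
    scores[i] = roc_auc_score(y_true[idx], y_score[idx])

valid = np.sort(scores[~np.isnan(scores)])
lower = valid[int((1 - ci) * len(valid))]
upper = valid[int(ci * len(valid))]
print(f"AUC 95% bootstrap CI: [{lower:.3f} - {upper:.3f}]")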
@@ -2057,20 +2105,20 @@ def rank_models(
 
     def generate_bar_plot(ax, cv_test_scores):
         ax = plot.plotxy(
-            y="Classifier", x="combined_score", data=cv_test_scores,
+            y="Classifier", x="combined_score", data=cv_test_scores, kind_="bar"
         )
         plt.title("Classifier Performance")
         plt.tight_layout()
         return plt
 
-    nexttile = plot.subplot(2, 2, figsize=[10,
+    nexttile = plot.subplot(2, 2, figsize=[10, 10])
     generate_bar_plot(nexttile(), top_models.dropna())
     plot.radar(
         ax=nexttile(projection="polar"),
         data=cv_test_scores.set_index("Classifier"),
-        ylim=[0
-        color=plot.get_color(
-        alpha=0.
+        ylim=[0, 1],
+        color=plot.get_color(cv_test_scores.set_index("Classifier").shape[1]),
+        alpha=0.02,
         circular=1,
     )
     return cv_test_scores
@@ -2206,6 +2254,8 @@ def predict(
     y_train: pd.Series,
     x_true: pd.DataFrame = None,
     y_true: Optional[pd.Series] = None,
+    fill_missing:bool = True,
+    scaler:str='standard',# ["standard", "minmax", "robust","maxabs"]
     backward: bool = False,  # backward_regression
     backward_thr:float = 0.05,# pval thr,only works when backward is True
     common_features: set = None,
@@ -2214,7 +2264,7 @@ def predict(
     metrics: Optional[List[str]] = None,
     stack:bool=True,# run stacking
     stacking_cv:bool=False,# stacking cross_validate, default(False),keep it simple
-    vote:bool=
+    vote:bool=False,# run voting
     voting:str="hard", # only for classification purporse of voting
     n_top_models:int=5, #for stacking models
     n_models_per_category:int=1, #for stacking models,可以允许同一个类别2种模型
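`vote` now defaults to False, and `voting="hard"` only applies to classification. A minimal sketch of what hard voting does, using scikit-learn's VotingClassifier as an illustration rather than the predict() internals:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

X, y = make_classification(n_samples=300, n_features=10, random_state=1)

# Hard voting: each fitted model casts one vote per sample and the
# majority label wins; voting="soft" would average predicted probabilities.
clf = VotingClassifier(
    estimators=[
        ("lr", LogisticRegression(max_iter=1000)),
        ("rf", RandomForestClassifier(random_state=1)),
        ("nb", GaussianNB()),
    ],
    voting="hard",
)
clf.fit(X, y)
print(clf.predict(X[:5]))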
@@ -2227,7 +2277,12 @@ def predict(
     cv_level: str = "l",  # "s":'low',"m":'medium',"l":"high"
     class_weight: str = "balanced",
     random_state: int = 1,
+    presets = "best_quality",# specific for autogluon
+    time_limit=600, # specific for autogluon
+    num_bag_folds=5, # specific for autogluon
+    num_stack_levels=2, # specific for autogluon
     verbose: bool = False,
+    **kwargs
 ) -> pd.DataFrame:
     """
     第一种情况是内部拆分,第二种是直接预测,第三种是外部验证。
@@ -2278,28 +2333,20 @@ def predict(
         RandomForestRegressor,
         ExtraTreesClassifier,
         ExtraTreesRegressor,
+        HistGradientBoostingRegressor,
         BaggingClassifier,
         BaggingRegressor,
         AdaBoostClassifier,
         AdaBoostRegressor,
     )
-    from sklearn.svm import SVC, SVR
-    from sklearn.tree import DecisionTreeRegressor
+    from sklearn.svm import SVC, SVR, LinearSVR, NuSVR
+    from sklearn.tree import DecisionTreeRegressor,ExtraTreeRegressor
     from sklearn.linear_model import (
-        LogisticRegression,
-        ElasticNet,
-        ElasticNetCV,
-        LinearRegression,
-        Lasso,
-        RidgeClassifierCV,
-        Perceptron,
-        SGDClassifier,
-        RidgeCV,
-        Ridge,
-        TheilSenRegressor,
-        HuberRegressor,
-        PoissonRegressor,
-
+        LogisticRegression,ElasticNet,ElasticNetCV,
+        LinearRegression,Lasso,RidgeClassifierCV,Perceptron,SGDClassifier,
+        RidgeCV,Ridge,TheilSenRegressor,HuberRegressor,PoissonRegressor,Lars, LassoLars, BayesianRidge,
+        GammaRegressor, TweedieRegressor, LassoCV, LassoLarsCV, LarsCV,
+        OrthogonalMatchingPursuit, OrthogonalMatchingPursuitCV, PassiveAggressiveRegressor
     )
     from sklearn.compose import TransformedTargetRegressor
     from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
@@ -2316,15 +2363,21 @@ def predict(
     )
     from sklearn.preprocessing import PolynomialFeatures
     from sklearn.model_selection import train_test_split
-
+
+    from sklearn.gaussian_process import GaussianProcessRegressor
+    from sklearn.kernel_ridge import KernelRidge
+    from sklearn.dummy import DummyRegressor
+    from autogluon.tabular import TabularPredictor
     # 拼写检查
     purpose = ips.strcmp(purpose, ["classification", "regression"])[0]
     print(f"{purpose} processing...")
+
+
     # Default models or regressors if not provided
     if purpose == "classification":
         model_ = {
             "Random Forest": RandomForestClassifier(
-                random_state=random_state, class_weight=class_weight
+                random_state=random_state, class_weight=class_weight,n_jobs=n_jobs
             ),
             # SVC (Support Vector Classification)
             "SVM": SVC(
@@ -2335,7 +2388,7 @@ def predict(
             ),
             # fit the best model without enforcing sparsity, which means it does not directly perform feature selection.
             "Logistic Regression": LogisticRegression(
-                class_weight=class_weight, random_state=random_state
+                class_weight=class_weight, random_state=random_state,n_jobs=n_jobs
             ),
             # Logistic Regression with L1 Regularization (Lasso)
             "Lasso Logistic Regression": LogisticRegression(
@@ -2346,51 +2399,70 @@ def predict(
                 eval_metric="logloss",
                 random_state=random_state,
             ),
-            "KNN": KNeighborsClassifier(n_neighbors=5),
+            "KNN": KNeighborsClassifier(n_neighbors=5,n_jobs=n_jobs),
             "Naive Bayes": GaussianNB(),
             "Linear Discriminant Analysis": LinearDiscriminantAnalysis(),
             "AdaBoost": AdaBoostClassifier(
                 algorithm="SAMME", random_state=random_state
             ),
-            "LightGBM": lgb.LGBMClassifier(random_state=random_state, class_weight=class_weight),
+            "LightGBM": lgb.LGBMClassifier(random_state=random_state, class_weight=class_weight,n_jobs=n_jobs),
             "CatBoost": cb.CatBoostClassifier(verbose=0, random_state=random_state),
             "Extra Trees": ExtraTreesClassifier(
-                random_state=random_state, class_weight=class_weight
+                random_state=random_state, class_weight=class_weight,n_jobs=n_jobs
             ),
-            "Bagging": BaggingClassifier(random_state=random_state),
+            "Bagging": BaggingClassifier(random_state=random_state,n_jobs=n_jobs),
             "Neural Network": MLPClassifier(max_iter=500, random_state=random_state),
             "DecisionTree": DecisionTreeClassifier(),
             "Quadratic Discriminant Analysis": QuadraticDiscriminantAnalysis(),
             "Ridge": RidgeClassifierCV(
                 class_weight=class_weight, store_cv_results=True
             ),
-            "Perceptron": Perceptron(random_state=random_state),
+            "Perceptron": Perceptron(random_state=random_state,n_jobs=n_jobs),
             "Bernoulli Naive Bayes": BernoulliNB(),
-            "SGDClassifier": SGDClassifier(random_state=random_state),
+            "SGDClassifier": SGDClassifier(random_state=random_state,n_jobs=n_jobs),
         }
     elif purpose == "regression":
         model_ = {
-            "Random Forest": RandomForestRegressor(random_state=random_state),
+            "Random Forest": RandomForestRegressor(random_state=random_state,n_jobs=n_jobs),
             "SVM": SVR(),  # SVR (Support Vector Regression)
-
-            "LassoCV": LassoCV(
-                cv=cv_folds, random_state=random_state
-            ),  # LassoCV自动找出最适alpha,优于Lasso
+            "LassoCV": LassoCV(cv=cv_folds, random_state=random_state,n_jobs=n_jobs),  # LassoCV自动找出最适alpha,优于Lasso
             "Gradient Boosting": GradientBoostingRegressor(random_state=random_state),
-            "XGBoost": xgb.XGBRegressor(eval_metric="rmse", random_state=random_state),
-            "Linear Regression": LinearRegression(),
+            "XGBoost": xgb.XGBRegressor(eval_metric="rmse", random_state=random_state,n_jobs=n_jobs),
+            "Linear Regression": LinearRegression(n_jobs=n_jobs),
             "AdaBoost": AdaBoostRegressor(random_state=random_state),
-            "LightGBM": lgb.LGBMRegressor(random_state=random_state),
+            "LightGBM": lgb.LGBMRegressor(random_state=random_state,n_jobs=n_jobs,force_row_wise=True),  # Or use force_col_wise=True if memory is a concern
             "CatBoost": cb.CatBoostRegressor(verbose=0, random_state=random_state),
-            "Extra Trees": ExtraTreesRegressor(random_state=random_state),
-            "Bagging": BaggingRegressor(random_state=random_state),
+            "Extra Trees": ExtraTreesRegressor(random_state=random_state,n_jobs=n_jobs),
+            "Bagging": BaggingRegressor(random_state=random_state,n_jobs=n_jobs),
             "Neural Network": MLPRegressor(max_iter=500, random_state=random_state),
             "ElasticNet": ElasticNet(random_state=random_state),
-            "Ridge": Ridge(),
-            "KNN": KNeighborsRegressor(),
-            "TheilSen":TheilSenRegressor(),
+            "Ridge": Ridge(random_state=random_state),
+            "KNN": KNeighborsRegressor(n_jobs=n_jobs),
+            "TheilSen":TheilSenRegressor(n_jobs=n_jobs),
             "Huber":HuberRegressor(),
-            "Poisson":PoissonRegressor()
+            "Poisson":PoissonRegressor(),"LinearRegression": LinearRegression(),
+            "Lasso": Lasso(random_state=random_state),
+            "Lars": Lars(),
+            "LassoLars": LassoLars(),
+            "BayesianRidge": BayesianRidge(),
+            "GammaRegressor": GammaRegressor(),
+            "TweedieRegressor": TweedieRegressor(),
+            "LassoCV": LassoCV(random_state=random_state, n_jobs=n_jobs),
+            "ElasticNetCV": ElasticNetCV(random_state=random_state, n_jobs=n_jobs),
+            "LassoLarsCV": LassoLarsCV(n_jobs=n_jobs),
+            "LarsCV": LarsCV(),
+            "OrthogonalMatchingPursuit": OrthogonalMatchingPursuit(),
+            "OrthogonalMatchingPursuitCV": OrthogonalMatchingPursuitCV(n_jobs=n_jobs),
+            "PassiveAggressiveRegressor": PassiveAggressiveRegressor(random_state=random_state),
+            "LinearSVR": LinearSVR(random_state=random_state),
+            "NuSVR": NuSVR(),
+            "DecisionTreeRegressor": DecisionTreeRegressor(random_state=random_state),
+            "ExtraTreeRegressor": ExtraTreeRegressor(random_state=random_state),
+            "HistGradientBoostingRegressor": HistGradientBoostingRegressor(random_state=random_state),
+            "GaussianProcessRegressor": GaussianProcessRegressor(),
+            "KernelRidge": KernelRidge(),
+            "DummyRegressor": DummyRegressor(),
+            "TransformedTargetRegressor": TransformedTargetRegressor(regressor=LinearRegression())
         }
     if cls is None:
         models = model_
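The regressor dictionary now threads `n_jobs` through every estimator that supports it and adds a long tail of linear models. A small sketch of the dict-of-estimators pattern the function uses, with a few scikit-learn regressors only:

from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV, LinearRegression

X, y = make_regression(n_samples=200, n_features=8, noise=10, random_state=1)

# n_jobs=-1 lets estimators that parallelize (forests, *CV linear models)
# use all cores; estimators without the argument simply omit it.
models = {
    "Random Forest": RandomForestRegressor(random_state=1, n_jobs=-1),
    "LassoCV": LassoCV(cv=5, random_state=1, n_jobs=-1),
    "Linear Regression": LinearRegression(n_jobs=-1),
}
for name, est in models.items():
    print(name, round(est.fit(X, y).score(X, y), 3))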
@@ -2407,10 +2479,17 @@ def predict(
         ips.df_special_characters_cleaner(x_true) if x_true is not None else None
     )
 
+    # only keep "autogluon_tab" in models
+    cls = [cls] if isinstance(cls, str) else cls
+
+    if cls is not None:
+        models={"autogluon_tab":None} if "auto" in cls else models
+
     # indicate cls:
     if ips.run_once_within(30):  # 10 min
         print(f"processing: {list(models.keys())}")
-
+    y_train_col_name=None
+    # print(isinstance(y_train, str) and y_train in x_train.columns)
     if isinstance(y_train, str) and y_train in x_train.columns:
         y_train_col_name = y_train
         y_train = x_train[y_train]
|
|
2418
2497
|
x_train = x_train.drop(y_train_col_name, axis=1)
|
2419
2498
|
# else:
|
2420
2499
|
# y_train = ips.df_encoder(pd.DataFrame(y_train), method="dummy").values.ravel()
|
2500
|
+
|
2421
2501
|
y_train = pd.DataFrame(y_train)
|
2422
2502
|
if y_train.select_dtypes(include=np.number).empty:
|
2423
2503
|
y_train_ = ips.df_encoder(y_train, method="dummy", drop=None)
|
@@ -2430,6 +2510,9 @@ def predict(
|
|
2430
2510
|
y_train = ips.df_encoder(pd.DataFrame(y_train), method="label")
|
2431
2511
|
print("is_binary:", is_binary)
|
2432
2512
|
|
2513
|
+
if fill_missing:
|
2514
|
+
ips.df_fillna(data=x_train, method="knn", inplace=True, axis=0)
|
2515
|
+
ips.df_fillna(data=y_train, method="knn", inplace=True, axis=0)
|
2433
2516
|
# Perform backward feature selection
|
2434
2517
|
if backward:
|
2435
2518
|
selected_features = backward_regression(x_train, y_train, thr=backward_thr)
|
@@ -2458,6 +2541,8 @@ def predict(
|
|
2458
2541
|
pd.DataFrame(y_train), method="label"
|
2459
2542
|
).values.ravel()
|
2460
2543
|
|
2544
|
+
if fill_missing:
|
2545
|
+
ips.df_fillna(data=x_true, method="knn", inplace=True, axis=0)
|
2461
2546
|
if y_true is not None:
|
2462
2547
|
if isinstance(y_true, str) and y_true in x_true.columns:
|
2463
2548
|
y_true_col_name = y_true
|
@@ -2490,11 +2575,16 @@ def predict(
|
|
2490
2575
|
# Ensure common features are selected
|
2491
2576
|
if common_features is not None:
|
2492
2577
|
x_train, x_true = x_train[common_features], x_true[common_features]
|
2578
|
+
share_col_names=common_features
|
2493
2579
|
else:
|
2494
2580
|
share_col_names = ips.shared(x_train.columns, x_true.columns, verbose=verbose)
|
2495
2581
|
x_train, x_true = x_train[share_col_names], x_true[share_col_names]
|
2496
2582
|
|
2497
|
-
|
2583
|
+
#! scaler
|
2584
|
+
# scaler and fit x_train and export scaler to fit the x_true
|
2585
|
+
x_train,scaler_=ips.df_scaler(x_train,method=scaler,return_scaler=True)
|
2586
|
+
#
|
2587
|
+
x_true=ips.df_scaler(x_true,scaler=scaler_)# make sure 用于同一个scaler
|
2498
2588
|
x_train, x_true = ips.df_encoder(x_train, method="dummy"), ips.df_encoder(
|
2499
2589
|
x_true, method="dummy"
|
2500
2590
|
)
|
@@ -2516,18 +2606,261 @@ def predict(
|
|
2516
2606
|
if isinstance(y_train, np.ndarray):
|
2517
2607
|
y_true = ips.df_encoder(data=pd.DataFrame(y_true), method="label")
|
2518
2608
|
y_true = np.asarray(y_true)
|
2519
|
-
|
2520
|
-
|
2521
|
-
|
2522
|
-
|
2523
|
-
|
2524
|
-
|
2525
|
-
|
2526
|
-
|
2527
|
-
|
2528
|
-
|
2529
|
-
|
2609
|
+
#! so far, got the: x_train,x_true,y_train,y_true
|
2610
|
+
# Grid search with KFold or StratifiedKFold
|
2611
|
+
if "autogluon_tab" in models:
|
2612
|
+
# load hypoer_param
|
2613
|
+
f_param = os.path.dirname(os.path.abspath(__file__))
|
2614
|
+
f_param = f_param + "/data/hyper_param_autogluon_zeroshot2024.json"
|
2615
|
+
with open(f_param, "r") as file:
|
2616
|
+
hyper_param_autogluon = json.load(file)
|
2617
|
+
# Train the model with AutoGluon
|
2618
|
+
features=x_train.columns.tolist()
|
2619
|
+
label= y_train_col_name if y_train_col_name is not None else 'target'
|
2620
|
+
df_autogluon = x_train.copy()
|
2621
|
+
df_autogluon[label]=y_train
|
2622
|
+
autogluon_presets=["best_quality","good_quality","fast_train"]
|
2623
|
+
best_clf = TabularPredictor(label=label, path=os.path.join(dir_save,"model_autogluon")).fit(
|
2624
|
+
train_data=df_autogluon,
|
2625
|
+
presets=ips.strcmp(presets, autogluon_presets)[0], # 'best_quality' or 'good_quality' or 'fast_train'
|
2626
|
+
time_limit=time_limit,#3600, # in sec: Limit training time,
|
2627
|
+
num_bag_folds=num_bag_folds,
|
2628
|
+
num_stack_levels=num_stack_levels,
|
2629
|
+
hyperparameters=hyper_param_autogluon,
|
2630
|
+
verbosity=1 if verbose else 0,
|
2631
|
+
**kwargs
|
2632
|
+
)
|
2633
|
+
#! Get the leaderboard
|
2634
|
+
gs={}
|
2635
|
+
# Display the leaderboard for reference
|
2636
|
+
leaderboard = best_clf.leaderboard()
|
2637
|
+
gs['info']=best_clf.info()
|
2638
|
+
# gs["res"]=best_clf
|
2639
|
+
gs["features"]=features
|
2640
|
+
gs["leaderboard"] = leaderboard
|
2641
|
+
best_model_name = leaderboard.iloc[0, 0] # First row, first column contains the model name
|
2642
|
+
# Store the best model and its details in the gs dictionary
|
2643
|
+
gs["best_estimator_"] = best_model_name # Store the trained model, not just the name
|
2644
|
+
gs["best_params_"] = best_model_name # Hyperparameters
|
2645
|
+
# Make predictions if x_true is provided
|
2646
|
+
if x_true is not None:
|
2647
|
+
x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
|
2648
|
+
gs["predictions"] = best_clf.predict(x_true[features],model=None)# model=None select the best
|
2649
|
+
gs["predict_proba"] = best_clf.predict_proba(x_true[features]) if purpose=='classification' else None
|
2650
|
+
x_true[label]=gs["predictions"]
|
2651
|
+
if gs["predictions"].value_counts().shape[0]>1:
|
2652
|
+
gs['evaluate'] = best_clf.evaluate(x_true[features+[label]])
|
2653
|
+
gs["models"]=leaderboard["model"].tolist()#best_clf.model_names()
|
2654
|
+
all_models = gs["models"]
|
2655
|
+
model_evaluations = {}
|
2656
|
+
for model in all_models:
|
2657
|
+
predictions = best_clf.predict(x_true[features], model=model)
|
2658
|
+
evaluation = best_clf.evaluate_predictions(
|
2659
|
+
y_true=x_true[label], # True labels
|
2660
|
+
y_pred=predictions, # Predictions from the specific model
|
2661
|
+
auxiliary_metrics=True, # Include additional metrics if needed
|
2662
|
+
)
|
2663
|
+
model_evaluations[model] = evaluation
|
2664
|
+
gs["scores"]=pd.DataFrame.from_dict(model_evaluations, orient='index')
|
2665
|
+
#! 试着保持一样的格式
|
2666
|
+
results = {}
|
2667
|
+
for model in all_models:
|
2668
|
+
y_pred = best_clf.predict(x_true[features], model=model).tolist()
|
2669
|
+
y_pred_proba=best_clf.predict_proba(x_true[features], model=model) if purpose=='classification' else None
|
2670
|
+
|
2671
|
+
if isinstance(y_pred_proba, pd.DataFrame):
|
2672
|
+
y_pred_proba=y_pred_proba.iloc[:,1]
|
2673
|
+
|
2674
|
+
# try to make predict format consistant
|
2675
|
+
try:
|
2676
|
+
y_pred= [i[0] for i in y_pred]
|
2677
|
+
except:
|
2678
|
+
pass
|
2679
|
+
try:
|
2680
|
+
y_true= [i[0] for i in y_true]
|
2681
|
+
except:
|
2682
|
+
pass
|
2683
|
+
try:
|
2684
|
+
y_train= [i[0] for i in y_train]
|
2685
|
+
except:
|
2686
|
+
pass
|
2687
|
+
validation_scores = {}
|
2688
|
+
if y_true is not None and y_pred_proba is not None:
|
2689
|
+
validation_scores = cal_metrics(
|
2690
|
+
y_true,
|
2691
|
+
y_pred,
|
2692
|
+
y_pred_proba=y_pred_proba,
|
2693
|
+
is_binary=is_binary,
|
2694
|
+
purpose=purpose,
|
2695
|
+
average="weighted",
|
2696
|
+
)
|
2697
|
+
if is_binary:
|
2698
|
+
# Calculate ROC curve
|
2699
|
+
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
|
2700
|
+
if y_pred_proba is not None:
|
2701
|
+
# fpr, tpr, roc_auc = dict(), dict(), dict()
|
2702
|
+
fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
|
2703
|
+
lower_ci, upper_ci = cal_auc_ci(
|
2704
|
+
y_true, y_pred_proba, verbose=False, is_binary=is_binary
|
2705
|
+
)
|
2706
|
+
roc_auc = auc(fpr, tpr)
|
2707
|
+
roc_info = {
|
2708
|
+
"fpr": fpr.tolist(),
|
2709
|
+
"tpr": tpr.tolist(),
|
2710
|
+
"auc": roc_auc,
|
2711
|
+
"ci95": (lower_ci, upper_ci),
|
2712
|
+
}
|
2713
|
+
# precision-recall curve
|
2714
|
+
precision_, recall_, _ = cal_precision_recall(y_true, y_pred_proba)
|
2715
|
+
avg_precision_ = average_precision_score(y_true, y_pred_proba)
|
2716
|
+
pr_info = {
|
2717
|
+
"precision": precision_,
|
2718
|
+
"recall": recall_,
|
2719
|
+
"avg_precision": avg_precision_,
|
2720
|
+
}
|
2721
|
+
else:
|
2722
|
+
roc_info, pr_info = None, None
|
2723
|
+
if purpose == "classification":
|
2724
|
+
results[model] = {
|
2725
|
+
# "best_clf": gs.best_estimator_,
|
2726
|
+
# "best_params": gs.best_params_,
|
2727
|
+
# "auc_indiv": [
|
2728
|
+
# gs.cv_results_[f"split{i}_test_score"][gs.best_index_]
|
2729
|
+
# for i in range(cv_folds)
|
2730
|
+
# ],
|
2731
|
+
"scores": validation_scores,
|
2732
|
+
"roc_curve": roc_info,
|
2733
|
+
"pr_curve": pr_info,
|
2734
|
+
"confusion_matrix": confusion_matrix(y_true, y_pred),
|
2735
|
+
"predictions": y_pred,#.tolist(),
|
2736
|
+
"predictions_proba": (
|
2737
|
+
y_pred_proba.tolist() if y_pred_proba is not None else None
|
2738
|
+
),
|
2739
|
+
"features":features,
|
2740
|
+
# "coef":coef_,
|
2741
|
+
# "alphas":alphas_
|
2742
|
+
}
|
2743
|
+
else: # "regression"
|
2744
|
+
results[model] = {
|
2745
|
+
# "best_clf": gs.best_estimator_,
|
2746
|
+
# "best_params": gs.best_params_,
|
2747
|
+
"scores": validation_scores, # e.g., neg_MSE, R², etc.
|
2748
|
+
"predictions": y_pred,#.tolist(),
|
2749
|
+
"predictions_proba": (
|
2750
|
+
y_pred_proba.tolist() if y_pred_proba is not None else None
|
2751
|
+
),
|
2752
|
+
"features":features,
|
2753
|
+
# "coef":coef_,
|
2754
|
+
# "alphas":alphas_
|
2755
|
+
}
|
2756
|
+
else: # multi-classes
|
2757
|
+
if y_pred_proba is not None:
|
2758
|
+
# fpr, tpr, roc_auc = dict(), dict(), dict()
|
2759
|
+
# fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
|
2760
|
+
confidence_intervals = cal_auc_ci(
|
2761
|
+
y_true, y_pred_proba, verbose=False, is_binary=is_binary
|
2762
|
+
)
|
2763
|
+
roc_info = {
|
2764
|
+
"fpr": validation_scores["fpr"],
|
2765
|
+
"tpr": validation_scores["tpr"],
|
2766
|
+
"auc": validation_scores["roc_auc_by_class"],
|
2767
|
+
"ci95": confidence_intervals,
|
2768
|
+
}
|
2769
|
+
# precision-recall curve
|
2770
|
+
precision_, recall_, avg_precision_ = cal_precision_recall(
|
2771
|
+
y_true, y_pred_proba, is_binary=is_binary
|
2772
|
+
)
|
2773
|
+
pr_info = {
|
2774
|
+
"precision": precision_,
|
2775
|
+
"recall": recall_,
|
2776
|
+
"avg_precision": avg_precision_,
|
2777
|
+
}
|
2778
|
+
else:
|
2779
|
+
roc_info, pr_info = None, None
|
2780
|
+
|
2781
|
+
if purpose == "classification":
|
2782
|
+
results[model] = {
|
2783
|
+
# "best_clf": gs.best_estimator_,
|
2784
|
+
# "best_params": gs.best_params_,
|
2785
|
+
# "auc_indiv": [
|
2786
|
+
# gs.cv_results_[f"split{i}_test_score"][gs.best_index_]
|
2787
|
+
# for i in range(cv_folds)
|
2788
|
+
# ],
|
2789
|
+
"scores": validation_scores,
|
2790
|
+
"roc_curve": roc_info,
|
2791
|
+
"pr_curve": pr_info,
|
2792
|
+
"confusion_matrix": confusion_matrix(y_true, y_pred),
|
2793
|
+
"predictions": y_pred,#.tolist(),
|
2794
|
+
"predictions_proba": (
|
2795
|
+
y_pred_proba.tolist() if y_pred_proba is not None else None
|
2796
|
+
),
|
2797
|
+
"features":features,
|
2798
|
+
# "coef":coef_,
|
2799
|
+
# "alphas":alphas_
|
2800
|
+
}
|
2801
|
+
else: # "regression"
|
2802
|
+
results[model] = {
|
2803
|
+
# "best_clf": gs.best_estimator_,
|
2804
|
+
# "best_params": gs.best_params_,
|
2805
|
+
"scores": validation_scores, # e.g., neg_MSE, R², etc.
|
2806
|
+
"predictions": y_pred,#.tolist(),
|
2807
|
+
"predictions_proba": (
|
2808
|
+
y_pred_proba.tolist() if y_pred_proba is not None else None
|
2809
|
+
),
|
2810
|
+
"features":features,
|
2811
|
+
# "coef":coef_,
|
2812
|
+
# "alphas":alphas_
|
2813
|
+
}
|
2814
|
+
|
2815
|
+
else:
|
2816
|
+
if y_true is None:
|
2817
|
+
validation_scores = []
|
2818
|
+
else:
|
2819
|
+
validation_scores = cal_metrics(
|
2820
|
+
y_true,
|
2821
|
+
y_pred,
|
2822
|
+
y_pred_proba=y_pred_proba,
|
2823
|
+
is_binary=is_binary,
|
2824
|
+
purpose=purpose,
|
2825
|
+
average="weighted",
|
2826
|
+
)
|
2827
|
+
results[model] = {
|
2828
|
+
# "best_clf": gs.best_estimator_,
|
2829
|
+
# "best_params": gs.best_params_,
|
2830
|
+
"scores": validation_scores,
|
2831
|
+
"predictions": y_pred,#.tolist(),
|
2832
|
+
"predictions_proba": (
|
2833
|
+
y_pred_proba.tolist() if y_pred_proba is not None else None
|
2834
|
+
),
|
2835
|
+
"features":features,
|
2836
|
+
"y_train": y_train if y_train is not None else [],
|
2837
|
+
"y_true": y_true if y_true is not None else [],
|
2838
|
+
# "coef":coef_,
|
2839
|
+
# "alphas":alphas_
|
2530
2840
|
}
|
2841
|
+
df_results = pd.DataFrame.from_dict(results, orient="index")
|
2842
|
+
gs['res']=df_results
|
2843
|
+
|
2844
|
+
if all([plot_, y_true is not None, purpose == "classification"]):
|
2845
|
+
from datetime import datetime
|
2846
|
+
|
2847
|
+
now_ = datetime.now().strftime("%y%m%d_%H%M%S")
|
2848
|
+
# try:
|
2849
|
+
if df_results.shape[0] > 3:
|
2850
|
+
try:
|
2851
|
+
plot_validate_features(df_results, is_binary=is_binary)
|
2852
|
+
except Exception as e:
|
2853
|
+
print(e)
|
2854
|
+
else:
|
2855
|
+
try:
|
2856
|
+
plot_validate_features_single(df_results, is_binary=is_binary)
|
2857
|
+
except Exception as e:
|
2858
|
+
print(e)
|
2859
|
+
if dir_save:
|
2860
|
+
ips.figsave(dir_save + f"validate_features{now_}.pdf")
|
2861
|
+
return gs
|
2862
|
+
|
2863
|
+
#! cross_valid
|
2531
2864
|
if cv_level in ["low", "simple", "s", "l"]:
|
2532
2865
|
param_grids = {
|
2533
2866
|
"Random Forest": (
|
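The `autogluon_tab` branch above delegates the whole model search to AutoGluon's TabularPredictor and then rebuilds a per-model results table from the leaderboard. A minimal standalone sketch of that API, assuming AutoGluon is installed; the CSV paths and "target" label here are placeholders:

import pandas as pd
from autogluon.tabular import TabularPredictor

# train_df must contain the label column; every other column is a feature.
train_df = pd.read_csv("train.csv")   # placeholder path
test_df = pd.read_csv("test.csv")     # placeholder path

predictor = TabularPredictor(label="target", path="model_autogluon").fit(
    train_data=train_df,
    presets="best_quality",  # the diff maps user input onto its own preset list
    time_limit=600,          # seconds of training budget
)

print(predictor.leaderboard())            # ranked models, stored as gs["leaderboard"] above
pred = predictor.predict(test_df)         # model=None (default) uses the best model
proba = predictor.predict_proba(test_df)  # classification only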
@@ -2696,7 +3029,73 @@ def predict(
                 'alpha': [0.1],
                 'max_iter': [100],},
             "Poisson":{'alpha': [0.1],
-                'max_iter': [100],}
+                'max_iter': [100],},
+            "Lars": {"n_nonzero_coefs": [10, 50, None]},
+            "LassoLars": {
+                "alpha": [0.01, 0.1, 1]
+            },
+            "BayesianRidge": {
+                "alpha_1": [1e-6, 1e-4, 1e-2],
+                "lambda_1": [1e-6, 1e-4, 1e-2]
+            },
+            "GammaRegressor": {
+                "alpha": [0.1, 1, 10]
+            },
+            "TweedieRegressor": {
+                "alpha": [0.1, 1, 10],
+                "power": [1, 1.5, 2]
+            },
+            "LassoCV": {
+                "cv": [5]
+            },
+            "ElasticNetCV": {
+                "l1_ratio": [0.2, 0.5, 0.8],
+                "cv": [5]
+            },
+            "LassoLarsCV": {
+                "cv": [5]
+            },
+            "LarsCV": {
+                "cv": [5]
+            },
+            "OrthogonalMatchingPursuit": {
+                "n_nonzero_coefs": [10, 50, None]
+            },
+            "OrthogonalMatchingPursuitCV": {
+                "cv": [5]
+            },
+            "PassiveAggressiveRegressor": {
+                "C": [0.1, 1, 10]
+            },
+            "LinearSVR": {
+                "C": [0.1, 1, 10]
+            },
+            "NuSVR": {
+                "C": [0.1, 1, 10]
+            },
+            "DecisionTreeRegressor": {
+                "max_depth": [5, 10, None]
+            },
+            "ExtraTreeRegressor": {
+                "max_depth": [5, 10, None]
+            },
+            "HistGradientBoostingRegressor": {
+                "learning_rate": [0.05, 0.1, 0.2],
+                "max_depth": [5, 10, None]
+            },
+            "GaussianProcessRegressor": {
+                "alpha": [1e-5, 1e-2, 0.1]
+            },
+            "KernelRidge": {
+                "alpha": [0.1, 1, 10],
+                "kernel": ["linear", "rbf"]
+            },
+            "DummyRegressor": {
+                "strategy": ["mean", "median"]
+            },
+            "TransformedTargetRegressor": {
+                "regressor__fit_intercept": [True, False]
+            }
         }
     elif cv_level in ["high", "advanced", "h"]:
         param_grids = {
@@ -2901,7 +3300,96 @@ def predict(
                 'alpha': [0.1, 1.0, 10.0],
                 'max_iter': [100, 200, 300],},
             "Poisson":{'alpha': [0.1, 1.0, 10.0],
-                'max_iter': [100, 200, 300],}
+                'max_iter': [100, 200, 300],},
+            "Lars": {
+                "n_nonzero_coefs": [10, 50, 100, 200, None]
+            },
+            "LassoLars": {
+                "alpha": [0.001, 0.01, 0.1, 1, 10]
+            },
+            "BayesianRidge": {
+                "alpha_1": [1e-6, 1e-5, 1e-4],
+                "alpha_2": [1e-6, 1e-5, 1e-4],
+                "lambda_1": [1e-6, 1e-5, 1e-4],
+                "lambda_2": [1e-6, 1e-5, 1e-4]
+            },
+            "GammaRegressor": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "max_iter": [1000, 5000, 10000]
+            },
+            "TweedieRegressor": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "power": [0, 1, 1.5, 2, 3]
+            },
+            "LassoCV": {
+                "alphas": [[0.001, 0.01, 0.1, 1, 10]],
+                "cv": [3, 5, 10]
+            },
+            "ElasticNetCV": {
+                "l1_ratio": [0.1, 0.5, 0.7, 0.9, 1],
+                "alphas": [[0.001, 0.01, 0.1, 1, 10]],
+                "cv": [3, 5, 10]
+            },
+            "LassoLarsCV": {
+                "cv": [3, 5, 10]
+            },
+            "LarsCV": {
+                "cv": [3, 5, 10]
+            },
+            "OrthogonalMatchingPursuit": {
+                "n_nonzero_coefs": [10, 50, 100, 200, None]
+            },
+            "OrthogonalMatchingPursuitCV": {
+                "cv": [3, 5, 10]
+            },
+            "PassiveAggressiveRegressor": {
+                "C": [0.01, 0.1, 1, 10],
+                "max_iter": [1000, 5000, 10000],
+                "early_stopping": [True, False]
+            },
+            "LinearSVR": {
+                "C": [0.01, 0.1, 1, 10],
+                "epsilon": [0.01, 0.1, 1],
+                "max_iter": [1000, 5000, 10000]
+            },
+            "NuSVR": {
+                "C": [0.01, 0.1, 1, 10],
+                "nu": [0.25, 0.5, 0.75],
+                "kernel": ["linear", "poly", "rbf", "sigmoid"]
+            },
+            "DecisionTreeRegressor": {
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_split": [2, 5, 10],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "ExtraTreeRegressor": {
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_split": [2, 5, 10],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "HistGradientBoostingRegressor": {
+                "learning_rate": [0.01, 0.1, 0.2],
+                "max_iter": [100, 500, 1000],
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "GaussianProcessRegressor": {
+                "alpha": [1e-10, 1e-5, 1e-2, 0.1],
+                "n_restarts_optimizer": [0, 1, 5, 10]
+            },
+            "KernelRidge": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "kernel": ["linear", "poly", "rbf", "sigmoid"],
+                "degree": [2, 3, 4]
+            },
+            "DummyRegressor": {
+                "strategy": ["mean", "median", "constant"],
+                "constant": [0]  # Only if strategy is 'constant'
+            },
+            "TransformedTargetRegressor": {
+                # Grid for the underlying regressor, example shown for LinearRegression
+                "regressor__fit_intercept": [True, False]
+            }
         }
     else:  # median level
         param_grids = {
@@ -3148,7 +3636,96 @@ def predict(
                 'alpha': [0.1, 1.0],
                 'max_iter': [100, 200],},
             "Poisson":{'alpha': [0.1, 1.0],
-                'max_iter': [100, 200],}
+                'max_iter': [100, 200],},
+            "Lars": {
+                "n_nonzero_coefs": [10, 50, 100, 200, None]
+            },
+            "LassoLars": {
+                "alpha": [0.001, 0.01, 0.1, 1, 10]
+            },
+            "BayesianRidge": {
+                "alpha_1": [1e-6, 1e-5, 1e-4],
+                "alpha_2": [1e-6, 1e-5, 1e-4],
+                "lambda_1": [1e-6, 1e-5, 1e-4],
+                "lambda_2": [1e-6, 1e-5, 1e-4]
+            },
+            "GammaRegressor": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "max_iter": [1000, 5000, 10000]
+            },
+            "TweedieRegressor": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "power": [0, 1, 1.5, 2, 3]
+            },
+            "LassoCV": {
+                "alphas": [[0.001, 0.01, 0.1, 1, 10]],
+                "cv": [3, 5, 10]
+            },
+            "ElasticNetCV": {
+                "l1_ratio": [0.1, 0.5, 0.7, 0.9, 1],
+                "alphas": [[0.001, 0.01, 0.1, 1, 10]],
+                "cv": [3, 5, 10]
+            },
+            "LassoLarsCV": {
+                "cv": [3, 5, 10]
+            },
+            "LarsCV": {
+                "cv": [3, 5, 10]
+            },
+            "OrthogonalMatchingPursuit": {
+                "n_nonzero_coefs": [10, 50, 100, 200, None]
+            },
+            "OrthogonalMatchingPursuitCV": {
+                "cv": [3, 5, 10]
+            },
+            "PassiveAggressiveRegressor": {
+                "C": [0.01, 0.1, 1, 10],
+                "max_iter": [1000, 5000, 10000],
+                "early_stopping": [True, False]
+            },
+            "LinearSVR": {
+                "C": [0.01, 0.1, 1, 10],
+                "epsilon": [0.01, 0.1, 1],
+                "max_iter": [1000, 5000, 10000]
+            },
+            "NuSVR": {
+                "C": [0.01, 0.1, 1, 10],
+                "nu": [0.25, 0.5, 0.75],
+                "kernel": ["linear", "poly", "rbf", "sigmoid"]
+            },
+            "DecisionTreeRegressor": {
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_split": [2, 5, 10],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "ExtraTreeRegressor": {
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_split": [2, 5, 10],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "HistGradientBoostingRegressor": {
+                "learning_rate": [0.01, 0.1, 0.2],
+                "max_iter": [100, 500, 1000],
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "GaussianProcessRegressor": {
+                "alpha": [1e-10, 1e-5, 1e-2, 0.1],
+                "n_restarts_optimizer": [0, 1, 5, 10]
+            },
+            "KernelRidge": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "kernel": ["linear", "poly", "rbf", "sigmoid"],
+                "degree": [2, 3, 4]
+            },
+            "DummyRegressor": {
+                "strategy": ["mean", "median", "constant"],
+                "constant": [0]  # Only if strategy is 'constant'
+            },
+            "TransformedTargetRegressor": {
+                # Grid for the underlying regressor, example shown for LinearRegression
+                "regressor__fit_intercept": [True, False]
+            }
         }
 
     results = {}
@@ -3158,7 +3735,7 @@ def predict(
         if purpose == "classification"
         else KFold(n_splits=cv_folds, shuffle=True, random_state=random_state)
     )
-
+
     # Train and validate each model
     for name, clf in tqdm(
         models.items(),
@@ -3168,83 +3745,132 @@ def predict(
     ):
         if verbose:
             print(f"\nTraining and validating {name}:")
-
-
-
-
-
-
-
-
-
-
-
-
-
-            verbose=verbose,
-        )
+        try:
+            if is_binary:
+                gs = GridSearchCV(
+                    clf,
+                    param_grid=param_grids.get(name, {}),
+                    scoring=(
+                        "roc_auc"
+                        if purpose == "classification"
+                        else "neg_mean_squared_error"
+                    ),
+                    cv=cv,
+                    n_jobs=n_jobs,
+                    verbose=verbose,
                 )
-        else:
-            y_pred_proba = None  # No probability output for certain models
-        else:
-            gs = GridSearchCV(
-                clf,
-                param_grid=param_grids.get(name, {}),
-                scoring=(
-                    "roc_auc_ovr"
-                    if purpose == "classification"
-                    else "neg_mean_squared_error"
-                ),
-                cv=cv,
-                n_jobs=n_jobs,
-                verbose=verbose,
-            )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            y_pred_proba
-
+                gs.fit(x_train, y_train)
+                best_clf = gs.best_estimator_
+
+                # make sure x_train and x_test has the same name
+                x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
+                y_pred = best_clf.predict(x_true)
+                if hasattr(best_clf, "predict_proba"):
+                    y_pred_proba = best_clf.predict_proba(x_true)
+                    print("Shape of predicted probabilities:", y_pred_proba.shape)
+                    if y_pred_proba.shape[1] == 1:
+                        y_pred_proba = np.hstack(
+                            [1 - y_pred_proba, y_pred_proba]
+                        )  # Add missing class probabilities
+                    if y_pred_proba.shape[1] == 2:
+                        if isinstance(y_pred_proba, pd.DataFrame):
+                            y_pred_proba = y_pred_proba.iloc[:, 1]
+                        elif isinstance(y_pred_proba, pd.Series):
+                            y_pred_proba = y_pred_proba.values[:, 1]
+                        else:
+                            y_pred_proba = y_pred_proba[:, 1]
+
+                elif hasattr(best_clf, "decision_function"):
+                    # If predict_proba is not available, use decision_function (e.g., for SVM)
+                    y_pred_proba = best_clf.decision_function(x_true)
+                    # Ensure y_pred_proba is within 0 and 1 bounds
+                    y_pred_proba = (y_pred_proba - y_pred_proba.min()) / (
+                        y_pred_proba.max() - y_pred_proba.min()
                     )
-
-
+                else:
+                    y_pred_proba = None  # No probability output for certain models
+                # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+                if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                    if hasattr(best_clf, "alphas_"):
+                        alphas_ = best_clf.alphas_
+                    elif hasattr(best_clf, "alpha_"):
+                        alphas_ = best_clf.alpha_
+                    elif hasattr(best_clf, "Cs_"):
+                        alphas_ = best_clf.Cs_
+                else:
+                    alphas_= None
+                coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
+            else:
+                gs = GridSearchCV(
+                    clf,
+                    param_grid=param_grids.get(name, {}),
+                    scoring=(
+                        "roc_auc_ovr"
+                        if purpose == "classification"
+                        else "neg_mean_squared_error"
+                    ),
+                    cv=cv,
+                    n_jobs=n_jobs,
+                    verbose=verbose,
+                )
 
+                # Fit GridSearchCV
+                gs.fit(x_train, y_train)
+                best_clf = gs.best_estimator_
+
+                # Ensure x_true aligns with x_train columns
+                x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
+
+                # do i need to fit the x_train, y_train again?
+                best_clf=best_clf.fit(x_train, y_train)
+                y_pred = best_clf.predict(x_true)
+
+                # Handle prediction probabilities for multiclass
+                if hasattr(best_clf, "predict_proba"):
+                    y_pred_proba = best_clf.predict_proba(x_true)
+                elif hasattr(best_clf, "decision_function"):
+                    y_pred_proba = best_clf.decision_function(x_true)
+
+                    # Normalize for multiclass if necessary
+                    if y_pred_proba.ndim == 2:
+                        y_pred_proba = (
+                            y_pred_proba - y_pred_proba.min(axis=1, keepdims=True)
+                        ) / (
+                            y_pred_proba.max(axis=1, keepdims=True)
+                            - y_pred_proba.min(axis=1, keepdims=True)
+                        )
+                else:
+                    y_pred_proba = None  # No probability output for certain models
+                # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+                if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                    if hasattr(best_clf, "alphas_"):
+                        alphas_ = best_clf.alphas_
+                    elif hasattr(best_clf, "alpha_"):
+                        alphas_ = best_clf.alpha_
+                    elif hasattr(best_clf, "Cs_"):
+                        alphas_ = best_clf.Cs_
+                else:
+                    alphas_= None
+                coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
+        except Exception as e:
+            alphas_,coef_ = None,None
+            print(f"skiped {clf}: {e}")
+            continue
+
+        # try to make predict format consistant
+        try:
+            y_pred= [i[0] for i in y_pred]
+        except:
+            pass
+        try:
+            y_true= [i[0] for i in y_true]
+        except:
+            pass
+        try:
+            y_train= [i[0] for i in y_train]
+        except:
+            pass
         validation_scores = {}
 
         if y_true is not None and y_pred_proba is not None:
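Each model is now tuned inside a try/except so a failing estimator is skipped instead of aborting the whole loop, and probability scores fall back to a min-max-scaled `decision_function` when `predict_proba` is missing. A condensed sketch of that pattern, using scikit-learn only:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.svm import SVC

X, y = make_classification(n_samples=200, n_features=10, random_state=1)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

models = {"SVM": (SVC(), {"C": [0.1, 1, 10]})}
for name, (clf, grid) in models.items():
    try:
        gs = GridSearchCV(clf, param_grid=grid, scoring="roc_auc", cv=cv, n_jobs=-1)
        gs.fit(X, y)
        best = gs.best_estimator_
        if hasattr(best, "predict_proba"):
            scores = best.predict_proba(X)[:, 1]
        else:
            # SVC without probability=True: rescale the margin to [0, 1]
            d = best.decision_function(X)
            scores = (d - d.min()) / (d.max() - d.min())
        print(name, gs.best_params_, scores[:3].round(3))
    except Exception as e:
        print(f"skipped {name}: {e}")
        continue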
@@ -3294,20 +3920,26 @@ def predict(
                     "roc_curve": roc_info,
                     "pr_curve": pr_info,
                     "confusion_matrix": confusion_matrix(y_true, y_pred),
-                    "predictions": y_pred
+                    "predictions": y_pred,#.tolist(),
                     "predictions_proba": (
                         y_pred_proba.tolist() if y_pred_proba is not None else None
                     ),
+                    "features":share_col_names,
+                    "coef":coef_,
+                    "alphas":alphas_
                 }
             else:  # "regression"
                 results[name] = {
                     "best_clf": gs.best_estimator_,
                     "best_params": gs.best_params_,
                     "scores": validation_scores,  # e.g., neg_MSE, R², etc.
-                    "predictions": y_pred
+                    "predictions": y_pred,#.tolist(),
                     "predictions_proba": (
                         y_pred_proba.tolist() if y_pred_proba is not None else None
                     ),
+                    "features":share_col_names,
+                    "coef":coef_,
+                    "alphas":alphas_
                 }
         else:  # multi-classes
             if y_pred_proba is not None:
@@ -3346,20 +3978,26 @@ def predict(
                     "roc_curve": roc_info,
                     "pr_curve": pr_info,
                     "confusion_matrix": confusion_matrix(y_true, y_pred),
-                    "predictions": y_pred
+                    "predictions": y_pred,#.tolist(),
                     "predictions_proba": (
                         y_pred_proba.tolist() if y_pred_proba is not None else None
                     ),
+                    "features":share_col_names,
+                    "coef":coef_,
+                    "alphas":alphas_
                 }
             else:  # "regression"
                 results[name] = {
                     "best_clf": gs.best_estimator_,
                     "best_params": gs.best_params_,
                     "scores": validation_scores,  # e.g., neg_MSE, R², etc.
-                    "predictions": y_pred
+                    "predictions": y_pred,#.tolist(),
                     "predictions_proba": (
                         y_pred_proba.tolist() if y_pred_proba is not None else None
                     ),
+                    "features":share_col_names,
+                    "coef":coef_,
+                    "alphas":alphas_
                 }
 
     else:
@@ -3378,17 +4016,21 @@ def predict(
                 "best_clf": gs.best_estimator_,
                 "best_params": gs.best_params_,
                 "scores": validation_scores,
-                "predictions": y_pred
+                "predictions": y_pred,  #.tolist(),
                 "predictions_proba": (
                     y_pred_proba.tolist() if y_pred_proba is not None else None
                 ),
+                "features": share_col_names,
                 "y_train": y_train if y_train is not None else [],
                 "y_true": y_true if y_true is not None else [],
+                "coef": coef_,
+                "alphas": alphas_
             }
 
     # Convert results to DataFrame
     df_results = pd.DataFrame.from_dict(results, orient="index")
-
+    display(df_results)
+    # sort
     if y_true is not None:
         if purpose == "classification":
             df_scores = pd.DataFrame(
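One caveat with the newly added `display(df_results)`: `display` is a builtin only inside IPython/Jupyter sessions, so a plain script run would raise `NameError`. A guarded import, as a sketch (the plain-print fallback is an assumption, not py2ls behavior):

try:
    from IPython.display import display  # rich table rendering in notebooks
except ImportError:
    display = print  # fallback outside IPython: degrade to plain printing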
@@ -3446,7 +4088,7 @@ def predict(
         for i, j in top_models.to_dict().items():
             base_estimators.append((i, j))
         if stacking_cv:
-            print(f"
+            print(f"⤵ stacking_cv is processing...")
             #* define a few symbolic candidates for the final_estimator
             # several alternatives to choose from
             if purpose == "classification":
@@ -3520,7 +4162,7 @@ def predict(
             best_final_estimator = cv_results_df.iloc[0]['final_estimator']
             print(f"Best final estimator based on cross-validation: {best_final_estimator}")
         else:
-            print(f"
+            print(f"⤵ trying to find the best_final_estimator for stacking...")
             if purpose == "classification":
                 best_final_estimator = LogisticRegression(class_weight=class_weight,
                                                           random_state=random_state,
@@ -3530,26 +4172,25 @@ def predict(
         print(f"⤵ the best best_final_estimator: {best_final_estimator}")
         #! apply stacking
         if purpose == "classification":
-            print(f"
+            print(f"⤵ StackingClassifier...")
             stacking_model = StackingClassifier(estimators=base_estimators,
                                                 final_estimator=best_final_estimator,
                                                 cv=cv)
         else:
-            print(f"
+            print(f"⤵ StackingRegressor...")
             stacking_model = StackingRegressor(estimators=base_estimators,
                                                final_estimator=best_final_estimator,
                                                cv=cv)
 
         # Train the Stacking Classifier
-        print(f"
+        print(f"⤵ fit & predict...")
         stacking_model.fit(x_train, y_train)
         y_pred_final = stacking_model.predict(x_true)
-        print(f"
+        print(f"⤵ collecting results...")
         # pred_proba
         if is_binary:
             if hasattr(stacking_model, "predict_proba"):
                 y_pred_proba_final = stacking_model.predict_proba(x_true)
-                print("Shape of predicted probabilities:", y_pred_proba_final.shape)
                 if y_pred_proba_final.shape[1] == 1:
                     y_pred_proba_final = np.hstack(
                         [1 - y_pred_proba_final, y_pred_proba_final]
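The stacking path is the standard scikit-learn pattern: the top grid-search winners become base estimators and a simple meta-model combines their out-of-fold predictions. A self-contained sketch with placeholder data and estimators (not the py2ls defaults), including the same single-column `predict_proba` fix used above:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=200, random_state=1)
base_estimators = [("dt", DecisionTreeClassifier(random_state=1)),
                   ("knn", KNeighborsClassifier())]
stacking_model = StackingClassifier(estimators=base_estimators,
                                    final_estimator=LogisticRegression(),
                                    cv=5)
stacking_model.fit(X, y)
proba = stacking_model.predict_proba(X)
if proba.shape[1] == 1:  # defensive: pad to two columns, mirroring the hunk above
    proba = np.hstack([1 - proba, proba])
print(proba.shape)  # (200, 2)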
@@ -3564,6 +4205,17 @@ def predict(
                     )
             else:
                 y_pred_proba_final = None  # No probability output for certain models
+            # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+            if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                if hasattr(best_clf, "alphas_"):
+                    alphas_ = best_clf.alphas_
+                elif hasattr(best_clf, "alpha_"):
+                    alphas_ = best_clf.alpha_
+                elif hasattr(best_clf, "Cs_"):
+                    alphas_ = best_clf.Cs_
+            else:
+                alphas_ = None
+            coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
         if not is_binary:
             # Handle prediction probabilities for multiclass
             if hasattr(stacking_model, "predict_proba"):
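This attribute probing for `alphas_`, `alpha_`, and `Cs_` now recurs verbatim in the stacking and voting branches, and it still inspects `best_clf` (the last grid-search winner) rather than the ensemble model itself. Factored into a helper, the same pattern might look like this sketch (the function name is illustrative):

def get_regularization_info(est):
    """Return (alphas, coef) if the estimator exposes them, else (None, None).

    `alphas_` covers LassoCV/ElasticNetCV paths, `alpha_` the selected value,
    and `Cs_` the LogisticRegressionCV grid; unlike the guarded version in the
    diff, this flat chain also catches estimators that expose only `alpha_`.
    """
    if hasattr(est, "alphas_"):
        alphas = est.alphas_
    elif hasattr(est, "alpha_"):
        alphas = est.alpha_
    elif hasattr(est, "Cs_"):
        alphas = est.Cs_
    else:
        alphas = None
    coef = est.coef_ if hasattr(est, "coef_") else None
    return alphas, coef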
@@ -3581,6 +4233,17 @@ def predict(
                     )
             else:
                 y_pred_proba_final = None  # No probability output for certain models
+            # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+            if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                if hasattr(best_clf, "alphas_"):
+                    alphas_ = best_clf.alphas_
+                elif hasattr(best_clf, "alpha_"):
+                    alphas_ = best_clf.alpha_
+                elif hasattr(best_clf, "Cs_"):
+                    alphas_ = best_clf.Cs_
+            else:
+                alphas_ = None
+            coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
         #! dict_pred_stack
         dict_pred_stack = {}
         validation_scores_final = {}
@@ -3631,6 +4294,9 @@ def predict(
                         "predictions_proba": (
                             y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
                         ),
+                        "features": share_col_names,
+                        "coef": coef_,
+                        "alphas": alphas_
                     }
                 else:  # "regression"
                     dict_pred_stack = {
@@ -3641,6 +4307,9 @@ def predict(
                         "predictions_proba": (
                             y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
                         ),
+                        "features": share_col_names,
+                        "coef": coef_,
+                        "alphas": alphas_
                     }
             else:  # multi-classes
                 if y_pred_proba_final is not None:
@@ -3680,6 +4349,9 @@ def predict(
                         "predictions_proba": (
                             y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
                         ),
+                        "features": share_col_names,
+                        "coef": coef_,
+                        "alphas": alphas_
                     }
                 else:  # "regression"
                     dict_pred_stack = {
@@ -3690,6 +4362,9 @@ def predict(
                         "predictions_proba": (
                             y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
                         ),
+                        "features": share_col_names,
+                        "coef": coef_,
+                        "alphas": alphas_
                     }
 
         else:
@@ -3712,8 +4387,11 @@ def predict(
                 "predictions_proba": (
                     y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
                 ),
+                "features": share_col_names,
                 "y_train": y_train if y_train is not None else [],
                 "y_true": y_true if y_true is not None else [],
+                "coef": coef_,
+                "alphas": alphas_
             }
         # merge together
         df_pred = pd.DataFrame(
@@ -3728,16 +4406,16 @@ def predict(
         # if dir_save:
         #     ips.figsave(dir_save + f"validate_features_stacking_{now_}.pdf")
     if vote:
-        print(f"
+        print(f"⤵ voting...")
         from sklearn.ensemble import VotingClassifier, VotingRegressor
-        #!
+        #! voting
         n_top_models = min(n_top_models, df_results.shape[0])
         base_estimators = []
         for name, cls in zip(list(df_results.iloc[:n_top_models, :].index), df_results.iloc[:n_top_models, :]["best_clf"].tolist()):
             base_estimators.append((name, cls))
         # Apply Voting Classifier/Regressor
         if purpose == "classification":
-            print(f"
+            print(f"⤵ VotingClassifier...via {voting}")
             if voting == 'hard':
                 # Hard voting does not support `predict_proba`
                 voting_model = VotingClassifier(estimators=base_estimators)
@@ -3745,7 +4423,7 @@ def predict(
                 # Soft voting supports `predict_proba`
                 voting_model = VotingClassifier(estimators=base_estimators, voting="soft")
         else:
-            print(f"
+            print(f"⤵ VotingRegressor...")
             voting_model = VotingRegressor(estimators=base_estimators)
 
         # Train the Voting Classifier/Regressor
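Hard voting tallies predicted labels while soft voting averages `predict_proba`, which is why the hard branch above builds the classifier without `voting="soft"` and yields no probabilities later. A minimal sketch of the difference (placeholder data and estimators, not the py2ls defaults):

from sklearn.datasets import make_classification
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=200, random_state=1)
estimators = [("lr", LogisticRegression(max_iter=1000)),
              ("dt", DecisionTreeClassifier(random_state=1))]

hard = VotingClassifier(estimators=estimators).fit(X, y)  # voting="hard" is the default
soft = VotingClassifier(estimators=estimators, voting="soft").fit(X, y)

print(hard.predict(X[:3]))        # majority vote over predicted labels
print(soft.predict_proba(X[:3]))  # averaged class probabilities
# hard.predict_proba(...) would raise AttributeError: unavailable for hard voting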
@@ -3770,10 +4448,23 @@ def predict(
                     y_pred_proba_vote = y_pred_proba_vote[:, 1]
             else:
                 y_pred_proba_vote = None
+
+            # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+            if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                if hasattr(best_clf, "alphas_"):
+                    alphas_ = best_clf.alphas_
+                elif hasattr(best_clf, "alpha_"):
+                    alphas_ = best_clf.alpha_
+                elif hasattr(best_clf, "Cs_"):
+                    alphas_ = best_clf.Cs_
+            else:
+                alphas_ = None
+            coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
         else:  # Regression
             y_pred_proba_vote = None
+            coef_, alphas_ = None, None
 
-        print(f"
+        print(f"⤵ collecting voting results...")
         #! dict_pred_vote
         dict_pred_vote = {}
         validation_scores_vote = {}
@@ -3822,6 +4513,9 @@ def predict(
                         "predictions_proba": (
                             y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
                         ),
+                        "features": share_col_names,
+                        "coef": coef_,
+                        "alphas": alphas_
                     }
             else:  # Multi-class
                 if y_pred_proba_vote is not None:
@@ -3856,6 +4550,9 @@ def predict(
                         "predictions_proba": (
                             y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
                         ),
+                        "features": share_col_names,
+                        "coef": coef_,
+                        "alphas": alphas_
                     }
         else:
            if y_true is None:
@@ -3877,6 +4574,7 @@ def predict(
                 "predictions_proba": (
                     y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
                 ),
+                "features": share_col_names,
                 "y_train": y_train if y_train is not None else [],
                 "y_true": y_true if y_true is not None else [],
             }
@@ -3900,6 +4598,8 @@ def predict(
         df_res = pd.concat([df_vote, df_results], ignore_index=False, axis=0)
     elif stack:
         df_res = pd.concat([df_pred, df_results], ignore_index=False, axis=0)
+    else:
+        df_res = df_results
 
     if all([plot_, y_true is not None, purpose == "classification"]):
         from datetime import datetime
@@ -3907,9 +4607,15 @@ def predict(
         now_ = datetime.now().strftime("%y%m%d_%H%M%S")
         # try:
         if df_res.shape[0] > 3:
-
+            try:
+                plot_validate_features(df_res, is_binary=is_binary)
+            except Exception as e:
+                print(e)
         else:
-
+            try:
+                plot_validate_features_single(df_res, is_binary=is_binary)
+            except Exception as e:
+                print(e)
         if dir_save:
             ips.figsave(dir_save + f"validate_features{now_}.pdf")
         # except Exception as e: