py2ls 0.2.4.25__py3-none-any.whl → 0.2.4.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py2ls/.DS_Store +0 -0
- py2ls/.git/index +0 -0
- py2ls/corr.py +475 -0
- py2ls/data/.DS_Store +0 -0
- py2ls/data/hyper_param_autogluon_zeroshot2024.json +2383 -0
- py2ls/data/styles/.DS_Store +0 -0
- py2ls/data/styles/example/.DS_Store +0 -0
- py2ls/data/usages_sns.json +6 -1
- py2ls/ips.py +1059 -114
- py2ls/ml2ls.py +758 -186
- py2ls/netfinder.py +204 -20
- py2ls/ocr.py +60 -4
- py2ls/plot.py +916 -141
- {py2ls-0.2.4.25.dist-info → py2ls-0.2.4.27.dist-info}/METADATA +6 -1
- {py2ls-0.2.4.25.dist-info → py2ls-0.2.4.27.dist-info}/RECORD +16 -14
- py2ls/data/usages_pd copy.json +0 -1105
- {py2ls-0.2.4.25.dist-info → py2ls-0.2.4.27.dist-info}/WHEEL +0 -0
py2ls/ml2ls.py
CHANGED
@@ -31,6 +31,7 @@ from sklearn.metrics import (
     average_precision_score,
 )
 from typing import Dict, Any, Optional, List, Union
+import os, json
 import numpy as np
 import pandas as pd
 from . import ips
@@ -49,7 +50,13 @@ logger = logging.getLogger()
 warnings.filterwarnings("ignore", category=UserWarning)
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.neighbors import KNeighborsClassifier
-
+#* set random_state global
+import torch
+import random
+random_state=1
+random.seed(random_state)
+np.random.seed(random_state)
+torch.manual_seed(random_state)
 
 def features_knn(
     x_train: pd.DataFrame, y_train: pd.Series, knn_params: dict
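The hunk above pins one global random_state across Python's random, NumPy, and PyTorch, so repeated runs reproduce the same shuffles, splits, and weight initializations. A minimal standalone sketch of the effect (assuming torch is installed, which the module now imports unconditionally):

import random
import numpy as np
import torch

random_state = 1
random.seed(random_state)
np.random.seed(random_state)
torch.manual_seed(random_state)

# With all three generators reseeded, each call below
# produces the same values on every program start.
print(random.random())
print(np.random.rand(2))
print(torch.rand(2))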
@@ -594,7 +601,7 @@ def get_features(
     """
     from sklearn.compose import ColumnTransformer
     from sklearn.preprocessing import StandardScaler, OneHotEncoder
-
+    from sklearn.model_selection import train_test_split
     # Ensure X and y are DataFrames/Series for consistency
     if isinstance(X, np.ndarray):
         X = pd.DataFrame(X)
@@ -922,10 +929,26 @@ def get_features(
             "feature_importances": feature_importances,
         }
         if all([plot_, dir_save]):
+
             from datetime import datetime
-
             now_ = datetime.now().strftime("%y%m%d_%H%M%S")
             ips.figsave(dir_save + f"features{now_}.pdf")
+
+            lists = []
+            for tp in ips.flatten(features_df["type"]):
+                lists.append(
+                    features_df
+                    .loc[features_df["type"] == tp, "feature"]
+                    .tolist()
+                )
+            labels = ips.flatten(features_df["type"])
+            # current_fig = plt.gcf()
+            # # ax = current_fig.add_subplot(3, 2, 6)
+            # gs = current_fig.add_gridspec(3, 2)
+            # ax = current_fig.add_subplot(gs[:, :])
+            plt.figure(figsize=[6,6])
+            plot.venn(lists, labels, cmap="coolwarm")
+            ips.figsave(dir_save + f"features{now_}shared_features.pdf")
     else:
         results = {
             "selected_features": pd.DataFrame(),
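The added block groups the selected features by selector type and hands the lists to plot.venn to visualize how much the selectors agree. A rough sketch of the same grouping with plain pandas (the toy frame and column names mirror features_df but are made up):

import pandas as pd

features_df = pd.DataFrame({
    "feature": ["f1", "f2", "f2", "f3"],
    "type": ["knn", "knn", "lasso", "lasso"],
})
lists = [
    features_df.loc[features_df["type"] == tp, "feature"].tolist()
    for tp in features_df["type"].unique()
]
labels = features_df["type"].unique().tolist()
print(dict(zip(labels, lists)))  # {'knn': ['f1', 'f2'], 'lasso': ['f2', 'f3']}

Note the diff iterates ips.flatten(features_df["type"]) directly, so duplicate type values would produce duplicate lists; unique() is used here only to keep the sketch compact.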
@@ -1247,22 +1270,25 @@ def plot_validate_features(res_val, is_binary=True, figsize=None):
     nexttile = plot.subplot(figsize=figsize)
     ax = nexttile(subplot_layout[0], subplot_layout[1])
     for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-        […16 removed lines, collapsed in the source view…]
+        try:
+            fpr = res_val["roc_curve"][model_name]["fpr"]
+            tpr = res_val["roc_curve"][model_name]["tpr"]
+            (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
+            mean_auc = res_val["roc_curve"][model_name]["auc"]
+            plot_roc_curve(
+                fpr,
+                tpr,
+                mean_auc,
+                lower_ci,
+                upper_ci,
+                model_name=model_name,
+                lw=1.5,
+                color=colors[i],
+                alpha=alpha,
+                ax=ax,
+            )
+        except Exception as e:
+            print(e)
     plot.figsets(
         sp=2,
         legend=dict(
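From this hunk on, every per-model curve is drawn inside its own try/except, so a single model with a missing or malformed roc_curve entry no longer aborts the whole figure. The guard pattern in isolation (hypothetical data):

results = {"LogisticRegression": {"auc": 0.91}, "SVM": None}
for name, res in results.items():
    try:
        print(name, res["auc"])  # the None entry raises TypeError
    except Exception as e:
        print(f"skipped {name}: {e}")  # ...and the loop moves on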
@@ -1277,16 +1303,19 @@ def plot_validate_features(res_val, is_binary=True, figsize=None):
 
     ax = nexttile(subplot_layout[0], subplot_layout[1])
     for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-        […10 removed lines, collapsed in the source view…]
+        try:
+            plot_pr_curve(
+                recall=res_val["pr_curve"][model_name]["recall"],
+                precision=res_val["pr_curve"][model_name]["precision"],
+                avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
+                model_name=model_name,
+                color=colors[i],
+                lw=1.5,
+                alpha=alpha,
+                ax=ax,
+            )
+        except Exception as e:
+            print(e)
     plot.figsets(
         sp=2,
         legend=dict(
@@ -1314,22 +1343,25 @@ def plot_validate_features(res_val, is_binary=True, figsize=None):
     for iclass, class_ in enumerate(classes):
         ax = nexttile(subplot_layout[0], subplot_layout[1])
         for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-            […16 removed lines, collapsed in the source view…]
+            try:
+                fpr = res_val["roc_curve"][model_name]["fpr"][class_]
+                tpr = res_val["roc_curve"][model_name]["tpr"][class_]
+                (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"][iclass]
+                mean_auc = res_val["roc_curve"][model_name]["auc"][class_]
+                plot_roc_curve(
+                    fpr,
+                    tpr,
+                    mean_auc,
+                    lower_ci,
+                    upper_ci,
+                    model_name=model_name,
+                    lw=1.5,
+                    color=colors[i],
+                    alpha=alpha,
+                    ax=ax,
+                )
+            except Exception as e:
+                print(e)
         plot.figsets(
             sp=2,
             title=class_,
@@ -1345,18 +1377,21 @@ def plot_validate_features(res_val, is_binary=True, figsize=None):
 
         ax = nexttile(subplot_layout[0], subplot_layout[1])
         for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-            […12 removed lines, collapsed in the source view…]
+            try:
+                plot_pr_curve(
+                    recall=res_val["pr_curve"][model_name]["recall"][iclass],
+                    precision=res_val["pr_curve"][model_name]["precision"][iclass],
+                    avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
+                        iclass
+                    ],
+                    model_name=model_name,
+                    color=colors[i],
+                    lw=1.5,
+                    alpha=alpha,
+                    ax=ax,
+                )
+            except Exception as e:
+                print(e)
         plot.figsets(
             sp=2,
             title=class_,
@@ -1379,37 +1414,41 @@ def plot_validate_features_single(res_val, figsize=None, is_binary=True):
         len(ips.flatten(res_val["pr_curve"].index)), 3, figsize=figsize
     )
     for model_name in ips.flatten(res_val["pr_curve"].index):
-        […5 removed lines, collapsed in the source view…]
-        # Plotting
-        plot_roc_curve(
-            fpr,
-            tpr,
-            mean_auc,
-            lower_ci,
-            upper_ci,
-            model_name=model_name,
-            ax=nexttile(),
-        )
-        plot.figsets(title=model_name, sp=2)
+        try:
+            fpr = res_val["roc_curve"][model_name]["fpr"]
+            tpr = res_val["roc_curve"][model_name]["tpr"]
+            (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
+            mean_auc = res_val["roc_curve"][model_name]["auc"]
 
-        […8 removed lines, collapsed in the source view…]
+            # Plotting
+            plot_roc_curve(
+                fpr,
+                tpr,
+                mean_auc,
+                lower_ci,
+                upper_ci,
+                model_name=model_name,
+                ax=nexttile(),
+            )
+            plot.figsets(title=model_name, sp=2)
 
-        […5 removed lines, collapsed in the source view…]
+            plot_pr_binary(
+                recall=res_val["pr_curve"][model_name]["recall"],
+                precision=res_val["pr_curve"][model_name]["precision"],
+                avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
+                model_name=model_name,
+                ax=nexttile(),
+            )
+            plot.figsets(title=model_name, sp=2)
+
+            # plot cm
+            plot_cm(
+                res_val["confusion_matrix"][model_name], ax=nexttile(), normalize=False
+            )
+            plot.figsets(title=model_name, sp=2)
+
+        except Exception as e:
+            print(e)
     else:
 
         modname_tmp = ips.flatten(res_val["roc_curve"].index)[0]
@@ -1424,22 +1463,25 @@ def plot_validate_features_single(res_val, figsize=None, is_binary=True):
     for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
         ax = nexttile()
         for iclass, class_ in enumerate(classes):
-            […16 removed lines, collapsed in the source view…]
+            try:
+                fpr = res_val["roc_curve"][model_name]["fpr"][class_]
+                tpr = res_val["roc_curve"][model_name]["tpr"][class_]
+                (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"][iclass]
+                mean_auc = res_val["roc_curve"][model_name]["auc"][class_]
+                plot_roc_curve(
+                    fpr,
+                    tpr,
+                    mean_auc,
+                    lower_ci,
+                    upper_ci,
+                    model_name=class_,
+                    lw=1.5,
+                    color=colors[iclass],
+                    alpha=0.03,
+                    ax=ax,
+                )
+            except Exception as e:
+                print(e)
         plot.figsets(
             sp=2,
             title=model_name,
@@ -1451,18 +1493,21 @@ def plot_validate_features_single(res_val, figsize=None, is_binary=True):
 
         ax = nexttile()
         for iclass, class_ in enumerate(classes):
-            […12 removed lines, collapsed in the source view…]
+            try:
+                plot_pr_curve(
+                    recall=res_val["pr_curve"][model_name]["recall"][iclass],
+                    precision=res_val["pr_curve"][model_name]["precision"][iclass],
+                    avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
+                        iclass
+                    ],
+                    model_name=class_,
+                    color=colors[iclass],
+                    lw=1.5,
+                    alpha=0.03,
+                    ax=ax,
+                )
+            except Exception as e:
+                print(e)
         plot.figsets(
             sp=2,
             title=class_,
@@ -1543,15 +1588,12 @@ def cal_auc_ci(
         # print("Bootstrap #{} ROC area: {:0.3f}".format(i + 1, score))
     sorted_scores = np.array(bootstrapped_scores)
     sorted_scores.sort()
-
-    # Computing the lower and upper bound of the 90% confidence interval
-    # You can change the bounds percentiles to 0.025 and 0.975 to get
-    # a 95% confidence interval instead.
+
     confidence_lower = sorted_scores[int((1 - ci) * len(sorted_scores))]
     confidence_upper = sorted_scores[int(ci * len(sorted_scores))]
     if verbose:
         print(
-            "Confidence interval for the score: [{:0.3f} - {:0.
+            "Confidence interval for the score: [{:0.3f} - {:0.3f}]".format(
                 confidence_lower, confidence_upper
             )
         )
@@ -1568,11 +1610,8 @@ def cal_auc_ci(
         y_true, classes=np.unique(y_true)
     )  # One-vs-Rest transformation
     n_classes = y_true_bin.shape[1]  # Number of classes
-
-    bootstrapped_scores = np.
-        (n_classes, n_bootstraps)
-    )  # Store scores for each class
-
+
+    bootstrapped_scores = np.full((n_classes, n_bootstraps), np.nan)
     if verbose:
         print("AUROC scores for each class:")
     for i in range(n_classes):
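The score array is now pre-filled with np.nan, so bootstrap iterations that fail (e.g., a resample containing only one class) leave a NaN rather than a misleading zero; the next hunk filters those NaNs out before taking the percentile bounds. A standalone sketch of that NaN-aware percentile CI (synthetic scores; a flat array is used here rather than the per-class matrix in the code):

import numpy as np

rng = np.random.default_rng(1)
scores = rng.uniform(0.6, 0.9, size=1000)
scores[::50] = np.nan  # simulate failed bootstrap resamples
ci = 0.95

valid = scores[~np.isnan(scores)]
valid.sort()
lower = valid[int((1 - ci) * len(valid))]  # percentile bounds by sorted index,
upper = valid[int(ci * len(valid))]        # matching the indexing in the diff
print(f"CI: [{lower:.3f} - {upper:.3f}]")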
@@ -1592,15 +1631,24 @@ def cal_auc_ci(
     # Calculating the confidence intervals for each class
     confidence_intervals = []
     for class_idx in range(n_classes):
-        […8 removed lines, collapsed in the source view…]
-        )
+        # rm nan
+        valid_scores = bootstrapped_scores[class_idx][
+            ~np.isnan(bootstrapped_scores[class_idx])
+        ]
+        if len(valid_scores) > 0:
+            sorted_scores = np.sort(valid_scores)
+            confidence_lower = sorted_scores[int((1 - ci) * len(sorted_scores))]
+            confidence_upper = sorted_scores[int(ci * len(sorted_scores))]
+            confidence_intervals[class_idx] = (confidence_lower, confidence_upper)
+
+            if verbose:
+                print(
+                    f"Class {class_idx} - Confidence interval: [{confidence_lower:.3f} - {confidence_upper:.3f}]"
+                )
+        else:
+            confidence_intervals[class_idx] = (np.nan, np.nan)
+            if verbose:
+                print(f"Class {class_idx} - Confidence interval: [NaN - NaN]")
 
     return confidence_intervals
 
@@ -2057,20 +2105,20 @@ def rank_models(
 
     def generate_bar_plot(ax, cv_test_scores):
         ax = plot.plotxy(
-            y="Classifier", x="combined_score", data=cv_test_scores,
+            y="Classifier", x="combined_score", data=cv_test_scores, kind_="bar"
         )
         plt.title("Classifier Performance")
         plt.tight_layout()
         return plt
 
-    nexttile = plot.subplot(2, 2, figsize=[10,
+    nexttile = plot.subplot(2, 2, figsize=[10, 10])
     generate_bar_plot(nexttile(), top_models.dropna())
     plot.radar(
         ax=nexttile(projection="polar"),
         data=cv_test_scores.set_index("Classifier"),
-        ylim=[0
-        color=plot.get_color(
-        alpha=0.
+        ylim=[0, 1],
+        color=plot.get_color(cv_test_scores.set_index("Classifier").shape[1]),
+        alpha=0.02,
         circular=1,
     )
     return cv_test_scores
@@ -2216,7 +2264,7 @@ def predict(
     metrics: Optional[List[str]] = None,
     stack:bool=True,# run stacking
     stacking_cv:bool=False,# stacking cross_validate, default(False),keep it simple
-    vote:bool=
+    vote:bool=False,# run voting
     voting:str="hard", # only for classification purporse of voting
     n_top_models:int=5, #for stacking models
     n_models_per_category:int=1, #for stacking models,可以允许同一个类别2种模型 (up to 2 model types per category)
@@ -2229,7 +2277,12 @@ def predict(
     cv_level: str = "l", # "s":'low',"m":'medium',"l":"high"
     class_weight: str = "balanced",
     random_state: int = 1,
+    presets = "best_quality",# specific for autogluon
+    time_limit=600, # specific for autogluon
+    num_bag_folds=5, # specific for autogluon
+    num_stack_levels=2, # specific for autogluon
     verbose: bool = False,
+    **kwargs
 ) -> pd.DataFrame:
     """
     第一种情况是内部拆分,第二种是直接预测,第三种是外部验证。 (Case 1: internal train/test split; case 2: direct prediction; case 3: external validation.)
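With the new keyword arguments, an AutoGluon run can be requested straight from predict. A hypothetical call (the toy frame is made up; argument names come from the signature above and from the branch shown later in this diff):

import pandas as pd
from py2ls import ml2ls

df = pd.DataFrame({"f1": [0, 1, 0, 1], "f2": [1, 0, 1, 0], "y": [0, 1, 0, 1]})

res = ml2ls.predict(
    x_train=df,
    y_train="y",               # a column name is accepted, per the code below
    x_true=df.drop(columns="y"),
    purpose="classification",
    cls=["auto"],              # routes everything to the "autogluon_tab" branch
    presets="best_quality",
    time_limit=600,            # seconds
    num_bag_folds=5,
    num_stack_levels=2,
)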
@@ -2280,28 +2333,20 @@ def predict(
         RandomForestRegressor,
         ExtraTreesClassifier,
         ExtraTreesRegressor,
+        HistGradientBoostingRegressor,
         BaggingClassifier,
         BaggingRegressor,
         AdaBoostClassifier,
         AdaBoostRegressor,
     )
-    from sklearn.svm import SVC, SVR
-    from sklearn.tree import DecisionTreeRegressor
+    from sklearn.svm import SVC, SVR, LinearSVR, NuSVR
+    from sklearn.tree import DecisionTreeRegressor,ExtraTreeRegressor
     from sklearn.linear_model import (
-        LogisticRegression,
-        […4 removed lines, collapsed in the source view…]
-        RidgeClassifierCV,
-        Perceptron,
-        SGDClassifier,
-        RidgeCV,
-        Ridge,
-        TheilSenRegressor,
-        HuberRegressor,
-        PoissonRegressor,
-
+        LogisticRegression,ElasticNet,ElasticNetCV,
+        LinearRegression,Lasso,RidgeClassifierCV,Perceptron,SGDClassifier,
+        RidgeCV,Ridge,TheilSenRegressor,HuberRegressor,PoissonRegressor,Lars, LassoLars, BayesianRidge,
+        GammaRegressor, TweedieRegressor, LassoCV, LassoLarsCV, LarsCV,
+        OrthogonalMatchingPursuit, OrthogonalMatchingPursuitCV, PassiveAggressiveRegressor
     )
     from sklearn.compose import TransformedTargetRegressor
     from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
@@ -2318,10 +2363,16 @@ def predict(
     )
     from sklearn.preprocessing import PolynomialFeatures
     from sklearn.model_selection import train_test_split
-
+
+    from sklearn.gaussian_process import GaussianProcessRegressor
+    from sklearn.kernel_ridge import KernelRidge
+    from sklearn.dummy import DummyRegressor
+    from autogluon.tabular import TabularPredictor
     # 拼写检查 (spell-check / fuzzy-match the purpose string)
     purpose = ips.strcmp(purpose, ["classification", "regression"])[0]
     print(f"{purpose} processing...")
+
+
     # Default models or regressors if not provided
     if purpose == "classification":
         model_ = {
@@ -2374,27 +2425,44 @@ def predict(
         model_ = {
             "Random Forest": RandomForestRegressor(random_state=random_state,n_jobs=n_jobs),
             "SVM": SVR(),  # SVR (Support Vector Regression)
-
-            "LassoCV": LassoCV(
-                cv=cv_folds, random_state=random_state,n_jobs=n_jobs
-            ),  # LassoCV自动找出最适alpha,优于Lasso
+            "LassoCV": LassoCV(cv=cv_folds, random_state=random_state,n_jobs=n_jobs), # LassoCV自动找出最适alpha,优于Lasso (LassoCV picks the best alpha automatically, better than plain Lasso)
             "Gradient Boosting": GradientBoostingRegressor(random_state=random_state),
             "XGBoost": xgb.XGBRegressor(eval_metric="rmse", random_state=random_state,n_jobs=n_jobs),
             "Linear Regression": LinearRegression(n_jobs=n_jobs),
             "AdaBoost": AdaBoostRegressor(random_state=random_state),
-            "LightGBM": lgb.LGBMRegressor(random_state=random_state,n_jobs=n_jobs,
-                force_row_wise=True  # Or use force_col_wise=True if memory is a concern
-            ),
+            "LightGBM": lgb.LGBMRegressor(random_state=random_state,n_jobs=n_jobs,force_row_wise=True), # Or use force_col_wise=True if memory is a concern
             "CatBoost": cb.CatBoostRegressor(verbose=0, random_state=random_state),
             "Extra Trees": ExtraTreesRegressor(random_state=random_state,n_jobs=n_jobs),
             "Bagging": BaggingRegressor(random_state=random_state,n_jobs=n_jobs),
             "Neural Network": MLPRegressor(max_iter=500, random_state=random_state),
             "ElasticNet": ElasticNet(random_state=random_state),
-            "Ridge": Ridge(),
+            "Ridge": Ridge(random_state=random_state),
             "KNN": KNeighborsRegressor(n_jobs=n_jobs),
             "TheilSen":TheilSenRegressor(n_jobs=n_jobs),
             "Huber":HuberRegressor(),
-            "Poisson":PoissonRegressor()
+            "Poisson":PoissonRegressor(),"LinearRegression": LinearRegression(),
+            "Lasso": Lasso(random_state=random_state),
+            "Lars": Lars(),
+            "LassoLars": LassoLars(),
+            "BayesianRidge": BayesianRidge(),
+            "GammaRegressor": GammaRegressor(),
+            "TweedieRegressor": TweedieRegressor(),
+            "LassoCV": LassoCV(random_state=random_state, n_jobs=n_jobs),
+            "ElasticNetCV": ElasticNetCV(random_state=random_state, n_jobs=n_jobs),
+            "LassoLarsCV": LassoLarsCV(n_jobs=n_jobs),
+            "LarsCV": LarsCV(),
+            "OrthogonalMatchingPursuit": OrthogonalMatchingPursuit(),
+            "OrthogonalMatchingPursuitCV": OrthogonalMatchingPursuitCV(n_jobs=n_jobs),
+            "PassiveAggressiveRegressor": PassiveAggressiveRegressor(random_state=random_state),
+            "LinearSVR": LinearSVR(random_state=random_state),
+            "NuSVR": NuSVR(),
+            "DecisionTreeRegressor": DecisionTreeRegressor(random_state=random_state),
+            "ExtraTreeRegressor": ExtraTreeRegressor(random_state=random_state),
+            "HistGradientBoostingRegressor": HistGradientBoostingRegressor(random_state=random_state),
+            "GaussianProcessRegressor": GaussianProcessRegressor(),
+            "KernelRidge": KernelRidge(),
+            "DummyRegressor": DummyRegressor(),
+            "TransformedTargetRegressor": TransformedTargetRegressor(regressor=LinearRegression())
         }
     if cls is None:
         models = model_
@@ -2411,10 +2479,17 @@ def predict(
         ips.df_special_characters_cleaner(x_true) if x_true is not None else None
     )
 
+    # only keep "autogluon_tab" in models
+    cls = [cls] if isinstance(cls, str) else cls
+
+    if cls is not None:
+        models={"autogluon_tab":None} if "auto" in cls else models
+
     # indicate cls:
     if ips.run_once_within(30):  # 10 min
         print(f"processing: {list(models.keys())}")
-
+    y_train_col_name=None
+    # print(isinstance(y_train, str) and y_train in x_train.columns)
     if isinstance(y_train, str) and y_train in x_train.columns:
         y_train_col_name = y_train
         y_train = x_train[y_train]
@@ -2531,19 +2606,261 @@ def predict(
     if isinstance(y_train, np.ndarray):
         y_true = ips.df_encoder(data=pd.DataFrame(y_true), method="label")
         y_true = np.asarray(y_true)
+    #! so far, got the: x_train,x_true,y_train,y_true
+    # Grid search with KFold or StratifiedKFold
+    if "autogluon_tab" in models:
+        # load hypoer_param
+        f_param = os.path.dirname(os.path.abspath(__file__))
+        f_param = f_param + "/data/hyper_param_autogluon_zeroshot2024.json"
+        with open(f_param, "r") as file:
+            hyper_param_autogluon = json.load(file)
+        # Train the model with AutoGluon
+        features=x_train.columns.tolist()
+        label= y_train_col_name if y_train_col_name is not None else 'target'
+        df_autogluon = x_train.copy()
+        df_autogluon[label]=y_train
+        autogluon_presets=["best_quality","good_quality","fast_train"]
+        best_clf = TabularPredictor(label=label, path=os.path.join(dir_save,"model_autogluon")).fit(
+            train_data=df_autogluon,
+            presets=ips.strcmp(presets, autogluon_presets)[0], # 'best_quality' or 'good_quality' or 'fast_train'
+            time_limit=time_limit,#3600, # in sec: Limit training time,
+            num_bag_folds=num_bag_folds,
+            num_stack_levels=num_stack_levels,
+            hyperparameters=hyper_param_autogluon,
+            verbosity=1 if verbose else 0,
+            **kwargs
+        )
+        #! Get the leaderboard
+        gs={}
+        # Display the leaderboard for reference
+        leaderboard = best_clf.leaderboard()
+        gs['info']=best_clf.info()
+        # gs["res"]=best_clf
+        gs["features"]=features
+        gs["leaderboard"] = leaderboard
+        best_model_name = leaderboard.iloc[0, 0]  # First row, first column contains the model name
+        # Store the best model and its details in the gs dictionary
+        gs["best_estimator_"] = best_model_name  # Store the trained model, not just the name
+        gs["best_params_"] = best_model_name  # Hyperparameters
+        # Make predictions if x_true is provided
+        if x_true is not None:
+            x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
+            gs["predictions"] = best_clf.predict(x_true[features],model=None)# model=None select the best
+            gs["predict_proba"] = best_clf.predict_proba(x_true[features]) if purpose=='classification' else None
+            x_true[label]=gs["predictions"]
+            if gs["predictions"].value_counts().shape[0]>1:
+                gs['evaluate'] = best_clf.evaluate(x_true[features+[label]])
+            gs["models"]=leaderboard["model"].tolist()#best_clf.model_names()
+            all_models = gs["models"]
+            model_evaluations = {}
+            for model in all_models:
+                predictions = best_clf.predict(x_true[features], model=model)
+                evaluation = best_clf.evaluate_predictions(
+                    y_true=x_true[label],  # True labels
+                    y_pred=predictions,  # Predictions from the specific model
+                    auxiliary_metrics=True,  # Include additional metrics if needed
+                )
+                model_evaluations[model] = evaluation
+            gs["scores"]=pd.DataFrame.from_dict(model_evaluations, orient='index')
+            #! 试着保持一样的格式 (try to keep the same result format as the sklearn path)
+            results = {}
+            for model in all_models:
+                y_pred = best_clf.predict(x_true[features], model=model).tolist()
+                y_pred_proba=best_clf.predict_proba(x_true[features], model=model) if purpose=='classification' else None
+
+                if isinstance(y_pred_proba, pd.DataFrame):
+                    y_pred_proba=y_pred_proba.iloc[:,1]
+
+                # try to make predict format consistant
+                try:
+                    y_pred= [i[0] for i in y_pred]
+                except:
+                    pass
+                try:
+                    y_true= [i[0] for i in y_true]
+                except:
+                    pass
+                try:
+                    y_train= [i[0] for i in y_train]
+                except:
+                    pass
+                validation_scores = {}
+                if y_true is not None and y_pred_proba is not None:
+                    validation_scores = cal_metrics(
+                        y_true,
+                        y_pred,
+                        y_pred_proba=y_pred_proba,
+                        is_binary=is_binary,
+                        purpose=purpose,
+                        average="weighted",
+                    )
+                    if is_binary:
+                        # Calculate ROC curve
+                        # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
+                        if y_pred_proba is not None:
+                            # fpr, tpr, roc_auc = dict(), dict(), dict()
+                            fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
+                            lower_ci, upper_ci = cal_auc_ci(
+                                y_true, y_pred_proba, verbose=False, is_binary=is_binary
+                            )
+                            roc_auc = auc(fpr, tpr)
+                            roc_info = {
+                                "fpr": fpr.tolist(),
+                                "tpr": tpr.tolist(),
+                                "auc": roc_auc,
+                                "ci95": (lower_ci, upper_ci),
+                            }
+                            # precision-recall curve
+                            precision_, recall_, _ = cal_precision_recall(y_true, y_pred_proba)
+                            avg_precision_ = average_precision_score(y_true, y_pred_proba)
+                            pr_info = {
+                                "precision": precision_,
+                                "recall": recall_,
+                                "avg_precision": avg_precision_,
+                            }
+                        else:
+                            roc_info, pr_info = None, None
+                        if purpose == "classification":
+                            results[model] = {
+                                # "best_clf": gs.best_estimator_,
+                                # "best_params": gs.best_params_,
+                                # "auc_indiv": [
+                                #     gs.cv_results_[f"split{i}_test_score"][gs.best_index_]
+                                #     for i in range(cv_folds)
+                                # ],
+                                "scores": validation_scores,
+                                "roc_curve": roc_info,
+                                "pr_curve": pr_info,
+                                "confusion_matrix": confusion_matrix(y_true, y_pred),
+                                "predictions": y_pred,#.tolist(),
+                                "predictions_proba": (
+                                    y_pred_proba.tolist() if y_pred_proba is not None else None
+                                ),
+                                "features":features,
+                                # "coef":coef_,
+                                # "alphas":alphas_
+                            }
+                        else:  # "regression"
+                            results[model] = {
+                                # "best_clf": gs.best_estimator_,
+                                # "best_params": gs.best_params_,
+                                "scores": validation_scores,  # e.g., neg_MSE, R², etc.
+                                "predictions": y_pred,#.tolist(),
+                                "predictions_proba": (
+                                    y_pred_proba.tolist() if y_pred_proba is not None else None
+                                ),
+                                "features":features,
+                                # "coef":coef_,
+                                # "alphas":alphas_
+                            }
+                    else:  # multi-classes
+                        if y_pred_proba is not None:
+                            # fpr, tpr, roc_auc = dict(), dict(), dict()
+                            # fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
+                            confidence_intervals = cal_auc_ci(
+                                y_true, y_pred_proba, verbose=False, is_binary=is_binary
+                            )
+                            roc_info = {
+                                "fpr": validation_scores["fpr"],
+                                "tpr": validation_scores["tpr"],
+                                "auc": validation_scores["roc_auc_by_class"],
+                                "ci95": confidence_intervals,
+                            }
+                            # precision-recall curve
+                            precision_, recall_, avg_precision_ = cal_precision_recall(
+                                y_true, y_pred_proba, is_binary=is_binary
+                            )
+                            pr_info = {
+                                "precision": precision_,
+                                "recall": recall_,
+                                "avg_precision": avg_precision_,
+                            }
+                        else:
+                            roc_info, pr_info = None, None
+
+                        if purpose == "classification":
+                            results[model] = {
+                                # "best_clf": gs.best_estimator_,
+                                # "best_params": gs.best_params_,
+                                # "auc_indiv": [
+                                #     gs.cv_results_[f"split{i}_test_score"][gs.best_index_]
+                                #     for i in range(cv_folds)
+                                # ],
+                                "scores": validation_scores,
+                                "roc_curve": roc_info,
+                                "pr_curve": pr_info,
+                                "confusion_matrix": confusion_matrix(y_true, y_pred),
+                                "predictions": y_pred,#.tolist(),
+                                "predictions_proba": (
+                                    y_pred_proba.tolist() if y_pred_proba is not None else None
+                                ),
+                                "features":features,
+                                # "coef":coef_,
+                                # "alphas":alphas_
+                            }
+                        else:  # "regression"
+                            results[model] = {
+                                # "best_clf": gs.best_estimator_,
+                                # "best_params": gs.best_params_,
+                                "scores": validation_scores,  # e.g., neg_MSE, R², etc.
+                                "predictions": y_pred,#.tolist(),
+                                "predictions_proba": (
+                                    y_pred_proba.tolist() if y_pred_proba is not None else None
+                                ),
+                                "features":features,
+                                # "coef":coef_,
+                                # "alphas":alphas_
+                            }
 
-[…11 removed lines, collapsed in the source view…]
+                else:
+                    if y_true is None:
+                        validation_scores = []
+                    else:
+                        validation_scores = cal_metrics(
+                            y_true,
+                            y_pred,
+                            y_pred_proba=y_pred_proba,
+                            is_binary=is_binary,
+                            purpose=purpose,
+                            average="weighted",
+                        )
+                    results[model] = {
+                        # "best_clf": gs.best_estimator_,
+                        # "best_params": gs.best_params_,
+                        "scores": validation_scores,
+                        "predictions": y_pred,#.tolist(),
+                        "predictions_proba": (
+                            y_pred_proba.tolist() if y_pred_proba is not None else None
+                        ),
+                        "features":features,
+                        "y_train": y_train if y_train is not None else [],
+                        "y_true": y_true if y_true is not None else [],
+                        # "coef":coef_,
+                        # "alphas":alphas_
                     }
+            df_results = pd.DataFrame.from_dict(results, orient="index")
+            gs['res']=df_results
+
+            if all([plot_, y_true is not None, purpose == "classification"]):
+                from datetime import datetime
+
+                now_ = datetime.now().strftime("%y%m%d_%H%M%S")
+                # try:
+                if df_results.shape[0] > 3:
+                    try:
+                        plot_validate_features(df_results, is_binary=is_binary)
+                    except Exception as e:
+                        print(e)
+                else:
+                    try:
+                        plot_validate_features_single(df_results, is_binary=is_binary)
+                    except Exception as e:
+                        print(e)
+                if dir_save:
+                    ips.figsave(dir_save + f"validate_features{now_}.pdf")
+        return gs
+
+    #! cross_valid
     if cv_level in ["low", "simple", "s", "l"]:
         param_grids = {
             "Random Forest": (
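The branch above wraps AutoGluon's standard tabular workflow in py2ls bookkeeping. The core of that workflow on its own (a sketch assuming autogluon.tabular is installed; the toy frame and time limit are illustrative):

import pandas as pd
from autogluon.tabular import TabularPredictor

train = pd.DataFrame({"f1": range(10), "f2": range(10, 0, -1), "target": [0, 1] * 5})

predictor = TabularPredictor(label="target", path="model_autogluon").fit(
    train_data=train,
    presets="good_quality",
    time_limit=60,  # seconds
)
leaderboard = predictor.leaderboard()  # one row per trained model
best_name = leaderboard.iloc[0, 0]     # model names sit in the first column
preds = predictor.predict(train.drop(columns="target"), model=None)  # None = best model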
@@ -2712,7 +3029,73 @@ def predict(
                 'alpha': [0.1],
                 'max_iter': [100],},
             "Poisson":{'alpha': [0.1],
-                'max_iter': [100],}
+                'max_iter': [100],},
+            "Lars": {"n_nonzero_coefs": [10, 50, None]},
+            "LassoLars": {
+                "alpha": [0.01, 0.1, 1]
+            },
+            "BayesianRidge": {
+                "alpha_1": [1e-6, 1e-4, 1e-2],
+                "lambda_1": [1e-6, 1e-4, 1e-2]
+            },
+            "GammaRegressor": {
+                "alpha": [0.1, 1, 10]
+            },
+            "TweedieRegressor": {
+                "alpha": [0.1, 1, 10],
+                "power": [1, 1.5, 2]
+            },
+            "LassoCV": {
+                "cv": [5]
+            },
+            "ElasticNetCV": {
+                "l1_ratio": [0.2, 0.5, 0.8],
+                "cv": [5]
+            },
+            "LassoLarsCV": {
+                "cv": [5]
+            },
+            "LarsCV": {
+                "cv": [5]
+            },
+            "OrthogonalMatchingPursuit": {
+                "n_nonzero_coefs": [10, 50, None]
+            },
+            "OrthogonalMatchingPursuitCV": {
+                "cv": [5]
+            },
+            "PassiveAggressiveRegressor": {
+                "C": [0.1, 1, 10]
+            },
+            "LinearSVR": {
+                "C": [0.1, 1, 10]
+            },
+            "NuSVR": {
+                "C": [0.1, 1, 10]
+            },
+            "DecisionTreeRegressor": {
+                "max_depth": [5, 10, None]
+            },
+            "ExtraTreeRegressor": {
+                "max_depth": [5, 10, None]
+            },
+            "HistGradientBoostingRegressor": {
+                "learning_rate": [0.05, 0.1, 0.2],
+                "max_depth": [5, 10, None]
+            },
+            "GaussianProcessRegressor": {
+                "alpha": [1e-5, 1e-2, 0.1]
+            },
+            "KernelRidge": {
+                "alpha": [0.1, 1, 10],
+                "kernel": ["linear", "rbf"]
+            },
+            "DummyRegressor": {
+                "strategy": ["mean", "median"]
+            },
+            "TransformedTargetRegressor": {
+                "regressor__fit_intercept": [True, False]
+            }
         }
     elif cv_level in ["high", "advanced", "h"]:
         param_grids = {
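Each grid above is later pulled by model name and handed to GridSearchCV. How a single entry is consumed, using the KernelRidge grid as written (plain scikit-learn; the synthetic data is illustrative):

from sklearn.datasets import make_regression
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV

X, y = make_regression(n_samples=100, n_features=5, random_state=1)
param_grids = {"KernelRidge": {"alpha": [0.1, 1, 10], "kernel": ["linear", "rbf"]}}

gs = GridSearchCV(KernelRidge(), param_grid=param_grids.get("KernelRidge", {}), cv=5)
gs.fit(X, y)
print(gs.best_params_)  # e.g. {'alpha': 0.1, 'kernel': 'linear'}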
@@ -2917,7 +3300,96 @@ def predict(
                 'alpha': [0.1, 1.0, 10.0],
                 'max_iter': [100, 200, 300],},
             "Poisson":{'alpha': [0.1, 1.0, 10.0],
-                'max_iter': [100, 200, 300],}
+                'max_iter': [100, 200, 300],},
+            "Lars": {
+                "n_nonzero_coefs": [10, 50, 100, 200, None]
+            },
+            "LassoLars": {
+                "alpha": [0.001, 0.01, 0.1, 1, 10]
+            },
+            "BayesianRidge": {
+                "alpha_1": [1e-6, 1e-5, 1e-4],
+                "alpha_2": [1e-6, 1e-5, 1e-4],
+                "lambda_1": [1e-6, 1e-5, 1e-4],
+                "lambda_2": [1e-6, 1e-5, 1e-4]
+            },
+            "GammaRegressor": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "max_iter": [1000, 5000, 10000]
+            },
+            "TweedieRegressor": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "power": [0, 1, 1.5, 2, 3]
+            },
+            "LassoCV": {
+                "alphas": [[0.001, 0.01, 0.1, 1, 10]],
+                "cv": [3, 5, 10]
+            },
+            "ElasticNetCV": {
+                "l1_ratio": [0.1, 0.5, 0.7, 0.9, 1],
+                "alphas": [[0.001, 0.01, 0.1, 1, 10]],
+                "cv": [3, 5, 10]
+            },
+            "LassoLarsCV": {
+                "cv": [3, 5, 10]
+            },
+            "LarsCV": {
+                "cv": [3, 5, 10]
+            },
+            "OrthogonalMatchingPursuit": {
+                "n_nonzero_coefs": [10, 50, 100, 200, None]
+            },
+            "OrthogonalMatchingPursuitCV": {
+                "cv": [3, 5, 10]
+            },
+            "PassiveAggressiveRegressor": {
+                "C": [0.01, 0.1, 1, 10],
+                "max_iter": [1000, 5000, 10000],
+                "early_stopping": [True, False]
+            },
+            "LinearSVR": {
+                "C": [0.01, 0.1, 1, 10],
+                "epsilon": [0.01, 0.1, 1],
+                "max_iter": [1000, 5000, 10000]
+            },
+            "NuSVR": {
+                "C": [0.01, 0.1, 1, 10],
+                "nu": [0.25, 0.5, 0.75],
+                "kernel": ["linear", "poly", "rbf", "sigmoid"]
+            },
+            "DecisionTreeRegressor": {
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_split": [2, 5, 10],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "ExtraTreeRegressor": {
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_split": [2, 5, 10],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "HistGradientBoostingRegressor": {
+                "learning_rate": [0.01, 0.1, 0.2],
+                "max_iter": [100, 500, 1000],
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "GaussianProcessRegressor": {
+                "alpha": [1e-10, 1e-5, 1e-2, 0.1],
+                "n_restarts_optimizer": [0, 1, 5, 10]
+            },
+            "KernelRidge": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "kernel": ["linear", "poly", "rbf", "sigmoid"],
+                "degree": [2, 3, 4]
+            },
+            "DummyRegressor": {
+                "strategy": ["mean", "median", "constant"],
+                "constant": [0]  # Only if strategy is 'constant'
+            },
+            "TransformedTargetRegressor": {
+                # Grid for the underlying regressor, example shown for LinearRegression
+                "regressor__fit_intercept": [True, False]
+            }
         }
     else:  # median level
         param_grids = {
@@ -3164,7 +3636,96 @@ def predict(
                 'alpha': [0.1, 1.0],
                 'max_iter': [100, 200],},
             "Poisson":{'alpha': [0.1, 1.0],
-                'max_iter': [100, 200],}
+                'max_iter': [100, 200],},
+            "Lars": {
+                "n_nonzero_coefs": [10, 50, 100, 200, None]
+            },
+            "LassoLars": {
+                "alpha": [0.001, 0.01, 0.1, 1, 10]
+            },
+            "BayesianRidge": {
+                "alpha_1": [1e-6, 1e-5, 1e-4],
+                "alpha_2": [1e-6, 1e-5, 1e-4],
+                "lambda_1": [1e-6, 1e-5, 1e-4],
+                "lambda_2": [1e-6, 1e-5, 1e-4]
+            },
+            "GammaRegressor": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "max_iter": [1000, 5000, 10000]
+            },
+            "TweedieRegressor": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "power": [0, 1, 1.5, 2, 3]
+            },
+            "LassoCV": {
+                "alphas": [[0.001, 0.01, 0.1, 1, 10]],
+                "cv": [3, 5, 10]
+            },
+            "ElasticNetCV": {
+                "l1_ratio": [0.1, 0.5, 0.7, 0.9, 1],
+                "alphas": [[0.001, 0.01, 0.1, 1, 10]],
+                "cv": [3, 5, 10]
+            },
+            "LassoLarsCV": {
+                "cv": [3, 5, 10]
+            },
+            "LarsCV": {
+                "cv": [3, 5, 10]
+            },
+            "OrthogonalMatchingPursuit": {
+                "n_nonzero_coefs": [10, 50, 100, 200, None]
+            },
+            "OrthogonalMatchingPursuitCV": {
+                "cv": [3, 5, 10]
+            },
+            "PassiveAggressiveRegressor": {
+                "C": [0.01, 0.1, 1, 10],
+                "max_iter": [1000, 5000, 10000],
+                "early_stopping": [True, False]
+            },
+            "LinearSVR": {
+                "C": [0.01, 0.1, 1, 10],
+                "epsilon": [0.01, 0.1, 1],
+                "max_iter": [1000, 5000, 10000]
+            },
+            "NuSVR": {
+                "C": [0.01, 0.1, 1, 10],
+                "nu": [0.25, 0.5, 0.75],
+                "kernel": ["linear", "poly", "rbf", "sigmoid"]
+            },
+            "DecisionTreeRegressor": {
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_split": [2, 5, 10],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "ExtraTreeRegressor": {
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_split": [2, 5, 10],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "HistGradientBoostingRegressor": {
+                "learning_rate": [0.01, 0.1, 0.2],
+                "max_iter": [100, 500, 1000],
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "GaussianProcessRegressor": {
+                "alpha": [1e-10, 1e-5, 1e-2, 0.1],
+                "n_restarts_optimizer": [0, 1, 5, 10]
+            },
+            "KernelRidge": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "kernel": ["linear", "poly", "rbf", "sigmoid"],
+                "degree": [2, 3, 4]
+            },
+            "DummyRegressor": {
+                "strategy": ["mean", "median", "constant"],
+                "constant": [0]  # Only if strategy is 'constant'
+            },
+            "TransformedTargetRegressor": {
+                # Grid for the underlying regressor, example shown for LinearRegression
+                "regressor__fit_intercept": [True, False]
+            }
         }
 
     results = {}
@@ -3174,7 +3735,7 @@ def predict(
         if purpose == "classification"
         else KFold(n_splits=cv_folds, shuffle=True, random_state=random_state)
     )
-
+
     # Train and validate each model
     for name, clf in tqdm(
         models.items(),
@@ -3185,8 +3746,7 @@ def predict(
         if verbose:
             print(f"\nTraining and validating {name}:")
         try:
-
-            if is_binary:
+            if is_binary:
                 gs = GridSearchCV(
                     clf,
                     param_grid=param_grids.get(name, {}),
@@ -3202,6 +3762,7 @@ def predict(
 
                 gs.fit(x_train, y_train)
                 best_clf = gs.best_estimator_
+
                 # make sure x_train and x_test has the same name
                 x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
                 y_pred = best_clf.predict(x_true)
@@ -3212,7 +3773,14 @@ def predict(
                         y_pred_proba = np.hstack(
                             [1 - y_pred_proba, y_pred_proba]
                         )  # Add missing class probabilities
-
+                    if y_pred_proba.shape[1] == 2:
+                        if isinstance(y_pred_proba, pd.DataFrame):
+                            y_pred_proba = y_pred_proba.iloc[:, 1]
+                        elif isinstance(y_pred_proba, pd.Series):
+                            y_pred_proba = y_pred_proba.values[:, 1]
+                        else:
+                            y_pred_proba = y_pred_proba[:, 1]
+
                 elif hasattr(best_clf, "decision_function"):
                     # If predict_proba is not available, use decision_function (e.g., for SVM)
                     y_pred_proba = best_clf.decision_function(x_true)
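The new branch reduces a two-column predict_proba result to a 1-D positive-class vector regardless of container. The same reduction on each container type (toy values; note a true 1-D pd.Series has no second axis, so the Series branch above only applies to series-like objects whose .values is still 2-D):

import numpy as np
import pandas as pd

proba_arr = np.array([[0.8, 0.2], [0.3, 0.7]])
proba_df = pd.DataFrame(proba_arr, columns=[0, 1])

pos_arr = proba_arr[:, 1]      # ndarray: positional slice
pos_df = proba_df.iloc[:, 1]   # DataFrame: iloc on the positive-class column
print(pos_arr.tolist(), pos_df.tolist())  # [0.2, 0.7] [0.2, 0.7]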
@@ -3233,7 +3801,7 @@ def predict(
                 else:
                     alphas_= None
                 coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
-            else:
+            else:
                 gs = GridSearchCV(
                     clf,
                     param_grid=param_grids.get(name, {}),
@@ -3289,6 +3857,7 @@ def predict(
                 alphas_,coef_ = None,None
                 print(f"skiped {clf}: {e}")
                 continue
+
             # try to make predict format consistant
             try:
                 y_pred= [i[0] for i in y_pred]
@@ -3460,7 +4029,8 @@ def predict(
 
     # Convert results to DataFrame
     df_results = pd.DataFrame.from_dict(results, orient="index")
-
+    display(df_results)
+    # sort
     if y_true is not None:
         if purpose == "classification":
             df_scores = pd.DataFrame(
@@ -4028,6 +4598,8 @@ def predict(
         df_res=pd.concat([df_vote, df_results],ignore_index=False,axis=0)
     elif stack:
         df_res=pd.concat([df_pred,df_results],ignore_index=False,axis=0)
+    else:
+        df_res=df_results
 
     if all([plot_, y_true is not None, purpose == "classification"]):
         from datetime import datetime