py2ls 0.2.4.25__py3-none-any.whl → 0.2.4.27__py3-none-any.whl
- py2ls/.DS_Store +0 -0
- py2ls/.git/index +0 -0
- py2ls/corr.py +475 -0
- py2ls/data/.DS_Store +0 -0
- py2ls/data/hyper_param_autogluon_zeroshot2024.json +2383 -0
- py2ls/data/styles/.DS_Store +0 -0
- py2ls/data/styles/example/.DS_Store +0 -0
- py2ls/data/usages_sns.json +6 -1
- py2ls/ips.py +1059 -114
- py2ls/ml2ls.py +758 -186
- py2ls/netfinder.py +204 -20
- py2ls/ocr.py +60 -4
- py2ls/plot.py +916 -141
- {py2ls-0.2.4.25.dist-info → py2ls-0.2.4.27.dist-info}/METADATA +6 -1
- {py2ls-0.2.4.25.dist-info → py2ls-0.2.4.27.dist-info}/RECORD +16 -14
- py2ls/data/usages_pd copy.json +0 -1105
- {py2ls-0.2.4.25.dist-info → py2ls-0.2.4.27.dist-info}/WHEEL +0 -0
py2ls/ml2ls.py
CHANGED
@@ -31,6 +31,7 @@ from sklearn.metrics import (
     average_precision_score,
 )
 from typing import Dict, Any, Optional, List, Union
+import os, json
 import numpy as np
 import pandas as pd
 from . import ips
@@ -49,7 +50,13 @@ logger = logging.getLogger()
 warnings.filterwarnings("ignore", category=UserWarning)
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.neighbors import KNeighborsClassifier
-
+#* set random_state global
+import torch
+import random
+random_state=1
+random.seed(random_state)
+np.random.seed(random_state)
+torch.manual_seed(random_state)
 
 def features_knn(
     x_train: pd.DataFrame, y_train: pd.Series, knn_params: dict
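The hunk above introduces one module-wide seed shared by Python's random, NumPy, and PyTorch, so repeated runs produce the same splits and initializations. A minimal standalone sketch of the same pattern (scikit-learn estimators are seeded separately through their random_state argument, as elsewhere in this diff):

    import random
    import numpy as np
    import torch

    random_state = 1                  # one seed for all three generators
    random.seed(random_state)         # Python stdlib RNG
    np.random.seed(random_state)      # NumPy global RNG
    torch.manual_seed(random_state)   # PyTorch RNG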
@@ -594,7 +601,7 @@ def get_features(
     """
     from sklearn.compose import ColumnTransformer
     from sklearn.preprocessing import StandardScaler, OneHotEncoder
-
+    from sklearn.model_selection import train_test_split
     # Ensure X and y are DataFrames/Series for consistency
     if isinstance(X, np.ndarray):
         X = pd.DataFrame(X)
@@ -922,10 +929,26 @@ def get_features(
             "feature_importances": feature_importances,
         }
         if all([plot_, dir_save]):
+
             from datetime import datetime
-
             now_ = datetime.now().strftime("%y%m%d_%H%M%S")
             ips.figsave(dir_save + f"features{now_}.pdf")
+
+            lists = []
+            for tp in ips.flatten(features_df["type"]):
+                lists.append(
+                    features_df
+                    .loc[features_df["type"] == tp, "feature"]
+                    .tolist()
+                )
+            labels = ips.flatten(features_df["type"])
+            # current_fig = plt.gcf()
+            # # ax = current_fig.add_subplot(3, 2, 6)
+            # gs = current_fig.add_gridspec(3, 2)
+            # ax = current_fig.add_subplot(gs[:, :])
+            plt.figure(figsize=[6,6])
+            plot.venn(lists, labels, cmap="coolwarm")
+            ips.figsave(dir_save + f"features{now_}shared_features.pdf")
     else:
         results = {
             "selected_features": pd.DataFrame(),
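The new block above gathers, for each feature-selection method recorded in features_df["type"], the list of features that method picked, then draws a Venn diagram of the overlap with py2ls's plot.venn. A small sketch of the list-building step in plain pandas (the "type"/"feature" column names come from the diff; the toy data here is made up):

    import pandas as pd

    features_df = pd.DataFrame({
        "type":    ["knn", "knn", "tree", "tree"],
        "feature": ["age", "bmi", "bmi", "glucose"],
    })
    lists = [
        features_df.loc[features_df["type"] == tp, "feature"].tolist()
        for tp in features_df["type"].unique()
    ]
    labels = features_df["type"].unique().tolist()
    # plot.venn(lists, labels, cmap="coolwarm")  # the py2ls call used above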
@@ -1247,22 +1270,25 @@ def plot_validate_features(res_val, is_binary=True, figsize=None):
         nexttile = plot.subplot(figsize=figsize)
         ax = nexttile(subplot_layout[0], subplot_layout[1])
         for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-            fpr = res_val["roc_curve"][model_name]["fpr"]
-            tpr = res_val["roc_curve"][model_name]["tpr"]
-            (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
-            mean_auc = res_val["roc_curve"][model_name]["auc"]
-            plot_roc_curve(
-                fpr,
-                tpr,
-                mean_auc,
-                lower_ci,
-                upper_ci,
-                model_name=model_name,
-                lw=1.5,
-                color=colors[i],
-                alpha=alpha,
-                ax=ax,
-            )
+            try:
+                fpr = res_val["roc_curve"][model_name]["fpr"]
+                tpr = res_val["roc_curve"][model_name]["tpr"]
+                (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
+                mean_auc = res_val["roc_curve"][model_name]["auc"]
+                plot_roc_curve(
+                    fpr,
+                    tpr,
+                    mean_auc,
+                    lower_ci,
+                    upper_ci,
+                    model_name=model_name,
+                    lw=1.5,
+                    color=colors[i],
+                    alpha=alpha,
+                    ax=ax,
+                )
+            except Exception as e:
+                print(e)
         plot.figsets(
             sp=2,
             legend=dict(
@@ -1277,16 +1303,19 @@ def plot_validate_features(res_val, is_binary=True, figsize=None):
 
         ax = nexttile(subplot_layout[0], subplot_layout[1])
         for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-            plot_pr_curve(
-                recall=res_val["pr_curve"][model_name]["recall"],
-                precision=res_val["pr_curve"][model_name]["precision"],
-                avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
-                model_name=model_name,
-                color=colors[i],
-                lw=1.5,
-                alpha=alpha,
-                ax=ax,
-            )
+            try:
+                plot_pr_curve(
+                    recall=res_val["pr_curve"][model_name]["recall"],
+                    precision=res_val["pr_curve"][model_name]["precision"],
+                    avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
+                    model_name=model_name,
+                    color=colors[i],
+                    lw=1.5,
+                    alpha=alpha,
+                    ax=ax,
+                )
+            except Exception as e:
+                print(e)
         plot.figsets(
             sp=2,
             legend=dict(
@@ -1314,22 +1343,25 @@ def plot_validate_features(res_val, is_binary=True, figsize=None):
         for iclass, class_ in enumerate(classes):
             ax = nexttile(subplot_layout[0], subplot_layout[1])
             for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-                fpr = res_val["roc_curve"][model_name]["fpr"][class_]
-                tpr = res_val["roc_curve"][model_name]["tpr"][class_]
-                (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"][iclass]
-                mean_auc = res_val["roc_curve"][model_name]["auc"][class_]
-                plot_roc_curve(
-                    fpr,
-                    tpr,
-                    mean_auc,
-                    lower_ci,
-                    upper_ci,
-                    model_name=model_name,
-                    lw=1.5,
-                    color=colors[i],
-                    alpha=alpha,
-                    ax=ax,
-                )
+                try:
+                    fpr = res_val["roc_curve"][model_name]["fpr"][class_]
+                    tpr = res_val["roc_curve"][model_name]["tpr"][class_]
+                    (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"][iclass]
+                    mean_auc = res_val["roc_curve"][model_name]["auc"][class_]
+                    plot_roc_curve(
+                        fpr,
+                        tpr,
+                        mean_auc,
+                        lower_ci,
+                        upper_ci,
+                        model_name=model_name,
+                        lw=1.5,
+                        color=colors[i],
+                        alpha=alpha,
+                        ax=ax,
+                    )
+                except Exception as e:
+                    print(e)
             plot.figsets(
                 sp=2,
                 title=class_,
@@ -1345,18 +1377,21 @@ def plot_validate_features(res_val, is_binary=True, figsize=None):
 
             ax = nexttile(subplot_layout[0], subplot_layout[1])
             for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-                plot_pr_curve(
-                    recall=res_val["pr_curve"][model_name]["recall"][iclass],
-                    precision=res_val["pr_curve"][model_name]["precision"][iclass],
-                    avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
-                        iclass
-                    ],
-                    model_name=model_name,
-                    color=colors[i],
-                    lw=1.5,
-                    alpha=alpha,
-                    ax=ax,
-                )
+                try:
+                    plot_pr_curve(
+                        recall=res_val["pr_curve"][model_name]["recall"][iclass],
+                        precision=res_val["pr_curve"][model_name]["precision"][iclass],
+                        avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
+                            iclass
+                        ],
+                        model_name=model_name,
+                        color=colors[i],
+                        lw=1.5,
+                        alpha=alpha,
+                        ax=ax,
+                    )
+                except Exception as e:
+                    print(e)
             plot.figsets(
                 sp=2,
                 title=class_,
@@ -1379,37 +1414,41 @@ def plot_validate_features_single(res_val, figsize=None, is_binary=True):
             len(ips.flatten(res_val["pr_curve"].index)), 3, figsize=figsize
         )
         for model_name in ips.flatten(res_val["pr_curve"].index):
-            fpr = res_val["roc_curve"][model_name]["fpr"]
-            tpr = res_val["roc_curve"][model_name]["tpr"]
-            (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
-            mean_auc = res_val["roc_curve"][model_name]["auc"]
-
-            # Plotting
-            plot_roc_curve(
-                fpr,
-                tpr,
-                mean_auc,
-                lower_ci,
-                upper_ci,
-                model_name=model_name,
-                ax=nexttile(),
-            )
-            plot.figsets(title=model_name, sp=2)
+            try:
+                fpr = res_val["roc_curve"][model_name]["fpr"]
+                tpr = res_val["roc_curve"][model_name]["tpr"]
+                (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
+                mean_auc = res_val["roc_curve"][model_name]["auc"]
 
-            plot_pr_binary(
-                recall=res_val["pr_curve"][model_name]["recall"],
-                precision=res_val["pr_curve"][model_name]["precision"],
-                avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
-                model_name=model_name,
-                ax=nexttile(),
-            )
-            plot.figsets(title=model_name, sp=2)
+                # Plotting
+                plot_roc_curve(
+                    fpr,
+                    tpr,
+                    mean_auc,
+                    lower_ci,
+                    upper_ci,
+                    model_name=model_name,
+                    ax=nexttile(),
+                )
+                plot.figsets(title=model_name, sp=2)
 
-            # plot cm
-            plot_cm(
-                res_val["confusion_matrix"][model_name], ax=nexttile(), normalize=False
-            )
-            plot.figsets(title=model_name, sp=2)
+                plot_pr_binary(
+                    recall=res_val["pr_curve"][model_name]["recall"],
+                    precision=res_val["pr_curve"][model_name]["precision"],
+                    avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
+                    model_name=model_name,
+                    ax=nexttile(),
+                )
+                plot.figsets(title=model_name, sp=2)
+
+                # plot cm
+                plot_cm(
+                    res_val["confusion_matrix"][model_name], ax=nexttile(), normalize=False
+                )
+                plot.figsets(title=model_name, sp=2)
+
+            except Exception as e:
+                print(e)
     else:
 
         modname_tmp = ips.flatten(res_val["roc_curve"].index)[0]
@@ -1424,22 +1463,25 @@ def plot_validate_features_single(res_val, figsize=None, is_binary=True):
         for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
             ax = nexttile()
             for iclass, class_ in enumerate(classes):
-                fpr = res_val["roc_curve"][model_name]["fpr"][class_]
-                tpr = res_val["roc_curve"][model_name]["tpr"][class_]
-                (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"][iclass]
-                mean_auc = res_val["roc_curve"][model_name]["auc"][class_]
-                plot_roc_curve(
-                    fpr,
-                    tpr,
-                    mean_auc,
-                    lower_ci,
-                    upper_ci,
-                    model_name=class_,
-                    lw=1.5,
-                    color=colors[iclass],
-                    alpha=0.03,
-                    ax=ax,
-                )
+                try:
+                    fpr = res_val["roc_curve"][model_name]["fpr"][class_]
+                    tpr = res_val["roc_curve"][model_name]["tpr"][class_]
+                    (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"][iclass]
+                    mean_auc = res_val["roc_curve"][model_name]["auc"][class_]
+                    plot_roc_curve(
+                        fpr,
+                        tpr,
+                        mean_auc,
+                        lower_ci,
+                        upper_ci,
+                        model_name=class_,
+                        lw=1.5,
+                        color=colors[iclass],
+                        alpha=0.03,
+                        ax=ax,
+                    )
+                except Exception as e:
+                    print(e)
             plot.figsets(
                 sp=2,
                 title=model_name,
@@ -1451,18 +1493,21 @@ def plot_validate_features_single(res_val, figsize=None, is_binary=True):
 
             ax = nexttile()
             for iclass, class_ in enumerate(classes):
-                plot_pr_curve(
-                    recall=res_val["pr_curve"][model_name]["recall"][iclass],
-                    precision=res_val["pr_curve"][model_name]["precision"][iclass],
-                    avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
-                        iclass
-                    ],
-                    model_name=class_,
-                    color=colors[iclass],
-                    lw=1.5,
-                    alpha=0.03,
-                    ax=ax,
-                )
+                try:
+                    plot_pr_curve(
+                        recall=res_val["pr_curve"][model_name]["recall"][iclass],
+                        precision=res_val["pr_curve"][model_name]["precision"][iclass],
+                        avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
+                            iclass
+                        ],
+                        model_name=class_,
+                        color=colors[iclass],
+                        lw=1.5,
+                        alpha=0.03,
+                        ax=ax,
+                    )
+                except Exception as e:
+                    print(e)
             plot.figsets(
                 sp=2,
                 title=class_,
@@ -1543,15 +1588,12 @@ def cal_auc_ci(
             # print("Bootstrap #{} ROC area: {:0.3f}".format(i + 1, score))
         sorted_scores = np.array(bootstrapped_scores)
         sorted_scores.sort()
-
-        # Computing the lower and upper bound of the 90% confidence interval
-        # You can change the bounds percentiles to 0.025 and 0.975 to get
-        # a 95% confidence interval instead.
+
         confidence_lower = sorted_scores[int((1 - ci) * len(sorted_scores))]
         confidence_upper = sorted_scores[int(ci * len(sorted_scores))]
         if verbose:
             print(
-                "Confidence interval for the score: [{:0.3f} - {:0.
+                "Confidence interval for the score: [{:0.3f} - {:0.3f}]".format(
                     confidence_lower, confidence_upper
                 )
             )
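The hunk above drops the stale "90% confidence interval" comment; the bounds are controlled by the ci argument, indexing the sorted bootstrap scores at the (1 - ci) and ci quantiles. A self-contained sketch of that percentile-bootstrap logic (bootstrap_auc_ci is a hypothetical helper name, not part of py2ls):

    import numpy as np
    from sklearn.metrics import roc_auc_score

    def bootstrap_auc_ci(y_true, y_score, n_bootstraps=1000, ci=0.95, seed=1):
        rng = np.random.RandomState(seed)
        y_true, y_score = np.asarray(y_true), np.asarray(y_score)
        scores = []
        for _ in range(n_bootstraps):
            idx = rng.randint(0, len(y_true), len(y_true))  # resample with replacement
            if len(np.unique(y_true[idx])) < 2:
                continue  # AUC is undefined when a resample has only one class
            scores.append(roc_auc_score(y_true[idx], y_score[idx]))
        if not scores:
            return (np.nan, np.nan)
        s = np.sort(scores)
        # same indexing as the diff: lower bound at the (1 - ci) quantile, upper at ci
        return s[int((1 - ci) * len(s))], s[int(ci * len(s))]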
@@ -1568,11 +1610,8 @@ def cal_auc_ci(
             y_true, classes=np.unique(y_true)
         )  # One-vs-Rest transformation
         n_classes = y_true_bin.shape[1]  # Number of classes
-
-        bootstrapped_scores = np.
-            (n_classes, n_bootstraps)
-        )  # Store scores for each class
-
+
+        bootstrapped_scores = np.full((n_classes, n_bootstraps), np.nan)
         if verbose:
             print("AUROC scores for each class:")
         for i in range(n_classes):
@@ -1592,15 +1631,24 @@ def cal_auc_ci(
         # Calculating the confidence intervals for each class
         confidence_intervals = []
         for class_idx in range(n_classes):
-
-
-
-
-
-
-
-
-            )
+            # rm nan
+            valid_scores = bootstrapped_scores[class_idx][
+                ~np.isnan(bootstrapped_scores[class_idx])
+            ]
+            if len(valid_scores) > 0:
+                sorted_scores = np.sort(valid_scores)
+                confidence_lower = sorted_scores[int((1 - ci) * len(sorted_scores))]
+                confidence_upper = sorted_scores[int(ci * len(sorted_scores))]
+                confidence_intervals[class_idx] = (confidence_lower, confidence_upper)
+
+                if verbose:
+                    print(
+                        f"Class {class_idx} - Confidence interval: [{confidence_lower:.3f} - {confidence_upper:.3f}]"
+                    )
+            else:
+                confidence_intervals[class_idx] = (np.nan, np.nan)
+                if verbose:
+                    print(f"Class {class_idx} - Confidence interval: [NaN - NaN]")
 
         return confidence_intervals
 
@@ -2057,20 +2105,20 @@ def rank_models(
 
     def generate_bar_plot(ax, cv_test_scores):
         ax = plot.plotxy(
-            y="Classifier", x="combined_score", data=cv_test_scores,
+            y="Classifier", x="combined_score", data=cv_test_scores, kind_="bar"
         )
         plt.title("Classifier Performance")
         plt.tight_layout()
         return plt
 
-    nexttile = plot.subplot(2, 2, figsize=[10,
+    nexttile = plot.subplot(2, 2, figsize=[10, 10])
     generate_bar_plot(nexttile(), top_models.dropna())
     plot.radar(
         ax=nexttile(projection="polar"),
         data=cv_test_scores.set_index("Classifier"),
-        ylim=[0
-        color=plot.get_color(
-        alpha=0.
+        ylim=[0, 1],
+        color=plot.get_color(cv_test_scores.set_index("Classifier").shape[1]),
+        alpha=0.02,
         circular=1,
     )
     return cv_test_scores
@@ -2216,7 +2264,7 @@ def predict(
     metrics: Optional[List[str]] = None,
     stack:bool=True,# run stacking
     stacking_cv:bool=False,# stacking cross_validate, default(False),keep it simple
-    vote:bool=
+    vote:bool=False,# run voting
     voting:str="hard", # only for classification purpose of voting
     n_top_models:int=5, #for stacking models
     n_models_per_category:int=1, #for stacking models; up to 2 models from the same category are allowed
@@ -2229,7 +2277,12 @@ def predict(
     cv_level: str = "l",  # "s":'low',"m":'medium',"l":"high"
     class_weight: str = "balanced",
     random_state: int = 1,
+    presets = "best_quality",# specific for autogluon
+    time_limit=600, # specific for autogluon
+    num_bag_folds=5, # specific for autogluon
+    num_stack_levels=2, # specific for autogluon
     verbose: bool = False,
+    **kwargs
 ) -> pd.DataFrame:
     """
     Case 1: internal train/test split; case 2: direct prediction; case 3: external validation.
@@ -2280,28 +2333,20 @@ def predict(
         RandomForestRegressor,
         ExtraTreesClassifier,
         ExtraTreesRegressor,
+        HistGradientBoostingRegressor,
         BaggingClassifier,
         BaggingRegressor,
         AdaBoostClassifier,
         AdaBoostRegressor,
     )
-    from sklearn.svm import SVC, SVR
-    from sklearn.tree import DecisionTreeRegressor
+    from sklearn.svm import SVC, SVR, LinearSVR, NuSVR
+    from sklearn.tree import DecisionTreeRegressor,ExtraTreeRegressor
     from sklearn.linear_model import (
-        LogisticRegression,
-
-
-
-
-        RidgeClassifierCV,
-        Perceptron,
-        SGDClassifier,
-        RidgeCV,
-        Ridge,
-        TheilSenRegressor,
-        HuberRegressor,
-        PoissonRegressor,
-
+        LogisticRegression,ElasticNet,ElasticNetCV,
+        LinearRegression,Lasso,RidgeClassifierCV,Perceptron,SGDClassifier,
+        RidgeCV,Ridge,TheilSenRegressor,HuberRegressor,PoissonRegressor,Lars, LassoLars, BayesianRidge,
+        GammaRegressor, TweedieRegressor, LassoCV, LassoLarsCV, LarsCV,
+        OrthogonalMatchingPursuit, OrthogonalMatchingPursuitCV, PassiveAggressiveRegressor
     )
     from sklearn.compose import TransformedTargetRegressor
     from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
@@ -2318,10 +2363,16 @@ def predict(
     )
     from sklearn.preprocessing import PolynomialFeatures
     from sklearn.model_selection import train_test_split
-
+
+    from sklearn.gaussian_process import GaussianProcessRegressor
+    from sklearn.kernel_ridge import KernelRidge
+    from sklearn.dummy import DummyRegressor
+    from autogluon.tabular import TabularPredictor
     # spell check
     purpose = ips.strcmp(purpose, ["classification", "regression"])[0]
     print(f"{purpose} processing...")
+
+
     # Default models or regressors if not provided
     if purpose == "classification":
         model_ = {
@@ -2374,27 +2425,44 @@ def predict(
         model_ = {
             "Random Forest": RandomForestRegressor(random_state=random_state,n_jobs=n_jobs),
             "SVM": SVR(),  # SVR (Support Vector Regression)
-
-            "LassoCV": LassoCV(
-                cv=cv_folds, random_state=random_state,n_jobs=n_jobs
-            ),  # LassoCV finds the best alpha automatically; better than plain Lasso
+            "LassoCV": LassoCV(cv=cv_folds, random_state=random_state,n_jobs=n_jobs),  # LassoCV finds the best alpha automatically; better than plain Lasso
             "Gradient Boosting": GradientBoostingRegressor(random_state=random_state),
             "XGBoost": xgb.XGBRegressor(eval_metric="rmse", random_state=random_state,n_jobs=n_jobs),
             "Linear Regression": LinearRegression(n_jobs=n_jobs),
             "AdaBoost": AdaBoostRegressor(random_state=random_state),
-            "LightGBM": lgb.LGBMRegressor(random_state=random_state,n_jobs=n_jobs,
-                force_row_wise=True  # Or use force_col_wise=True if memory is a concern
-            ),
+            "LightGBM": lgb.LGBMRegressor(random_state=random_state,n_jobs=n_jobs,force_row_wise=True),  # Or use force_col_wise=True if memory is a concern
             "CatBoost": cb.CatBoostRegressor(verbose=0, random_state=random_state),
             "Extra Trees": ExtraTreesRegressor(random_state=random_state,n_jobs=n_jobs),
             "Bagging": BaggingRegressor(random_state=random_state,n_jobs=n_jobs),
             "Neural Network": MLPRegressor(max_iter=500, random_state=random_state),
             "ElasticNet": ElasticNet(random_state=random_state),
-            "Ridge": Ridge(),
+            "Ridge": Ridge(random_state=random_state),
             "KNN": KNeighborsRegressor(n_jobs=n_jobs),
             "TheilSen":TheilSenRegressor(n_jobs=n_jobs),
             "Huber":HuberRegressor(),
-            "Poisson":PoissonRegressor()
+            "Poisson":PoissonRegressor(),"LinearRegression": LinearRegression(),
+            "Lasso": Lasso(random_state=random_state),
+            "Lars": Lars(),
+            "LassoLars": LassoLars(),
+            "BayesianRidge": BayesianRidge(),
+            "GammaRegressor": GammaRegressor(),
+            "TweedieRegressor": TweedieRegressor(),
+            "LassoCV": LassoCV(random_state=random_state, n_jobs=n_jobs),
+            "ElasticNetCV": ElasticNetCV(random_state=random_state, n_jobs=n_jobs),
+            "LassoLarsCV": LassoLarsCV(n_jobs=n_jobs),
+            "LarsCV": LarsCV(),
+            "OrthogonalMatchingPursuit": OrthogonalMatchingPursuit(),
+            "OrthogonalMatchingPursuitCV": OrthogonalMatchingPursuitCV(n_jobs=n_jobs),
+            "PassiveAggressiveRegressor": PassiveAggressiveRegressor(random_state=random_state),
+            "LinearSVR": LinearSVR(random_state=random_state),
+            "NuSVR": NuSVR(),
+            "DecisionTreeRegressor": DecisionTreeRegressor(random_state=random_state),
+            "ExtraTreeRegressor": ExtraTreeRegressor(random_state=random_state),
+            "HistGradientBoostingRegressor": HistGradientBoostingRegressor(random_state=random_state),
+            "GaussianProcessRegressor": GaussianProcessRegressor(),
+            "KernelRidge": KernelRidge(),
+            "DummyRegressor": DummyRegressor(),
+            "TransformedTargetRegressor": TransformedTargetRegressor(regressor=LinearRegression())
         }
         if cls is None:
             models = model_
@@ -2411,10 +2479,17 @@ def predict(
         ips.df_special_characters_cleaner(x_true) if x_true is not None else None
     )
 
+    # only keep "autogluon_tab" in models
+    cls = [cls] if isinstance(cls, str) else cls
+
+    if cls is not None:
+        models={"autogluon_tab":None} if "auto" in cls else models
+
     # indicate cls:
     if ips.run_once_within(30):  # 10 min
         print(f"processing: {list(models.keys())}")
-
+    y_train_col_name=None
+    # print(isinstance(y_train, str) and y_train in x_train.columns)
     if isinstance(y_train, str) and y_train in x_train.columns:
         y_train_col_name = y_train
         y_train = x_train[y_train]
@@ -2531,19 +2606,261 @@ def predict(
     if isinstance(y_train, np.ndarray):
         y_true = ips.df_encoder(data=pd.DataFrame(y_true), method="label")
         y_true = np.asarray(y_true)
+    #! so far, got the: x_train,x_true,y_train,y_true
+    # Grid search with KFold or StratifiedKFold
+    if "autogluon_tab" in models:
+        # load hyper_param
+        f_param = os.path.dirname(os.path.abspath(__file__))
+        f_param = f_param + "/data/hyper_param_autogluon_zeroshot2024.json"
+        with open(f_param, "r") as file:
+            hyper_param_autogluon = json.load(file)
+        # Train the model with AutoGluon
+        features=x_train.columns.tolist()
+        label= y_train_col_name if y_train_col_name is not None else 'target'
+        df_autogluon = x_train.copy()
+        df_autogluon[label]=y_train
+        autogluon_presets=["best_quality","good_quality","fast_train"]
+        best_clf = TabularPredictor(label=label, path=os.path.join(dir_save,"model_autogluon")).fit(
+            train_data=df_autogluon,
+            presets=ips.strcmp(presets, autogluon_presets)[0],  # 'best_quality' or 'good_quality' or 'fast_train'
+            time_limit=time_limit,  # in sec: limit training time (e.g., 3600)
+            num_bag_folds=num_bag_folds,
+            num_stack_levels=num_stack_levels,
+            hyperparameters=hyper_param_autogluon,
+            verbosity=1 if verbose else 0,
+            **kwargs
+        )
+        #! Get the leaderboard
+        gs={}
+        # Display the leaderboard for reference
+        leaderboard = best_clf.leaderboard()
+        gs['info']=best_clf.info()
+        # gs["res"]=best_clf
+        gs["features"]=features
+        gs["leaderboard"] = leaderboard
+        best_model_name = leaderboard.iloc[0, 0]  # First row, first column contains the model name
+        # Store the best model and its details in the gs dictionary
+        gs["best_estimator_"] = best_model_name  # Store the trained model, not just the name
+        gs["best_params_"] = best_model_name  # Hyperparameters
+        # Make predictions if x_true is provided
+        if x_true is not None:
+            x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
+            gs["predictions"] = best_clf.predict(x_true[features],model=None)  # model=None selects the best
+            gs["predict_proba"] = best_clf.predict_proba(x_true[features]) if purpose=='classification' else None
+            x_true[label]=gs["predictions"]
+            if gs["predictions"].value_counts().shape[0]>1:
+                gs['evaluate'] = best_clf.evaluate(x_true[features+[label]])
+        gs["models"]=leaderboard["model"].tolist()  # best_clf.model_names()
+        all_models = gs["models"]
+        model_evaluations = {}
+        for model in all_models:
+            predictions = best_clf.predict(x_true[features], model=model)
+            evaluation = best_clf.evaluate_predictions(
+                y_true=x_true[label],  # True labels
+                y_pred=predictions,  # Predictions from the specific model
+                auxiliary_metrics=True,  # Include additional metrics if needed
+            )
+            model_evaluations[model] = evaluation
+        gs["scores"]=pd.DataFrame.from_dict(model_evaluations, orient='index')
+        #! try to keep the same output format
+        results = {}
+        for model in all_models:
+            y_pred = best_clf.predict(x_true[features], model=model).tolist()
+            y_pred_proba=best_clf.predict_proba(x_true[features], model=model) if purpose=='classification' else None
+
+            if isinstance(y_pred_proba, pd.DataFrame):
+                y_pred_proba=y_pred_proba.iloc[:,1]
+
+            # try to make predict format consistent
+            try:
+                y_pred= [i[0] for i in y_pred]
+            except:
+                pass
+            try:
+                y_true= [i[0] for i in y_true]
+            except:
+                pass
+            try:
+                y_train= [i[0] for i in y_train]
+            except:
+                pass
+            validation_scores = {}
+            if y_true is not None and y_pred_proba is not None:
+                validation_scores = cal_metrics(
+                    y_true,
+                    y_pred,
+                    y_pred_proba=y_pred_proba,
+                    is_binary=is_binary,
+                    purpose=purpose,
+                    average="weighted",
+                )
+                if is_binary:
+                    # Calculate ROC curve
+                    # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
+                    if y_pred_proba is not None:
+                        # fpr, tpr, roc_auc = dict(), dict(), dict()
+                        fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
+                        lower_ci, upper_ci = cal_auc_ci(
+                            y_true, y_pred_proba, verbose=False, is_binary=is_binary
+                        )
+                        roc_auc = auc(fpr, tpr)
+                        roc_info = {
+                            "fpr": fpr.tolist(),
+                            "tpr": tpr.tolist(),
+                            "auc": roc_auc,
+                            "ci95": (lower_ci, upper_ci),
+                        }
+                        # precision-recall curve
+                        precision_, recall_, _ = cal_precision_recall(y_true, y_pred_proba)
+                        avg_precision_ = average_precision_score(y_true, y_pred_proba)
+                        pr_info = {
+                            "precision": precision_,
+                            "recall": recall_,
+                            "avg_precision": avg_precision_,
+                        }
+                    else:
+                        roc_info, pr_info = None, None
+                    if purpose == "classification":
+                        results[model] = {
+                            # "best_clf": gs.best_estimator_,
+                            # "best_params": gs.best_params_,
+                            # "auc_indiv": [
+                            #     gs.cv_results_[f"split{i}_test_score"][gs.best_index_]
+                            #     for i in range(cv_folds)
+                            # ],
+                            "scores": validation_scores,
+                            "roc_curve": roc_info,
+                            "pr_curve": pr_info,
+                            "confusion_matrix": confusion_matrix(y_true, y_pred),
+                            "predictions": y_pred,  # .tolist(),
+                            "predictions_proba": (
+                                y_pred_proba.tolist() if y_pred_proba is not None else None
+                            ),
+                            "features":features,
+                            # "coef":coef_,
+                            # "alphas":alphas_
+                        }
+                    else:  # "regression"
+                        results[model] = {
+                            # "best_clf": gs.best_estimator_,
+                            # "best_params": gs.best_params_,
+                            "scores": validation_scores,  # e.g., neg_MSE, R², etc.
+                            "predictions": y_pred,  # .tolist(),
+                            "predictions_proba": (
+                                y_pred_proba.tolist() if y_pred_proba is not None else None
+                            ),
+                            "features":features,
+                            # "coef":coef_,
+                            # "alphas":alphas_
+                        }
+                else:  # multi-classes
+                    if y_pred_proba is not None:
+                        # fpr, tpr, roc_auc = dict(), dict(), dict()
+                        # fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
+                        confidence_intervals = cal_auc_ci(
+                            y_true, y_pred_proba, verbose=False, is_binary=is_binary
+                        )
+                        roc_info = {
+                            "fpr": validation_scores["fpr"],
+                            "tpr": validation_scores["tpr"],
+                            "auc": validation_scores["roc_auc_by_class"],
+                            "ci95": confidence_intervals,
+                        }
+                        # precision-recall curve
+                        precision_, recall_, avg_precision_ = cal_precision_recall(
+                            y_true, y_pred_proba, is_binary=is_binary
+                        )
+                        pr_info = {
+                            "precision": precision_,
+                            "recall": recall_,
+                            "avg_precision": avg_precision_,
+                        }
+                    else:
+                        roc_info, pr_info = None, None
+
+                    if purpose == "classification":
+                        results[model] = {
+                            # "best_clf": gs.best_estimator_,
+                            # "best_params": gs.best_params_,
+                            # "auc_indiv": [
+                            #     gs.cv_results_[f"split{i}_test_score"][gs.best_index_]
+                            #     for i in range(cv_folds)
+                            # ],
+                            "scores": validation_scores,
+                            "roc_curve": roc_info,
+                            "pr_curve": pr_info,
+                            "confusion_matrix": confusion_matrix(y_true, y_pred),
+                            "predictions": y_pred,  # .tolist(),
+                            "predictions_proba": (
+                                y_pred_proba.tolist() if y_pred_proba is not None else None
+                            ),
+                            "features":features,
+                            # "coef":coef_,
+                            # "alphas":alphas_
+                        }
+                    else:  # "regression"
+                        results[model] = {
+                            # "best_clf": gs.best_estimator_,
+                            # "best_params": gs.best_params_,
+                            "scores": validation_scores,  # e.g., neg_MSE, R², etc.
+                            "predictions": y_pred,  # .tolist(),
+                            "predictions_proba": (
+                                y_pred_proba.tolist() if y_pred_proba is not None else None
+                            ),
+                            "features":features,
+                            # "coef":coef_,
+                            # "alphas":alphas_
+                        }
 
-
-
-
-
-
-
-
-
-
-
-
+            else:
+                if y_true is None:
+                    validation_scores = []
+                else:
+                    validation_scores = cal_metrics(
+                        y_true,
+                        y_pred,
+                        y_pred_proba=y_pred_proba,
+                        is_binary=is_binary,
+                        purpose=purpose,
+                        average="weighted",
+                    )
+                results[model] = {
+                    # "best_clf": gs.best_estimator_,
+                    # "best_params": gs.best_params_,
+                    "scores": validation_scores,
+                    "predictions": y_pred,  # .tolist(),
+                    "predictions_proba": (
+                        y_pred_proba.tolist() if y_pred_proba is not None else None
+                    ),
+                    "features":features,
+                    "y_train": y_train if y_train is not None else [],
+                    "y_true": y_true if y_true is not None else [],
+                    # "coef":coef_,
+                    # "alphas":alphas_
                 }
+        df_results = pd.DataFrame.from_dict(results, orient="index")
+        gs['res']=df_results
+
+        if all([plot_, y_true is not None, purpose == "classification"]):
+            from datetime import datetime
+
+            now_ = datetime.now().strftime("%y%m%d_%H%M%S")
+            # try:
+            if df_results.shape[0] > 3:
+                try:
+                    plot_validate_features(df_results, is_binary=is_binary)
+                except Exception as e:
+                    print(e)
+            else:
+                try:
+                    plot_validate_features_single(df_results, is_binary=is_binary)
+                except Exception as e:
+                    print(e)
+            if dir_save:
+                ips.figsave(dir_save + f"validate_features{now_}.pdf")
+        return gs
+
+    #! cross_valid
     if cv_level in ["low", "simple", "s", "l"]:
         param_grids = {
             "Random Forest": (
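The block above wires AutoGluon into predict(): it loads the zero-shot hyperparameter portfolio shipped as data/hyper_param_autogluon_zeroshot2024.json, fits a TabularPredictor, and reshapes the leaderboard into py2ls's usual results format. A minimal, self-contained sketch of the underlying AutoGluon calls (the frame and column names here are hypothetical, and real training needs far more rows than this toy):

    import pandas as pd
    from autogluon.tabular import TabularPredictor

    df = pd.DataFrame({"x1": [1, 2, 3, 4, 5, 6, 7, 8],
                       "x2": [0.1, 0.3, 0.2, 0.4, 0.5, 0.2, 0.7, 0.6],
                       "target": [0, 1, 0, 1, 0, 1, 0, 1]})
    predictor = TabularPredictor(label="target", path="model_autogluon").fit(
        train_data=df,
        presets="best_quality",  # or "good_quality" / "fast_train", as above
        time_limit=600,          # seconds
    )
    print(predictor.leaderboard())                        # ranked models
    preds = predictor.predict(df.drop(columns="target"))  # best model by default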
@@ -2712,7 +3029,73 @@ def predict(
                 'alpha': [0.1],
                 'max_iter': [100],},
             "Poisson":{'alpha': [0.1],
-                'max_iter': [100],}
+                'max_iter': [100],},
+            "Lars": {"n_nonzero_coefs": [10, 50, None]},
+            "LassoLars": {
+                "alpha": [0.01, 0.1, 1]
+            },
+            "BayesianRidge": {
+                "alpha_1": [1e-6, 1e-4, 1e-2],
+                "lambda_1": [1e-6, 1e-4, 1e-2]
+            },
+            "GammaRegressor": {
+                "alpha": [0.1, 1, 10]
+            },
+            "TweedieRegressor": {
+                "alpha": [0.1, 1, 10],
+                "power": [1, 1.5, 2]
+            },
+            "LassoCV": {
+                "cv": [5]
+            },
+            "ElasticNetCV": {
+                "l1_ratio": [0.2, 0.5, 0.8],
+                "cv": [5]
+            },
+            "LassoLarsCV": {
+                "cv": [5]
+            },
+            "LarsCV": {
+                "cv": [5]
+            },
+            "OrthogonalMatchingPursuit": {
+                "n_nonzero_coefs": [10, 50, None]
+            },
+            "OrthogonalMatchingPursuitCV": {
+                "cv": [5]
+            },
+            "PassiveAggressiveRegressor": {
+                "C": [0.1, 1, 10]
+            },
+            "LinearSVR": {
+                "C": [0.1, 1, 10]
+            },
+            "NuSVR": {
+                "C": [0.1, 1, 10]
+            },
+            "DecisionTreeRegressor": {
+                "max_depth": [5, 10, None]
+            },
+            "ExtraTreeRegressor": {
+                "max_depth": [5, 10, None]
+            },
+            "HistGradientBoostingRegressor": {
+                "learning_rate": [0.05, 0.1, 0.2],
+                "max_depth": [5, 10, None]
+            },
+            "GaussianProcessRegressor": {
+                "alpha": [1e-5, 1e-2, 0.1]
+            },
+            "KernelRidge": {
+                "alpha": [0.1, 1, 10],
+                "kernel": ["linear", "rbf"]
+            },
+            "DummyRegressor": {
+                "strategy": ["mean", "median"]
+            },
+            "TransformedTargetRegressor": {
+                "regressor__fit_intercept": [True, False]
+            }
         }
     elif cv_level in ["high", "advanced", "h"]:
         param_grids = {
@@ -2917,7 +3300,96 @@ def predict(
                 'alpha': [0.1, 1.0, 10.0],
                 'max_iter': [100, 200, 300],},
             "Poisson":{'alpha': [0.1, 1.0, 10.0],
-                'max_iter': [100, 200, 300],}
+                'max_iter': [100, 200, 300],},
+            "Lars": {
+                "n_nonzero_coefs": [10, 50, 100, 200, None]
+            },
+            "LassoLars": {
+                "alpha": [0.001, 0.01, 0.1, 1, 10]
+            },
+            "BayesianRidge": {
+                "alpha_1": [1e-6, 1e-5, 1e-4],
+                "alpha_2": [1e-6, 1e-5, 1e-4],
+                "lambda_1": [1e-6, 1e-5, 1e-4],
+                "lambda_2": [1e-6, 1e-5, 1e-4]
+            },
+            "GammaRegressor": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "max_iter": [1000, 5000, 10000]
+            },
+            "TweedieRegressor": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "power": [0, 1, 1.5, 2, 3]
+            },
+            "LassoCV": {
+                "alphas": [[0.001, 0.01, 0.1, 1, 10]],
+                "cv": [3, 5, 10]
+            },
+            "ElasticNetCV": {
+                "l1_ratio": [0.1, 0.5, 0.7, 0.9, 1],
+                "alphas": [[0.001, 0.01, 0.1, 1, 10]],
+                "cv": [3, 5, 10]
+            },
+            "LassoLarsCV": {
+                "cv": [3, 5, 10]
+            },
+            "LarsCV": {
+                "cv": [3, 5, 10]
+            },
+            "OrthogonalMatchingPursuit": {
+                "n_nonzero_coefs": [10, 50, 100, 200, None]
+            },
+            "OrthogonalMatchingPursuitCV": {
+                "cv": [3, 5, 10]
+            },
+            "PassiveAggressiveRegressor": {
+                "C": [0.01, 0.1, 1, 10],
+                "max_iter": [1000, 5000, 10000],
+                "early_stopping": [True, False]
+            },
+            "LinearSVR": {
+                "C": [0.01, 0.1, 1, 10],
+                "epsilon": [0.01, 0.1, 1],
+                "max_iter": [1000, 5000, 10000]
+            },
+            "NuSVR": {
+                "C": [0.01, 0.1, 1, 10],
+                "nu": [0.25, 0.5, 0.75],
+                "kernel": ["linear", "poly", "rbf", "sigmoid"]
+            },
+            "DecisionTreeRegressor": {
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_split": [2, 5, 10],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "ExtraTreeRegressor": {
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_split": [2, 5, 10],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "HistGradientBoostingRegressor": {
+                "learning_rate": [0.01, 0.1, 0.2],
+                "max_iter": [100, 500, 1000],
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "GaussianProcessRegressor": {
+                "alpha": [1e-10, 1e-5, 1e-2, 0.1],
+                "n_restarts_optimizer": [0, 1, 5, 10]
+            },
+            "KernelRidge": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "kernel": ["linear", "poly", "rbf", "sigmoid"],
+                "degree": [2, 3, 4]
+            },
+            "DummyRegressor": {
+                "strategy": ["mean", "median", "constant"],
+                "constant": [0]  # Only if strategy is 'constant'
+            },
+            "TransformedTargetRegressor": {
+                # Grid for the underlying regressor, example shown for LinearRegression
+                "regressor__fit_intercept": [True, False]
+            }
         }
     else:  # median level
         param_grids = {
@@ -3164,7 +3636,96 @@ def predict(
                 'alpha': [0.1, 1.0],
                 'max_iter': [100, 200],},
             "Poisson":{'alpha': [0.1, 1.0],
-                'max_iter': [100, 200],}
+                'max_iter': [100, 200],},
+            "Lars": {
+                "n_nonzero_coefs": [10, 50, 100, 200, None]
+            },
+            "LassoLars": {
+                "alpha": [0.001, 0.01, 0.1, 1, 10]
+            },
+            "BayesianRidge": {
+                "alpha_1": [1e-6, 1e-5, 1e-4],
+                "alpha_2": [1e-6, 1e-5, 1e-4],
+                "lambda_1": [1e-6, 1e-5, 1e-4],
+                "lambda_2": [1e-6, 1e-5, 1e-4]
+            },
+            "GammaRegressor": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "max_iter": [1000, 5000, 10000]
+            },
+            "TweedieRegressor": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "power": [0, 1, 1.5, 2, 3]
+            },
+            "LassoCV": {
+                "alphas": [[0.001, 0.01, 0.1, 1, 10]],
+                "cv": [3, 5, 10]
+            },
+            "ElasticNetCV": {
+                "l1_ratio": [0.1, 0.5, 0.7, 0.9, 1],
+                "alphas": [[0.001, 0.01, 0.1, 1, 10]],
+                "cv": [3, 5, 10]
+            },
+            "LassoLarsCV": {
+                "cv": [3, 5, 10]
+            },
+            "LarsCV": {
+                "cv": [3, 5, 10]
+            },
+            "OrthogonalMatchingPursuit": {
+                "n_nonzero_coefs": [10, 50, 100, 200, None]
+            },
+            "OrthogonalMatchingPursuitCV": {
+                "cv": [3, 5, 10]
+            },
+            "PassiveAggressiveRegressor": {
+                "C": [0.01, 0.1, 1, 10],
+                "max_iter": [1000, 5000, 10000],
+                "early_stopping": [True, False]
+            },
+            "LinearSVR": {
+                "C": [0.01, 0.1, 1, 10],
+                "epsilon": [0.01, 0.1, 1],
+                "max_iter": [1000, 5000, 10000]
+            },
+            "NuSVR": {
+                "C": [0.01, 0.1, 1, 10],
+                "nu": [0.25, 0.5, 0.75],
+                "kernel": ["linear", "poly", "rbf", "sigmoid"]
+            },
+            "DecisionTreeRegressor": {
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_split": [2, 5, 10],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "ExtraTreeRegressor": {
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_split": [2, 5, 10],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "HistGradientBoostingRegressor": {
+                "learning_rate": [0.01, 0.1, 0.2],
+                "max_iter": [100, 500, 1000],
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "GaussianProcessRegressor": {
+                "alpha": [1e-10, 1e-5, 1e-2, 0.1],
+                "n_restarts_optimizer": [0, 1, 5, 10]
+            },
+            "KernelRidge": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "kernel": ["linear", "poly", "rbf", "sigmoid"],
+                "degree": [2, 3, 4]
+            },
+            "DummyRegressor": {
+                "strategy": ["mean", "median", "constant"],
+                "constant": [0]  # Only if strategy is 'constant'
+            },
+            "TransformedTargetRegressor": {
+                # Grid for the underlying regressor, example shown for LinearRegression
+                "regressor__fit_intercept": [True, False]
+            }
         }
 
     results = {}
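Each of the grids above feeds the GridSearchCV loop that follows. A minimal sketch of that step for a single entry (estimator and grid taken from the low-level table; x_train/y_train are placeholders):

    from sklearn.kernel_ridge import KernelRidge
    from sklearn.model_selection import GridSearchCV, KFold

    grid = {"alpha": [0.1, 1, 10], "kernel": ["linear", "rbf"]}
    gs = GridSearchCV(
        KernelRidge(),
        param_grid=grid,
        cv=KFold(n_splits=5, shuffle=True, random_state=1),
    )
    # gs.fit(x_train, y_train); best_clf = gs.best_estimator_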
@@ -3174,7 +3735,7 @@ def predict(
         if purpose == "classification"
         else KFold(n_splits=cv_folds, shuffle=True, random_state=random_state)
     )
-
+
     # Train and validate each model
     for name, clf in tqdm(
         models.items(),
@@ -3185,8 +3746,7 @@ def predict(
         if verbose:
             print(f"\nTraining and validating {name}:")
         try:
-
-                if is_binary:
+            if is_binary:
                 gs = GridSearchCV(
                     clf,
                     param_grid=param_grids.get(name, {}),
@@ -3202,6 +3762,7 @@ def predict(
 
                 gs.fit(x_train, y_train)
                 best_clf = gs.best_estimator_
+
                 # make sure x_train and x_test has the same name
                 x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
                 y_pred = best_clf.predict(x_true)
@@ -3212,7 +3773,14 @@ def predict(
                         y_pred_proba = np.hstack(
                             [1 - y_pred_proba, y_pred_proba]
                         )  # Add missing class probabilities
-
+                    if y_pred_proba.shape[1] == 2:
+                        if isinstance(y_pred_proba, pd.DataFrame):
+                            y_pred_proba = y_pred_proba.iloc[:, 1]
+                        elif isinstance(y_pred_proba, pd.Series):
+                            y_pred_proba = y_pred_proba.values[:, 1]
+                        else:
+                            y_pred_proba = y_pred_proba[:, 1]
+
                 elif hasattr(best_clf, "decision_function"):
                     # If predict_proba is not available, use decision_function (e.g., for SVM)
                     y_pred_proba = best_clf.decision_function(x_true)
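The new branch above exists because scikit-learn's predict_proba returns one column per class, while binary ROC/PR metrics expect only the positive-class column. A short sketch of why column 1 is the right slice (synthetic data):

    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import roc_auc_score

    X = np.random.RandomState(1).randn(100, 3)
    y = (X[:, 0] > 0).astype(int)
    clf = LogisticRegression().fit(X, y)
    proba = clf.predict_proba(X)           # shape (100, 2): P(y=0), P(y=1)
    print(roc_auc_score(y, proba[:, 1]))   # pass only P(y=1)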
@@ -3233,7 +3801,7 @@ def predict(
                 else:
                     alphas_= None
                 coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
-                else:
+            else:
                 gs = GridSearchCV(
                     clf,
                     param_grid=param_grids.get(name, {}),
@@ -3289,6 +3857,7 @@ def predict(
                 alphas_,coef_ = None,None
                 print(f"skiped {clf}: {e}")
                 continue
+
             # try to make predict format consistent
             try:
                 y_pred= [i[0] for i in y_pred]
@@ -3460,7 +4029,8 @@ def predict(
 
     # Convert results to DataFrame
     df_results = pd.DataFrame.from_dict(results, orient="index")
-
+    display(df_results)
+    # sort
     if y_true is not None:
         if purpose == "classification":
             df_scores = pd.DataFrame(
@@ -4028,6 +4598,8 @@ def predict(
         df_res=pd.concat([df_vote, df_results],ignore_index=False,axis=0)
     elif stack:
         df_res=pd.concat([df_pred,df_results],ignore_index=False,axis=0)
+    else:
+        df_res=df_results
 
     if all([plot_, y_true is not None, purpose == "classification"]):
         from datetime import datetime