py2ls 0.2.4.18__py3-none-any.whl → 0.2.4.20__py3-none-any.whl
py2ls/ml2ls.py CHANGED
@@ -702,7 +702,7 @@ def get_features(
 "AdaBoost",
 ]
 cls = [ips.strcmp(i, cls_)[0] for i in cls]
-
+
 feature_importances = {}

 # Lasso Feature Selection
@@ -714,7 +714,7 @@ def get_features(
 lasso_selected_features = (
 lasso_importances.head(n_features)["feature"].values if "lasso" in cls else []
 )
-feature_importances[
+feature_importances["lasso"] = lasso_importances.head(n_features)
 # Ridge
 ridge_importances = (
 features_ridge(x_train, y_train, ridge_params)
@@ -724,7 +724,7 @@ def get_features(
 selected_ridge_features = (
 ridge_importances.head(n_features)["feature"].values if "ridge" in cls else []
 )
-feature_importances[
+feature_importances["ridge"] = ridge_importances.head(n_features)
 # Elastic Net
 enet_importances = (
 features_enet(x_train, y_train, enet_params)
@@ -734,7 +734,7 @@ def get_features(
 selected_enet_features = (
 enet_importances.head(n_features)["feature"].values if "Enet" in cls else []
 )
-feature_importances[
+feature_importances["Enet"] = enet_importances.head(n_features)
 # Random Forest Feature Importance
 rf_importances = (
 features_rf(x_train, y_train, rf_params)
@@ -746,7 +746,7 @@ def get_features(
 if "Random Forest" in cls
 else []
 )
-feature_importances[
+feature_importances["Random Forest"] = rf_importances.head(n_features)
 # Gradient Boosting Feature Importance
 gb_importances = (
 features_gradient_boosting(x_train, y_train, gb_params)
@@ -758,7 +758,7 @@ def get_features(
 if "Gradient Boosting" in cls
 else []
 )
-feature_importances[
+feature_importances["Gradient Boosting"] = gb_importances.head(n_features)
 # xgb
 xgb_importances = (
 features_xgb(x_train, y_train, xgb_params) if "xgb" in cls else pd.DataFrame()
@@ -766,7 +766,7 @@ def get_features(
 top_xgb_features = (
 xgb_importances.head(n_features)["feature"].values if "xgb" in cls else []
 )
-feature_importances[
+feature_importances["xgb"] = xgb_importances.head(n_features)

 # SVM with RFE
 selected_svm_features = (
@@ -781,7 +781,7 @@ def get_features(
 selected_lda_features = (
 lda_importances.head(n_features)["feature"].values if "lda" in cls else []
 )
-feature_importances[
+feature_importances["lda"] = lda_importances.head(n_features)
 # AdaBoost Feature Importance
 adaboost_importances = (
 features_adaboost(x_train, y_train, adaboost_params)
@@ -793,7 +793,7 @@ def get_features(
 if "AdaBoost" in cls
 else []
 )
-feature_importances[
+feature_importances["AdaBoost"] = adaboost_importances.head(n_features)
 # Decision Tree Feature Importance
 dt_importances = (
 features_decision_tree(x_train, y_train, dt_params)
@@ -804,8 +804,8 @@ def get_features(
 dt_importances.head(n_features)["feature"].values
 if "Decision Tree" in cls
 else []
-)
-feature_importances[
+)
+feature_importances["Decision Tree"] = dt_importances.head(n_features)
 # Bagging Feature Importance
 bagging_importances = (
 features_bagging(x_train, y_train, bagging_params)
@@ -817,7 +817,7 @@ def get_features(
 if "Bagging" in cls
 else []
 )
-feature_importances[
+feature_importances["Bagging"] = bagging_importances.head(n_features)
 # KNN Feature Importance via Permutation
 knn_importances = (
 features_knn(x_train, y_train, knn_params) if "KNN" in cls else pd.DataFrame()
@@ -825,7 +825,7 @@ def get_features(
 top_knn_features = (
 knn_importances.head(n_features)["feature"].values if "KNN" in cls else []
 )
-feature_importances[
+feature_importances["KNN"] = knn_importances.head(n_features)

 #! Find common features
 common_features = ips.shared(
@@ -928,7 +928,7 @@ def get_features(
 "cv_train_scores": cv_train_results_df,
 "cv_test_scores": rank_models(cv_test_results_df, plot_=plot_),
 "common_features": list(common_features),
-"feature_importances":feature_importances
+"feature_importances": feature_importances,
 }
 if all([plot_, dir_save]):
 from datetime import datetime
@@ -941,7 +941,7 @@ def get_features(
 "cv_train_scores": pd.DataFrame(),
 "cv_test_scores": pd.DataFrame(),
 "common_features": [],
-"feature_importances":{}
+"feature_importances": {},
 }
 print(f"Warning: 没有找到共同的genes, when n_shared={n_shared}")
 return results
@@ -1232,7 +1232,7 @@ def validate_features(

 # # If you want to access validation scores
 # print(validation_results)
-def plot_validate_features(res_val,is_binary=True,figsize=None):
+def plot_validate_features(res_val, is_binary=True, figsize=None):
 """
 plot the results of 'validate_features()'
 """
@@ -1295,26 +1295,28 @@ def plot_validate_features(res_val,is_binary=True,figsize=None):
 )
 plot.figsets(
 sp=2,
-legend=dict(
+legend=dict(
+loc="upper right", ncols=1, fontsize=8, bbox_to_anchor=[1.5, 0.5]
+),
 )
 # plot.split_legend(ax,n=2, loc=["upper left", "lower left"],bbox=[[1,0.5],[1,0.5]],ncols=2,labelcolor="k",fontsize=8)
 else:
 colors = plot.get_color(len(ips.flatten(res_val["pr_curve"].index)))
-modname_tmp=ips.flatten(res_val["roc_curve"].index)[0]
-classes=list(res_val["roc_curve"][modname_tmp][
+modname_tmp = ips.flatten(res_val["roc_curve"].index)[0]
+classes = list(res_val["roc_curve"][modname_tmp]["fpr"].keys())
 if res_val.shape[0] > 5:
 alpha = 0
-figsize = [8, 8*2*(len(classes))]
+figsize = [8, 8 * 2 * (len(classes))] if figsize is None else figsize
 subplot_layout = [1, 2]
 ncols = 2
 bbox_to_anchor = [1.5, 0.6]
 else:
 alpha = 0.03
-figsize = [10, 6*(len(classes))] if figsize is None else figsize
+figsize = [10, 6 * (len(classes))] if figsize is None else figsize
 subplot_layout = [1, 1]
 ncols = 1
 bbox_to_anchor = [1, 1]
-nexttile = plot.subplot(2*(len(classes)),2,figsize=figsize)
+nexttile = plot.subplot(2 * (len(classes)), 2, figsize=figsize)
 for iclass, class_ in enumerate(classes):
 ax = nexttile(subplot_layout[0], subplot_layout[1])
 for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
@@ -1352,7 +1354,9 @@ def plot_validate_features(res_val,is_binary=True,figsize=None):
 plot_pr_curve(
 recall=res_val["pr_curve"][model_name]["recall"][iclass],
 precision=res_val["pr_curve"][model_name]["precision"][iclass],
-avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
+avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
+iclass
+],
 model_name=model_name,
 color=colors[i],
 lw=1.5,
@@ -1362,13 +1366,20 @@ def plot_validate_features(res_val,is_binary=True,figsize=None):
 plot.figsets(
 sp=2,
 title=class_,
-legend=dict(
+legend=dict(
+loc="upper right", ncols=1, fontsize=8, bbox_to_anchor=[1.5, 0.5]
+),
 )

-
+
+def plot_validate_features_single(res_val, figsize=None, is_binary=True):
 if is_binary:
 if figsize is None:
-nexttile = plot.subplot(
+nexttile = plot.subplot(
+len(ips.flatten(res_val["pr_curve"].index)),
+3,
+figsize=[13, 4 * len(ips.flatten(res_val["pr_curve"].index))],
+)
 else:
 nexttile = plot.subplot(
 len(ips.flatten(res_val["pr_curve"].index)), 3, figsize=figsize
@@ -1380,8 +1391,15 @@ def plot_validate_features_single(res_val, figsize=None,is_binary=True):
 mean_auc = res_val["roc_curve"][model_name]["auc"]

 # Plotting
-plot_roc_curve(
-
+plot_roc_curve(
+fpr,
+tpr,
+mean_auc,
+lower_ci,
+upper_ci,
+model_name=model_name,
+ax=nexttile(),
+)
 plot.figsets(title=model_name, sp=2)

 plot_pr_binary(
@@ -1394,14 +1412,18 @@ def plot_validate_features_single(res_val, figsize=None,is_binary=True):
 plot.figsets(title=model_name, sp=2)

 # plot cm
-plot_cm(
+plot_cm(
+res_val["confusion_matrix"][model_name], ax=nexttile(), normalize=False
+)
 plot.figsets(title=model_name, sp=2)
 else:
-
-modname_tmp=ips.flatten(res_val["roc_curve"].index)[0]
-classes=list(res_val["roc_curve"][modname_tmp][
+
+modname_tmp = ips.flatten(res_val["roc_curve"].index)[0]
+classes = list(res_val["roc_curve"][modname_tmp]["fpr"].keys())
 if figsize is None:
-nexttile = plot.subplot(
+nexttile = plot.subplot(
+len(modname_tmp), 3, figsize=[15, len(modname_tmp) * 5]
+)
 else:
 nexttile = plot.subplot(len(modname_tmp), 3, figsize=figsize)
 colors = plot.get_color(len(classes))
@@ -1429,16 +1451,18 @@ def plot_validate_features_single(res_val, figsize=None,is_binary=True):
 title=model_name,
 legend=dict(
 loc="best",
-fontsize=8,
+fontsize=8,
 ),
-)
+)

 ax = nexttile()
 for iclass, class_ in enumerate(classes):
 plot_pr_curve(
 recall=res_val["pr_curve"][model_name]["recall"][iclass],
 precision=res_val["pr_curve"][model_name]["precision"][iclass],
-avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
+avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
+iclass
+],
 model_name=class_,
 color=colors[iclass],
 lw=1.5,
@@ -1450,17 +1474,21 @@ def plot_validate_features_single(res_val, figsize=None,is_binary=True):
 title=class_,
 legend=dict(loc="best", fontsize=8),
 )
-
-plot_cm(
+
+plot_cm(
+res_val["confusion_matrix"][model_name],
+labels_name=classes,
+ax=nexttile(),
+normalize=False,
+)
 plot.figsets(title=model_name, sp=2)


-def cal_precision_recall(
-y_true, y_pred_proba, is_binary=True):
+def cal_precision_recall(y_true, y_pred_proba, is_binary=True):
 if is_binary:
 precision_, recall_, _ = precision_recall_curve(y_true, y_pred_proba)
 avg_precision_ = average_precision_score(y_true, y_pred_proba)
-return precision_, recall_,avg_precision_
+return precision_, recall_, avg_precision_
 else:
 n_classes = y_pred_proba.shape[1] # Number of classes
 precision_ = []
@@ -1469,7 +1497,9 @@ def cal_precision_recall(
 # One-vs-rest approach for multi-class precision-recall curve
 for class_idx in range(n_classes):
 precision, recall, _ = precision_recall_curve(
-(y_true == class_idx).astype(
+(y_true == class_idx).astype(
+int
+), # Binarize true labels for the current class
 y_pred_proba[:, class_idx], # Probabilities for the current class
 )

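Reviewer note: the re-wrapped lines above keep the one-vs-rest logic unchanged — each class's labels are binarized before `precision_recall_curve` is called. A minimal, self-contained sketch of that pattern using plain scikit-learn on toy data (illustrative only, not py2ls-specific):

import numpy as np
from sklearn.metrics import precision_recall_curve, average_precision_score

# toy multi-class example: 3 classes, one probability column per class
y_true = np.array([0, 1, 2, 1, 0, 2])
y_pred_proba = np.array([
    [0.7, 0.2, 0.1],
    [0.1, 0.8, 0.1],
    [0.2, 0.2, 0.6],
    [0.3, 0.5, 0.2],
    [0.6, 0.3, 0.1],
    [0.1, 0.3, 0.6],
])

for class_idx in range(y_pred_proba.shape[1]):
    y_bin = (y_true == class_idx).astype(int)  # binarize labels for this class (one-vs-rest)
    precision, recall, _ = precision_recall_curve(y_bin, y_pred_proba[:, class_idx])
    ap = average_precision_score(y_bin, y_pred_proba[:, class_idx])
    print(f"class {class_idx}: average precision = {ap:.2f}")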
@@ -1479,14 +1509,23 @@ def cal_precision_recall(
 avg_precision_ = []
 for class_idx in range(n_classes):
 avg_precision = average_precision_score(
-(y_true == class_idx).astype(
+(y_true == class_idx).astype(
+int
+), # Binarize true labels for the current class
 y_pred_proba[:, class_idx], # Probabilities for the current class
 )
 avg_precision_.append(avg_precision)
-return precision_, recall_,avg_precision_
-
+return precision_, recall_, avg_precision_
+
+
 def cal_auc_ci(
-y_true,
+y_true,
+y_pred,
+n_bootstraps=1000,
+ci=0.95,
+random_state=1,
+is_binary=True,
+verbose=True,
 ):
 if is_binary:
 y_true = np.asarray(y_true)
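Reviewer note: the `cal_auc_ci` signature is only re-wrapped here, one parameter per line; the parameters themselves (`y_pred`, `n_bootstraps=1000`, `ci=0.95`, `random_state=1`, `is_binary=True`, `verbose=True`) are unchanged. A hedged usage sketch with toy binary labels (assuming `ml2ls` is importable as a py2ls submodule; values are illustrative only):

import numpy as np
from py2ls import ml2ls

y_true = np.array([0, 1, 1, 0, 1, 0, 1, 0])
y_pred_proba = np.array([0.2, 0.8, 0.7, 0.3, 0.9, 0.4, 0.6, 0.1])

# binary case: returns the (lower, upper) bootstrap confidence bounds for the AUC
lower_ci, upper_ci = ml2ls.cal_auc_ci(
    y_true, y_pred_proba, n_bootstraps=1000, ci=0.95, is_binary=True, verbose=False
)

# multi-class case (is_binary=False): returns one (lower, upper) tuple per class,
# as the `confidence_intervals` list in the hunks below shows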
@@ -1525,15 +1564,20 @@ def cal_auc_ci(
 return confidence_lower, confidence_upper
 else:
 from sklearn.preprocessing import label_binarize
+
 # Multi-class classification case
 y_true = np.asarray(y_true)
 y_pred = np.asarray(y_pred)

 # Binarize the multi-class labels for OvR computation
-y_true_bin = label_binarize(
+y_true_bin = label_binarize(
+y_true, classes=np.unique(y_true)
+) # One-vs-Rest transformation
 n_classes = y_true_bin.shape[1] # Number of classes
-
-bootstrapped_scores = np.zeros(
+
+bootstrapped_scores = np.zeros(
+(n_classes, n_bootstraps)
+) # Store scores for each class

 if verbose:
 print("AUROC scores for each class:")
@@ -1546,7 +1590,9 @@ def cal_auc_ci(
 for class_idx in range(n_classes):
 if len(np.unique(y_true_bin[indices, class_idx])) < 2:
 continue # Reject if the class doesn't have both positive and negative samples
-score = roc_auc_score(
+score = roc_auc_score(
+y_true_bin[indices, class_idx], y_pred[indices, class_idx]
+)
 bootstrapped_scores[class_idx, i] = score

 # Calculating the confidence intervals for each class
@@ -1558,8 +1604,10 @@ def cal_auc_ci(
 confidence_intervals.append((confidence_lower, confidence_upper))

 if verbose:
-print(
-
+print(
+f"Class {class_idx} - Confidence interval: [{confidence_lower:.3f} - {confidence_upper:.3f}]"
+)
+
 return confidence_intervals


@@ -1619,6 +1667,7 @@ def plot_roc_curve(
 # ml2ls.plot_roc_curve(fpr, tpr, mean_auc, lower_ci, upper_ci)
 # figsets(title=model_name)

+
 def plot_pr_curve(
 recall=None,
 precision=None,
@@ -1661,6 +1710,7 @@ def plot_pr_curve(
 ax.legend(loc=legend_loc)
 return ax

+
 # * usage: ml2ls.plot_pr_curve()
 # for md_name in flatten(validation_results["pr_curve"].keys()):
 # ml2ls.plot_pr_curve(
@@ -1673,6 +1723,7 @@ def plot_pr_curve(
 # color="r",
 # )

+
 def plot_pr_binary(
 recall=None,
 precision=None,
@@ -1689,19 +1740,20 @@ def plot_pr_binary(
 ax=None,
 show_avg_precision=False,
 **kwargs,
-
+):
 from scipy.interpolate import interp1d
+
 if ax is None:
 fig, ax = plt.subplots(figsize=figsize)
 model_name = "Binary PR Curve" if model_name is None else model_name

-
+# * use sklearn bulitin function 'PrecisionRecallDisplay'?
 # from sklearn.metrics import PrecisionRecallDisplay
-# disp = PrecisionRecallDisplay(precision=precision,
-# recall=recall,
+# disp = PrecisionRecallDisplay(precision=precision,
+# recall=recall,
 # average_precision=avg_precision,**kwargs)
 # disp.plot(ax=ax, name=model_name, color=color)
-
+
 # Plot Precision-Recall curve
 ax.plot(
 recall,
@@ -1729,15 +1781,17 @@ def plot_pr_binary(
 y_vals = f_score * x_vals / (2 * x_vals - f_score)
 y_vals_clipped = np.minimum(y_vals, pr_boundary(x_vals))
 y_vals_clipped = np.clip(y_vals_clipped, 1e-3, None) # Prevent going to zero
-valid =
-valid_ = y_vals_clipped > 1e-3
-valid = valid&valid_
-x_vals = x_vals[valid]
+valid = y_vals_clipped < pr_boundary(x_vals)
+valid_ = y_vals_clipped > 1e-3
+valid = valid & valid_
+x_vals = x_vals[valid]
 y_vals_clipped = y_vals_clipped[valid]
 if len(x_vals) > 0: # Ensure annotation is placed only if line segment exists
 ax.plot(x_vals, y_vals_clipped, color="gray", alpha=1)
-plt.annotate(
-
+plt.annotate(
+f"$f_1={f_score:0.1f}$",
+xy=(0.8, y_vals_clipped[-int(len(y_vals_clipped) * 0.35)] + 0.02),
+)

 # # Plot the average precision line
 if show_avg_precision:
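Reviewer note: the tightened `valid` mask above keeps only the part of each iso-F1 curve that lies below the attainable precision-recall boundary (`pr_boundary`) and above the 1e-3 floor. The iso-curves themselves come from solving F1 = 2·P·R / (P + R) for precision, i.e. P = F1·R / (2·R − F1), which is exactly the `y_vals` expression in the context line above, with `x_vals` playing the role of recall.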
@@ -1757,11 +1811,12 @@ def plot_pr_binary(
 ax.grid(False)
 ax.legend(loc=legend_loc)
 return ax
-
+
+
 def plot_cm(
 cm,
 labels_name=None,
-thresh=0.8,
+thresh=0.8, # for set color
 axis_labels=None,
 cmap="Reds",
 normalize=True,
@@ -2048,7 +2103,7 @@ def predict(
 y_train: pd.Series,
 x_true: pd.DataFrame = None,
 y_true: Optional[pd.Series] = None,
-backward:bool=False,
+backward: bool = False, # backward_regression
 common_features: set = None,
 purpose: str = "classification", # 'classification' or 'regression'
 cls: Optional[Dict[str, Any]] = None,
@@ -2242,22 +2297,22 @@ def predict(
 x_train = x_train.drop(y_train_col_name, axis=1)
 # else:
 # y_train = ips.df_encoder(pd.DataFrame(y_train), method="dummy").values.ravel()
-y_train=pd.DataFrame(y_train)
+y_train = pd.DataFrame(y_train)
 if y_train.select_dtypes(include=np.number).empty:
-y_train_=ips.df_encoder(y_train, method="dummy",drop=None)
-is_binary = False if y_train_.shape[1] >2 else True
+y_train_ = ips.df_encoder(y_train, method="dummy", drop=None)
+is_binary = False if y_train_.shape[1] > 2 else True
 else:
-y_train_=ips.flatten(y_train.values)
-is_binary = False if len(y_train_)>2 else True
+y_train_ = ips.flatten(y_train.values)
+is_binary = False if len(y_train_) > 2 else True

 if is_binary:
-y_train = ips.df_encoder(pd.DataFrame(y_train), method="label")
-print(
+y_train = ips.df_encoder(pd.DataFrame(y_train), method="label")
+print("is_binary:", is_binary)

 # Perform backward feature selection
 if backward:
 selected_features = backward_regression(x_train, y_train, threshold_out=0.05)
-x_train=x_train[selected_features]
+x_train = x_train[selected_features]

 if x_true is None:
 x_train, x_true, y_train, y_true = train_test_split(
@@ -2271,23 +2326,31 @@ def predict(
 if isinstance(y_train, str) and y_train in x_train.columns:
 y_train_col_name = y_train
 y_train = x_train[y_train]
-y_train =
+y_train = (
+ips.df_encoder(pd.DataFrame(y_train), method="label")
+if is_binary
+else y_train
+)
 x_train = x_train.drop(y_train_col_name, axis=1)
 if is_binary:
 y_train = ips.df_encoder(
 pd.DataFrame(y_train), method="label"
-).values.ravel()
+).values.ravel()

 if y_true is not None:
 if isinstance(y_true, str) and y_true in x_true.columns:
 y_true_col_name = y_true
 y_true = x_true[y_true]
-y_true =
-
+y_true = (
+ips.df_encoder(pd.DataFrame(y_true), method="label")
+if is_binary
+else y_true
+)
+y_true = pd.DataFrame(y_true)
 x_true = x_true.drop(y_true_col_name, axis=1)
 if is_binary:
 y_true = ips.df_encoder(pd.DataFrame(y_true), method="label").values.ravel()
-y_true =
+y_true = pd.DataFrame(y_true)

 # to convert the 2D to 1D: 2D column-vector format (like [[1], [0], [1], ...]) instead of a 1D array ([1, 0, 1, ...]

@@ -2295,10 +2358,14 @@ def predict(
 # y_true=y_true.values.ravel() if y_true is not None else None
 if y_train is not None:
 y_train = (
-y_train.ravel()
+y_train.ravel()
+if isinstance(y_train, np.ndarray)
+else y_train.values.ravel()
 )
 if y_true is not None:
-y_true =
+y_true = (
+y_true.ravel() if isinstance(y_true, np.ndarray) else y_true.values.ravel()
+)
 # Ensure common features are selected
 if common_features is not None:
 x_train, x_true = x_train[common_features], x_true[common_features]
@@ -2307,7 +2374,9 @@ def predict(
 x_train, x_true = x_train[share_col_names], x_true[share_col_names]

 x_train, x_true = ips.df_scaler(x_train), ips.df_scaler(x_true)
-x_train, x_true = ips.df_encoder(x_train, method="dummy"), ips.df_encoder(
+x_train, x_true = ips.df_encoder(x_train, method="dummy"), ips.df_encoder(
+x_true, method="dummy"
+)
 # Handle class imbalance using SMOTE (only for classification)
 if (
 smote
@@ -2320,11 +2389,11 @@ def predict(
 x_train, y_train = smote_sampler.fit_resample(x_train, y_train)
 if not is_binary:
 if isinstance(y_train, np.ndarray):
-y_train = ips.df_encoder(data=pd.DataFrame(y_train),method=
-y_train=np.asarray(y_train)
+y_train = ips.df_encoder(data=pd.DataFrame(y_train), method="label")
+y_train = np.asarray(y_train)
 if isinstance(y_train, np.ndarray):
-y_true = ips.df_encoder(data=pd.DataFrame(y_true),method=
-y_true=np.asarray(y_true)
+y_true = ips.df_encoder(data=pd.DataFrame(y_true), method="label")
+y_true = np.asarray(y_true)
 # Hyperparameter grids for tuning
 if cv_level in ["low", "simple", "s", "l"]:
 param_grids = {
@@ -2908,14 +2977,16 @@ def predict(
 clf,
 param_grid=param_grids.get(name, {}),
 scoring=(
-"roc_auc"
+"roc_auc"
+if purpose == "classification"
+else "neg_mean_squared_error"
 ),
 cv=cv,
 n_jobs=n_jobs,
 verbose=verbose,
 )

-gs.fit(x_train, y_train)
+gs.fit(x_train, y_train)
 best_clf = gs.best_estimator_
 # make sure x_train and x_test has the same name
 x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
@@ -2924,7 +2995,9 @@ def predict(
 y_pred_proba = best_clf.predict_proba(x_true)
 print("Shape of predicted probabilities:", y_pred_proba.shape)
 if y_pred_proba.shape[1] == 1:
-y_pred_proba = np.hstack(
+y_pred_proba = np.hstack(
+[1 - y_pred_proba, y_pred_proba]
+) # Add missing class probabilities
 y_pred_proba = y_pred_proba[:, 1]
 elif hasattr(best_clf, "decision_function"):
 # If predict_proba is not available, use decision_function (e.g., for SVM)
@@ -2940,7 +3013,9 @@ def predict(
 clf,
 param_grid=param_grids.get(name, {}),
 scoring=(
-"roc_auc_ovr"
+"roc_auc_ovr"
+if purpose == "classification"
+else "neg_mean_squared_error"
 ),
 cv=cv,
 n_jobs=n_jobs,
@@ -2948,7 +3023,7 @@ def predict(
 )

 # Fit GridSearchCV
-gs.fit(x_train, y_train)
+gs.fit(x_train, y_train)
 best_clf = gs.best_estimator_

 # Ensure x_true aligns with x_train columns
@@ -2960,14 +3035,18 @@ def predict(
 y_pred_proba = best_clf.predict_proba(x_true)
 elif hasattr(best_clf, "decision_function"):
 y_pred_proba = best_clf.decision_function(x_true)
-
+
 # Normalize for multiclass if necessary
 if y_pred_proba.ndim == 2:
-y_pred_proba = (
-
+y_pred_proba = (
+y_pred_proba - y_pred_proba.min(axis=1, keepdims=True)
+) / (
+y_pred_proba.max(axis=1, keepdims=True)
+- y_pred_proba.min(axis=1, keepdims=True)
+)
 else:
 y_pred_proba = None # No probability output for certain models
-
+
 validation_scores = {}

 if y_true is not None and y_pred_proba is not None:
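Reviewer note: the re-wrapped block above min-max scales each row of the `decision_function` output so that multi-class scores land in [0, 1]. A standalone sketch of the same transform on synthetic scores (not tied to any particular classifier; a constant row would divide by zero here, exactly as in the original expression):

import numpy as np

scores = np.array([[1.5, -0.2, 0.7],
                   [-1.0, 2.0, 0.5]])  # decision_function-style scores, one row per sample

row_min = scores.min(axis=1, keepdims=True)
row_max = scores.max(axis=1, keepdims=True)
scaled = (scores - row_min) / (row_max - row_min)  # each row now spans 0..1
print(scaled)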
@@ -2985,7 +3064,9 @@ def predict(
 if y_pred_proba is not None:
 # fpr, tpr, roc_auc = dict(), dict(), dict()
 fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
-lower_ci, upper_ci = cal_auc_ci(
+lower_ci, upper_ci = cal_auc_ci(
+y_true, y_pred_proba, verbose=False, is_binary=is_binary
+)
 roc_auc = auc(fpr, tpr)
 roc_info = {
 "fpr": fpr.tolist(),
@@ -3030,11 +3111,13 @@ def predict(
 y_pred_proba.tolist() if y_pred_proba is not None else None
 ),
 }
-else:
+else: # multi-classes
 if y_pred_proba is not None:
 # fpr, tpr, roc_auc = dict(), dict(), dict()
 # fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
-confidence_intervals = cal_auc_ci(
+confidence_intervals = cal_auc_ci(
+y_true, y_pred_proba, verbose=False, is_binary=is_binary
+)
 roc_info = {
 "fpr": validation_scores["fpr"],
 "tpr": validation_scores["tpr"],
|
 "ci95": confidence_intervals,
 }
 # precision-recall curve
-precision_, recall_, avg_precision_ = cal_precision_recall(
+precision_, recall_, avg_precision_ = cal_precision_recall(
+y_true, y_pred_proba, is_binary=is_binary
+)
 pr_info = {
 "precision": precision_,
 "recall": recall_,
|
 }

 else:
-
-
-
-
-
-
-
-
+if not y_true:
+validation_scores = []
+else:
+validation_scores = cal_metrics(
+y_true,
+y_pred,
+y_pred_proba=y_pred_proba,
+is_binary=is_binary,
+purpose=purpose,
+average="weighted",
+)
 results[name] = {
 "best_clf": gs.best_estimator_,
 "best_params": gs.best_params_,
|
 "predictions_proba": (
 y_pred_proba.tolist() if y_pred_proba is not None else None
 ),
-"y_train":y_train if y_train is not None else [],
-"y_true": y_true if y_true is not None else []
+"y_train": y_train if y_train is not None else [],
+"y_true": y_true if y_true is not None else [],
 }

 # Convert results to DataFrame
|
 plot.figsets(xangle=30)
 if dir_save:
 ips.figsave(dir_save + f"scores_sorted_heatmap{now_}.pdf")
-
-df_scores=df_scores.select_dtypes(include=np.number)
+
+df_scores = df_scores.select_dtypes(include=np.number)

 if df_scores.shape[0] > 1: # draw cluster
 plot.heatmap(df_scores, kind="direct", cluster=True)
|
 if all([plot_, y_true is not None, purpose == "classification"]):
 # try:
 if len(models) > 3:
-plot_validate_features(df_results,is_binary=is_binary)
+plot_validate_features(df_results, is_binary=is_binary)
 else:
 plot_validate_features_single(df_results, is_binary=is_binary)
 if dir_save:
|


 def cal_metrics(
-y_true,
+y_true,
+y_pred,
+y_pred_proba=None,
+is_binary=True,
+purpose="regression",
+average="weighted",
 ):
 """
 Calculate regression or classification metrics based on the purpose.
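Reviewer note: `cal_metrics` now lists one keyword per line; the call site added in `predict()` above passes the same set of arguments. A hedged usage sketch with toy labels (assuming `ml2ls` imports as a py2ls submodule; the function returns a dict of scores such as specificity, roc_auc and pr_auc, as the following hunk shows):

import numpy as np
from py2ls import ml2ls

y_true = np.array([0, 1, 1, 0, 1])
y_pred = np.array([0, 1, 0, 0, 1])
y_pred_proba = np.array([0.1, 0.9, 0.4, 0.2, 0.8])

scores = ml2ls.cal_metrics(
    y_true,
    y_pred,
    y_pred_proba=y_pred_proba,
    is_binary=True,
    purpose="classification",
    average="weighted",
)
print(scores)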
@@ -3216,33 +3309,38 @@ def cal_metrics(
 tn, fp, fn, tp = cm.ravel()
 else:
 # Handle single-class predictions
-tn, fp, fn, tp = 0, 0, 0, 0
+tn, fp, fn, tp = 0, 0, 0, 0
 print("Warning: Only one class found in y_pred or y_true.")

 # Specificity calculation
-validation_scores["specificity"] = (
-tn / (tn + fp) if (tn + fp) > 0 else 0
-)
+validation_scores["specificity"] = tn / (tn + fp) if (tn + fp) > 0 else 0
 if y_pred_proba is not None:
 # Calculate ROC-AUC
 validation_scores["roc_auc"] = roc_auc_score(y_true, y_pred_proba)
 # PR-AUC (Precision-Recall AUC) calculation
-validation_scores["pr_auc"] = average_precision_score(
-
-
+validation_scores["pr_auc"] = average_precision_score(
+y_true, y_pred_proba
+)
+
+else: # multi-class
 from sklearn.preprocessing import label_binarize
-
-
+
+# * Multi-class ROC calculation
+y_pred_proba = np.asarray(y_pred_proba)
 classes = np.unique(y_true)
 y_true_bin = label_binarize(y_true, classes=classes)
 if isinstance(y_true, np.ndarray):
-y_true = ips.df_encoder(
+y_true = ips.df_encoder(
+data=pd.DataFrame(y_true), method="dum", prefix="Label"
+)
 # Initialize dictionaries to store FPR, TPR, and AUC for each class
 fpr = dict()
 tpr = dict()
-roc_auc = dict()
+roc_auc = dict()
 for i, class_label in enumerate(classes):
-fpr[class_label], tpr[class_label], _ = roc_curve(
+fpr[class_label], tpr[class_label], _ = roc_curve(
+y_true_bin[:, i], y_pred_proba[:, i]
+)
 roc_auc[class_label] = auc(fpr[class_label], tpr[class_label])

 # Store the mean ROC AUC
@@ -3267,6 +3365,7 @@ def cal_metrics(

 return validation_scores

+
 def plot_trees(
 X, y, cls, max_trees=500, test_size=0.2, random_state=42, early_stopping_rounds=None
 ):
@@ -3303,6 +3402,7 @@ def plot_trees(
 ExtraTreesClassifier,
 )
 from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
+
 # Split data for training and testing error calculation
 x_train, x_test, y_train, y_test = train_test_split(
 X, y, test_size=test_size, random_state=random_state
@@ -3361,7 +3461,9 @@ def plot_trees(
 if validation_error[-early_stopping_rounds:] == sorted(
 validation_error[-early_stopping_rounds:]
 ):
-print(
+print(
+f"Early stopping at tree {i} due to lack of improvement in validation error."
+)
 break

 # Plot results
@@ -3407,16 +3509,17 @@ def plot_trees(
 plt.grid(True)
 plt.show()

+
 def img_datasets_preprocessing(
 data: pd.DataFrame,
 x_col: str,
-y_col: str=None,
+y_col: str = None,
 target_size: tuple = (224, 224),
 batch_size: int = 128,
 class_mode: str = "raw",
 shuffle: bool = False,
 augment: bool = False,
-scaler: str =
+scaler: str = "normalize", # 'normalize', 'standardize', 'clahe', 'raw'
 grayscale: bool = False,
 encoder: str = "label", # Options: 'label', 'onehot', 'binary'
 label_encoder=None,
@@ -3461,16 +3564,29 @@ def img_datasets_preprocessing(
 x_col in data.columns and y_col in data.columns
 ), "Missing required columns in DataFrame."
 if y_col is None:
-class_mode=None
+class_mode = None
 # 输出格式
-output = ips.strcmp(
-
-
-
+output = ips.strcmp(
+output,
+[
+"generator",
+"tf",
+"iterator",
+"transform",
+"transformer",
+"dataframe",
+"df",
+"pd",
+"pandas",
+],
+)[0]
+
 # Handle missing file paths
 if drop_missing:
 data = data[
-data[x_col].apply(
+data[x_col].apply(
+lambda path: os.path.exists(path) and os.path.isfile(path)
+)
 ]

 # Encoding labels if necessary
@@ -3502,11 +3618,11 @@ def img_datasets_preprocessing(
 aug_params.update(kws_augmentation)
 dat = ImageDataGenerator(rescale=scaler, **aug_params)
 dat = ImageDataGenerator(
-
+rescale=1.0 / 255 if scaler == "normalize" else None, **aug_params
+)

 else:
-dat = ImageDataGenerator(
-rescale=1.0 / 255 if scaler == 'normalize' else None)
+dat = ImageDataGenerator(rescale=1.0 / 255 if scaler == "normalize" else None)

 # Create DataFrameIterator
 data_iterator = dat.flow_from_dataframe(
@@ -3529,14 +3645,14 @@ def img_datasets_preprocessing(

 # Load, resize, and process images in batches
 for i, (batch_images, batch_labels) in enumerate(data_iterator):
-for img, label in zip(batch_images, batch_labels):
-if scaler == [
+for img, label in zip(batch_images, batch_labels):
+if scaler == ["normalize", "raw"]:
 # Already rescaled by 1.0/255 in ImageDataGenerator
 pass
-elif scaler ==
+elif scaler == "standardize":
 # Standardize by subtracting mean and dividing by std
 img = (img - np.mean(img)) / np.std(img)
-elif scaler ==
+elif scaler == "clahe":
 # Apply CLAHE to the image
 img = apply_clahe(img)
 flat_img = img.flatten()
@@ -3561,11 +3677,13 @@ def img_datasets_preprocessing(
 return df_img


-def backward_regression(
+def backward_regression(
+X: pd.DataFrame, y: pd.Series, initial_list=[], threshold_out=0.05, verbose=True
+):
 """
 # awesome bit of code from https://www.kaggle.com/code/adibouayjan/house-price-step-by-step-modeling
-
-Evaluates the p-values of all features, which represent the probability of observing a coefficient
+
+Evaluates the p-values of all features, which represent the probability of observing a coefficient
 as extreme as the one calculated if the feature had no true effect on the target.

 Args:
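Reviewer note: `backward_regression` keeps its behaviour; only the signature wrapping and docstring spacing change. A hedged usage sketch on a toy DataFrame (statsmodels is required, as the in-function import shows; the column names and data are made up for illustration):

import numpy as np
import pandas as pd
from py2ls import ml2ls

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(100, 4)), columns=["f1", "f2", "f3", "f4"])
y = pd.Series(2 * X["f1"] - 3 * X["f3"] + rng.normal(size=100))

# iteratively drops the feature with the worst p-value until all remaining
# p-values are below threshold_out, then returns the selected feature names
selected = ml2ls.backward_regression(X, y, threshold_out=0.05)
print(selected)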
@@ -3576,9 +3694,10 @@ def backward_regression(X:pd.DataFrame, y:pd.Series, initial_list=[], threshold_
 verbose -- true to produce lots of logging output

 Returns:
-list of selected features for modeling
+list of selected features for modeling
 """
 import statsmodels.api as sm
+
 if isinstance(y, str) and y in X.columns:
 y_col_name = y
 y = X[y]
@@ -3600,15 +3719,16 @@ def backward_regression(X:pd.DataFrame, y:pd.Series, initial_list=[], threshold_
 break
 print(f"\nSelected Features:\n{included}")
 return included # Returns the list of selected features
-
+

 # Function to apply CLAHE (Contrast Limited Adaptive Histogram Equalization)
 def apply_clahe(img):
 import cv2
+
 lab = cv2.cvtColor(img, cv2.COLOR_RGB2LAB) # Convert to LAB color space
 l, a, b = cv2.split(lab) # Split into channels
 clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
 cl = clahe.apply(l) # Apply CLAHE to the L channel
 limg = cv2.merge((cl, a, b)) # Merge back the channels
 img_clahe = cv2.cvtColor(limg, cv2.COLOR_LAB2RGB) # Convert back to RGB
-return img_clahe
+return img_clahe
@@ -236,7 +236,7 @@ py2ls/freqanalysis.py,sha256=F4218VSPbgL5tnngh6xNCYuNnfR-F_QjECUUxrPYZss,32594
 py2ls/ich2ls.py,sha256=3E9R8oVpyYZXH5PiIQgT3CN5NxLe4Dwtm2LwaeacE6I,21381
 py2ls/ips.py,sha256=2TWuOSFquwhmPdxkmmvU_pcIbE5M0S9aRPtuQgs5B7A,297706
 py2ls/ml2ls copy.py,sha256=iZJrFLIrdfTieAY2BDsxQFTm29smwnJh0aC4hRB9VGM,113314
-py2ls/ml2ls.py,sha256=
+py2ls/ml2ls.py,sha256=wvQkhcOsBiysgfaRmeT2KAR5C8uFOaX3HeyDA2Oy1LI,146065
 py2ls/mol.py,sha256=AZnHzarIk_MjueKdChqn1V6e4tUle3X1NnHSFA6n3Nw,10645
 py2ls/netfinder.py,sha256=R70NkrnO8LlXjT1y7bf2TN-yE4yOeAYhb0jDBiNp8XA,57536
 py2ls/ocr.py,sha256=5lhUbJufIKRSOL6wAWVLEo8TqMYSjoI_Q-IO-_4u3DE,31419
@@ -246,6 +246,6 @@ py2ls/sleep_events_detectors.py,sha256=bQA3HJqv5qnYKJJEIhCyhlDtkXQfIzqksnD0YRXso
 py2ls/stats.py,sha256=qBn2rJmNa_QLLUqjwYqXUlGzqmW94sgA1bxJU2FC3r0,39175
 py2ls/translator.py,sha256=77Tp_GjmiiwFbEIJD_q3VYpQ43XL9ZeJo6Mhl44mvh8,34284
 py2ls/wb_detector.py,sha256=7y6TmBUj9exCZeIgBAJ_9hwuhkDh1x_-yg4dvNY1_GQ,6284
-py2ls-0.2.4.
-py2ls-0.2.4.
-py2ls-0.2.4.
+py2ls-0.2.4.20.dist-info/METADATA,sha256=iCOFX-A3J17xwkEB2UdDpS5A7kQuRcBCJVq5x8BtqPg,20078
+py2ls-0.2.4.20.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
+py2ls-0.2.4.20.dist-info/RECORD,,