py2ls 0.2.4.18__py3-none-any.whl → 0.2.4.20__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
py2ls/ml2ls.py
CHANGED
@@ -702,7 +702,7 @@ def get_features(
 "AdaBoost",
 ]
 cls = [ips.strcmp(i, cls_)[0] for i in cls]
-
+
 feature_importances = {}

 # Lasso Feature Selection
@@ -714,7 +714,7 @@ def get_features(
 lasso_selected_features = (
 lasso_importances.head(n_features)["feature"].values if "lasso" in cls else []
 )
-feature_importances[
+feature_importances["lasso"] = lasso_importances.head(n_features)
 # Ridge
 ridge_importances = (
 features_ridge(x_train, y_train, ridge_params)
@@ -724,7 +724,7 @@ def get_features(
 selected_ridge_features = (
 ridge_importances.head(n_features)["feature"].values if "ridge" in cls else []
 )
-feature_importances[
+feature_importances["ridge"] = ridge_importances.head(n_features)
 # Elastic Net
 enet_importances = (
 features_enet(x_train, y_train, enet_params)
@@ -734,7 +734,7 @@ def get_features(
 selected_enet_features = (
 enet_importances.head(n_features)["feature"].values if "Enet" in cls else []
 )
-feature_importances[
+feature_importances["Enet"] = enet_importances.head(n_features)
 # Random Forest Feature Importance
 rf_importances = (
 features_rf(x_train, y_train, rf_params)
@@ -746,7 +746,7 @@ def get_features(
 if "Random Forest" in cls
 else []
 )
-feature_importances[
+feature_importances["Random Forest"] = rf_importances.head(n_features)
 # Gradient Boosting Feature Importance
 gb_importances = (
 features_gradient_boosting(x_train, y_train, gb_params)
@@ -758,7 +758,7 @@ def get_features(
 if "Gradient Boosting" in cls
 else []
 )
-feature_importances[
+feature_importances["Gradient Boosting"] = gb_importances.head(n_features)
 # xgb
 xgb_importances = (
 features_xgb(x_train, y_train, xgb_params) if "xgb" in cls else pd.DataFrame()
@@ -766,7 +766,7 @@ def get_features(
 top_xgb_features = (
 xgb_importances.head(n_features)["feature"].values if "xgb" in cls else []
 )
-feature_importances[
+feature_importances["xgb"] = xgb_importances.head(n_features)

 # SVM with RFE
 selected_svm_features = (
@@ -781,7 +781,7 @@ def get_features(
 selected_lda_features = (
 lda_importances.head(n_features)["feature"].values if "lda" in cls else []
 )
-feature_importances[
+feature_importances["lda"] = lda_importances.head(n_features)
 # AdaBoost Feature Importance
 adaboost_importances = (
 features_adaboost(x_train, y_train, adaboost_params)
@@ -793,7 +793,7 @@ def get_features(
 if "AdaBoost" in cls
 else []
 )
-feature_importances[
+feature_importances["AdaBoost"] = adaboost_importances.head(n_features)
 # Decision Tree Feature Importance
 dt_importances = (
 features_decision_tree(x_train, y_train, dt_params)
@@ -804,8 +804,8 @@ def get_features(
 dt_importances.head(n_features)["feature"].values
 if "Decision Tree" in cls
 else []
-)
-feature_importances[
+)
+feature_importances["Decision Tree"] = dt_importances.head(n_features)
 # Bagging Feature Importance
 bagging_importances = (
 features_bagging(x_train, y_train, bagging_params)
@@ -817,7 +817,7 @@ def get_features(
 if "Bagging" in cls
 else []
 )
-feature_importances[
+feature_importances["Bagging"] = bagging_importances.head(n_features)
 # KNN Feature Importance via Permutation
 knn_importances = (
 features_knn(x_train, y_train, knn_params) if "KNN" in cls else pd.DataFrame()
@@ -825,7 +825,7 @@ def get_features(
 top_knn_features = (
 knn_importances.head(n_features)["feature"].values if "KNN" in cls else []
 )
-feature_importances[
+feature_importances["KNN"] = knn_importances.head(n_features)

 #! Find common features
 common_features = ips.shared(
@@ -928,7 +928,7 @@ def get_features(
 "cv_train_scores": cv_train_results_df,
 "cv_test_scores": rank_models(cv_test_results_df, plot_=plot_),
 "common_features": list(common_features),
-"feature_importances":feature_importances
+"feature_importances": feature_importances,
 }
 if all([plot_, dir_save]):
 from datetime import datetime
@@ -941,7 +941,7 @@ def get_features(
 "cv_train_scores": pd.DataFrame(),
 "cv_test_scores": pd.DataFrame(),
 "common_features": [],
-"feature_importances":{}
+"feature_importances": {},
 }
 print(f"Warning: 没有找到共同的genes, when n_shared={n_shared}")
 return results
@@ -1232,7 +1232,7 @@ def validate_features(

 # # If you want to access validation scores
 # print(validation_results)
-def plot_validate_features(res_val,is_binary=True,figsize=None):
+def plot_validate_features(res_val, is_binary=True, figsize=None):
 """
 plot the results of 'validate_features()'
 """
@@ -1295,26 +1295,28 @@ def plot_validate_features(res_val,is_binary=True,figsize=None):
 )
 plot.figsets(
 sp=2,
-legend=dict(
+legend=dict(
+loc="upper right", ncols=1, fontsize=8, bbox_to_anchor=[1.5, 0.5]
+),
 )
 # plot.split_legend(ax,n=2, loc=["upper left", "lower left"],bbox=[[1,0.5],[1,0.5]],ncols=2,labelcolor="k",fontsize=8)
 else:
 colors = plot.get_color(len(ips.flatten(res_val["pr_curve"].index)))
-modname_tmp=ips.flatten(res_val["roc_curve"].index)[0]
-classes=list(res_val["roc_curve"][modname_tmp][
+modname_tmp = ips.flatten(res_val["roc_curve"].index)[0]
+classes = list(res_val["roc_curve"][modname_tmp]["fpr"].keys())
 if res_val.shape[0] > 5:
 alpha = 0
-figsize = [8, 8*2*(len(classes))]
+figsize = [8, 8 * 2 * (len(classes))] if figsize is None else figsize
 subplot_layout = [1, 2]
 ncols = 2
 bbox_to_anchor = [1.5, 0.6]
 else:
 alpha = 0.03
-figsize = [10, 6*(len(classes))] if figsize is None else figsize
+figsize = [10, 6 * (len(classes))] if figsize is None else figsize
 subplot_layout = [1, 1]
 ncols = 1
 bbox_to_anchor = [1, 1]
-nexttile = plot.subplot(2*(len(classes)),2,figsize=figsize)
+nexttile = plot.subplot(2 * (len(classes)), 2, figsize=figsize)
 for iclass, class_ in enumerate(classes):
 ax = nexttile(subplot_layout[0], subplot_layout[1])
 for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
@@ -1352,7 +1354,9 @@ def plot_validate_features(res_val,is_binary=True,figsize=None):
 plot_pr_curve(
 recall=res_val["pr_curve"][model_name]["recall"][iclass],
 precision=res_val["pr_curve"][model_name]["precision"][iclass],
-avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
+avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
+iclass
+],
 model_name=model_name,
 color=colors[i],
 lw=1.5,
@@ -1362,13 +1366,20 @@ def plot_validate_features(res_val,is_binary=True,figsize=None):
 plot.figsets(
 sp=2,
 title=class_,
-legend=dict(
+legend=dict(
+loc="upper right", ncols=1, fontsize=8, bbox_to_anchor=[1.5, 0.5]
+),
 )

-
+
+def plot_validate_features_single(res_val, figsize=None, is_binary=True):
 if is_binary:
 if figsize is None:
-nexttile = plot.subplot(
+nexttile = plot.subplot(
+len(ips.flatten(res_val["pr_curve"].index)),
+3,
+figsize=[13, 4 * len(ips.flatten(res_val["pr_curve"].index))],
+)
 else:
 nexttile = plot.subplot(
 len(ips.flatten(res_val["pr_curve"].index)), 3, figsize=figsize
@@ -1380,8 +1391,15 @@ def plot_validate_features_single(res_val, figsize=None,is_binary=True):
 mean_auc = res_val["roc_curve"][model_name]["auc"]

 # Plotting
-plot_roc_curve(
-
+plot_roc_curve(
+fpr,
+tpr,
+mean_auc,
+lower_ci,
+upper_ci,
+model_name=model_name,
+ax=nexttile(),
+)
 plot.figsets(title=model_name, sp=2)

 plot_pr_binary(
@@ -1394,14 +1412,18 @@ def plot_validate_features_single(res_val, figsize=None,is_binary=True):
 plot.figsets(title=model_name, sp=2)

 # plot cm
-plot_cm(
+plot_cm(
+res_val["confusion_matrix"][model_name], ax=nexttile(), normalize=False
+)
 plot.figsets(title=model_name, sp=2)
 else:
-
-modname_tmp=ips.flatten(res_val["roc_curve"].index)[0]
-classes=list(res_val["roc_curve"][modname_tmp][
+
+modname_tmp = ips.flatten(res_val["roc_curve"].index)[0]
+classes = list(res_val["roc_curve"][modname_tmp]["fpr"].keys())
 if figsize is None:
-nexttile = plot.subplot(
+nexttile = plot.subplot(
+len(modname_tmp), 3, figsize=[15, len(modname_tmp) * 5]
+)
 else:
 nexttile = plot.subplot(len(modname_tmp), 3, figsize=figsize)
 colors = plot.get_color(len(classes))
@@ -1429,16 +1451,18 @@ def plot_validate_features_single(res_val, figsize=None,is_binary=True):
 title=model_name,
 legend=dict(
 loc="best",
-fontsize=8,
+fontsize=8,
 ),
-)
+)

 ax = nexttile()
 for iclass, class_ in enumerate(classes):
 plot_pr_curve(
 recall=res_val["pr_curve"][model_name]["recall"][iclass],
 precision=res_val["pr_curve"][model_name]["precision"][iclass],
-avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
+avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
+iclass
+],
 model_name=class_,
 color=colors[iclass],
 lw=1.5,
@@ -1450,17 +1474,21 @@ def plot_validate_features_single(res_val, figsize=None,is_binary=True):
 title=class_,
 legend=dict(loc="best", fontsize=8),
 )
-
-plot_cm(
+
+plot_cm(
+res_val["confusion_matrix"][model_name],
+labels_name=classes,
+ax=nexttile(),
+normalize=False,
+)
 plot.figsets(title=model_name, sp=2)


-def cal_precision_recall(
-y_true, y_pred_proba, is_binary=True):
+def cal_precision_recall(y_true, y_pred_proba, is_binary=True):
 if is_binary:
 precision_, recall_, _ = precision_recall_curve(y_true, y_pred_proba)
 avg_precision_ = average_precision_score(y_true, y_pred_proba)
-return precision_, recall_,avg_precision_
+return precision_, recall_, avg_precision_
 else:
 n_classes = y_pred_proba.shape[1] # Number of classes
 precision_ = []
@@ -1469,7 +1497,9 @@ def cal_precision_recall(
 # One-vs-rest approach for multi-class precision-recall curve
 for class_idx in range(n_classes):
 precision, recall, _ = precision_recall_curve(
-(y_true == class_idx).astype(
+(y_true == class_idx).astype(
+int
+), # Binarize true labels for the current class
 y_pred_proba[:, class_idx], # Probabilities for the current class
 )

@@ -1479,14 +1509,23 @@ def cal_precision_recall(
 avg_precision_ = []
 for class_idx in range(n_classes):
 avg_precision = average_precision_score(
-(y_true == class_idx).astype(
+(y_true == class_idx).astype(
+int
+), # Binarize true labels for the current class
 y_pred_proba[:, class_idx], # Probabilities for the current class
 )
 avg_precision_.append(avg_precision)
-return precision_, recall_,avg_precision_
-
+return precision_, recall_, avg_precision_
+
+
 def cal_auc_ci(
-y_true,
+y_true,
+y_pred,
+n_bootstraps=1000,
+ci=0.95,
+random_state=1,
+is_binary=True,
+verbose=True,
 ):
 if is_binary:
 y_true = np.asarray(y_true)
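The cal_precision_recall hunks above rework the multi-class branch around a one-vs-rest decomposition: each class is binarized against the rest before precision_recall_curve and average_precision_score are applied. As a point of reference only, a minimal self-contained sketch of that pattern follows; the function name ovr_precision_recall and all variable names are illustrative assumptions, not py2ls code.

import numpy as np
from sklearn.metrics import precision_recall_curve, average_precision_score

def ovr_precision_recall(y_true, y_proba):
    # One-vs-rest precision/recall curves for a multi-class problem.
    # y_true : (n_samples,) integer class labels 0..K-1
    # y_proba: (n_samples, K) predicted class probabilities
    y_true = np.asarray(y_true)
    n_classes = y_proba.shape[1]
    precisions, recalls, avg_precisions = [], [], []
    for k in range(n_classes):
        y_bin = (y_true == k).astype(int)  # binarize: class k vs. rest
        p, r, _ = precision_recall_curve(y_bin, y_proba[:, k])
        precisions.append(p)
        recalls.append(r)
        avg_precisions.append(average_precision_score(y_bin, y_proba[:, k]))
    return precisions, recalls, avg_precisions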
@@ -1525,15 +1564,20 @@ def cal_auc_ci(
 return confidence_lower, confidence_upper
 else:
 from sklearn.preprocessing import label_binarize
+
 # Multi-class classification case
 y_true = np.asarray(y_true)
 y_pred = np.asarray(y_pred)

 # Binarize the multi-class labels for OvR computation
-y_true_bin = label_binarize(
+y_true_bin = label_binarize(
+y_true, classes=np.unique(y_true)
+) # One-vs-Rest transformation
 n_classes = y_true_bin.shape[1] # Number of classes
-
-bootstrapped_scores = np.zeros(
+
+bootstrapped_scores = np.zeros(
+(n_classes, n_bootstraps)
+) # Store scores for each class

 if verbose:
 print("AUROC scores for each class:")
@@ -1546,7 +1590,9 @@ def cal_auc_ci(
 for class_idx in range(n_classes):
 if len(np.unique(y_true_bin[indices, class_idx])) < 2:
 continue # Reject if the class doesn't have both positive and negative samples
-score = roc_auc_score(
+score = roc_auc_score(
+y_true_bin[indices, class_idx], y_pred[indices, class_idx]
+)
 bootstrapped_scores[class_idx, i] = score

 # Calculating the confidence intervals for each class
@@ -1558,8 +1604,10 @@ def cal_auc_ci(
 confidence_intervals.append((confidence_lower, confidence_upper))

 if verbose:
-print(
-
+print(
+f"Class {class_idx} - Confidence interval: [{confidence_lower:.3f} - {confidence_upper:.3f}]"
+)
+
 return confidence_intervals

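The cal_auc_ci hunks above expose the full signature (y_true, y_pred, n_bootstraps=1000, ci=0.95, random_state=1, is_binary=True, verbose=True) and, for the multi-class branch, resample per class with roc_auc_score. For orientation, here is a minimal sketch of the underlying percentile-bootstrap idea for the binary case; the function name, guard logic, and percentile indexing are assumptions for illustration, not the py2ls implementation.

import numpy as np
from sklearn.metrics import roc_auc_score

def bootstrap_auc_ci(y_true, y_score, n_bootstraps=1000, ci=0.95, random_state=1):
    # Percentile-bootstrap confidence interval for the binary ROC-AUC.
    rng = np.random.RandomState(random_state)
    y_true, y_score = np.asarray(y_true), np.asarray(y_score)
    scores = []
    for _ in range(n_bootstraps):
        idx = rng.randint(0, len(y_true), len(y_true))  # resample with replacement
        if len(np.unique(y_true[idx])) < 2:
            continue  # AUC is undefined unless both classes are present
        scores.append(roc_auc_score(y_true[idx], y_score[idx]))
    scores = np.sort(scores)
    alpha = (1.0 - ci) / 2.0
    lower = scores[int(alpha * len(scores))]
    upper = scores[min(int((1.0 - alpha) * len(scores)), len(scores) - 1)]
    return lower, upper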
@@ -1619,6 +1667,7 @@ def plot_roc_curve(
 # ml2ls.plot_roc_curve(fpr, tpr, mean_auc, lower_ci, upper_ci)
 # figsets(title=model_name)

+
 def plot_pr_curve(
 recall=None,
 precision=None,
@@ -1661,6 +1710,7 @@ def plot_pr_curve(
 ax.legend(loc=legend_loc)
 return ax

+
 # * usage: ml2ls.plot_pr_curve()
 # for md_name in flatten(validation_results["pr_curve"].keys()):
 # ml2ls.plot_pr_curve(
@@ -1673,6 +1723,7 @@ def plot_pr_curve(
 # color="r",
 # )

+
 def plot_pr_binary(
 recall=None,
 precision=None,
@@ -1689,19 +1740,20 @@ def plot_pr_binary(
 ax=None,
 show_avg_precision=False,
 **kwargs,
-
+):
 from scipy.interpolate import interp1d
+
 if ax is None:
 fig, ax = plt.subplots(figsize=figsize)
 model_name = "Binary PR Curve" if model_name is None else model_name

-
+# * use sklearn bulitin function 'PrecisionRecallDisplay'?
 # from sklearn.metrics import PrecisionRecallDisplay
-# disp = PrecisionRecallDisplay(precision=precision,
-# recall=recall,
+# disp = PrecisionRecallDisplay(precision=precision,
+# recall=recall,
 # average_precision=avg_precision,**kwargs)
 # disp.plot(ax=ax, name=model_name, color=color)
-
+
 # Plot Precision-Recall curve
 ax.plot(
 recall,
@@ -1729,15 +1781,17 @@ def plot_pr_binary(
 y_vals = f_score * x_vals / (2 * x_vals - f_score)
 y_vals_clipped = np.minimum(y_vals, pr_boundary(x_vals))
 y_vals_clipped = np.clip(y_vals_clipped, 1e-3, None) # Prevent going to zero
-valid =
-valid_ = y_vals_clipped > 1e-3
-valid = valid&valid_
-x_vals = x_vals[valid]
+valid = y_vals_clipped < pr_boundary(x_vals)
+valid_ = y_vals_clipped > 1e-3
+valid = valid & valid_
+x_vals = x_vals[valid]
 y_vals_clipped = y_vals_clipped[valid]
 if len(x_vals) > 0: # Ensure annotation is placed only if line segment exists
 ax.plot(x_vals, y_vals_clipped, color="gray", alpha=1)
-plt.annotate(
-
+plt.annotate(
+f"$f_1={f_score:0.1f}$",
+xy=(0.8, y_vals_clipped[-int(len(y_vals_clipped) * 0.35)] + 0.02),
+)

 # # Plot the average precision line
 if show_avg_precision:
@@ -1757,11 +1811,12 @@ def plot_pr_binary(
 ax.grid(False)
 ax.legend(loc=legend_loc)
 return ax
-
+
+
 def plot_cm(
 cm,
 labels_name=None,
-thresh=0.8,
+thresh=0.8, # for set color
 axis_labels=None,
 cmap="Reds",
 normalize=True,
@@ -2048,7 +2103,7 @@ def predict(
 y_train: pd.Series,
 x_true: pd.DataFrame = None,
 y_true: Optional[pd.Series] = None,
-backward:bool=False,
+backward: bool = False, # backward_regression
 common_features: set = None,
 purpose: str = "classification", # 'classification' or 'regression'
 cls: Optional[Dict[str, Any]] = None,
@@ -2242,22 +2297,22 @@ def predict(
 x_train = x_train.drop(y_train_col_name, axis=1)
 # else:
 # y_train = ips.df_encoder(pd.DataFrame(y_train), method="dummy").values.ravel()
-y_train=pd.DataFrame(y_train)
+y_train = pd.DataFrame(y_train)
 if y_train.select_dtypes(include=np.number).empty:
-y_train_=ips.df_encoder(y_train, method="dummy",drop=None)
-is_binary = False if y_train_.shape[1] >2 else True
+y_train_ = ips.df_encoder(y_train, method="dummy", drop=None)
+is_binary = False if y_train_.shape[1] > 2 else True
 else:
-y_train_=ips.flatten(y_train.values)
-is_binary = False if len(y_train_)>2 else True
+y_train_ = ips.flatten(y_train.values)
+is_binary = False if len(y_train_) > 2 else True

 if is_binary:
-y_train = ips.df_encoder(pd.DataFrame(y_train), method="label")
-print(
+y_train = ips.df_encoder(pd.DataFrame(y_train), method="label")
+print("is_binary:", is_binary)

 # Perform backward feature selection
 if backward:
 selected_features = backward_regression(x_train, y_train, threshold_out=0.05)
-x_train=x_train[selected_features]
+x_train = x_train[selected_features]

 if x_true is None:
 x_train, x_true, y_train, y_true = train_test_split(
@@ -2271,23 +2326,31 @@ def predict(
 if isinstance(y_train, str) and y_train in x_train.columns:
 y_train_col_name = y_train
 y_train = x_train[y_train]
-y_train =
+y_train = (
+ips.df_encoder(pd.DataFrame(y_train), method="label")
+if is_binary
+else y_train
+)
 x_train = x_train.drop(y_train_col_name, axis=1)
 if is_binary:
 y_train = ips.df_encoder(
 pd.DataFrame(y_train), method="label"
-).values.ravel()
+).values.ravel()

 if y_true is not None:
 if isinstance(y_true, str) and y_true in x_true.columns:
 y_true_col_name = y_true
 y_true = x_true[y_true]
-y_true =
-
+y_true = (
+ips.df_encoder(pd.DataFrame(y_true), method="label")
+if is_binary
+else y_true
+)
+y_true = pd.DataFrame(y_true)
 x_true = x_true.drop(y_true_col_name, axis=1)
 if is_binary:
 y_true = ips.df_encoder(pd.DataFrame(y_true), method="label").values.ravel()
-y_true =
+y_true = pd.DataFrame(y_true)

 # to convert the 2D to 1D: 2D column-vector format (like [[1], [0], [1], ...]) instead of a 1D array ([1, 0, 1, ...]

@@ -2295,10 +2358,14 @@ def predict(
 # y_true=y_true.values.ravel() if y_true is not None else None
 if y_train is not None:
 y_train = (
-y_train.ravel()
+y_train.ravel()
+if isinstance(y_train, np.ndarray)
+else y_train.values.ravel()
 )
 if y_true is not None:
-y_true =
+y_true = (
+y_true.ravel() if isinstance(y_true, np.ndarray) else y_true.values.ravel()
+)
 # Ensure common features are selected
 if common_features is not None:
 x_train, x_true = x_train[common_features], x_true[common_features]
@@ -2307,7 +2374,9 @@ def predict(
 x_train, x_true = x_train[share_col_names], x_true[share_col_names]

 x_train, x_true = ips.df_scaler(x_train), ips.df_scaler(x_true)
-x_train, x_true = ips.df_encoder(x_train, method="dummy"), ips.df_encoder(
+x_train, x_true = ips.df_encoder(x_train, method="dummy"), ips.df_encoder(
+x_true, method="dummy"
+)
 # Handle class imbalance using SMOTE (only for classification)
 if (
 smote
@@ -2320,11 +2389,11 @@ def predict(
 x_train, y_train = smote_sampler.fit_resample(x_train, y_train)
 if not is_binary:
 if isinstance(y_train, np.ndarray):
-y_train = ips.df_encoder(data=pd.DataFrame(y_train),method=
-y_train=np.asarray(y_train)
+y_train = ips.df_encoder(data=pd.DataFrame(y_train), method="label")
+y_train = np.asarray(y_train)
 if isinstance(y_train, np.ndarray):
-y_true = ips.df_encoder(data=pd.DataFrame(y_true),method=
-y_true=np.asarray(y_true)
+y_true = ips.df_encoder(data=pd.DataFrame(y_true), method="label")
+y_true = np.asarray(y_true)
 # Hyperparameter grids for tuning
 if cv_level in ["low", "simple", "s", "l"]:
 param_grids = {
@@ -2908,14 +2977,16 @@ def predict(
 clf,
 param_grid=param_grids.get(name, {}),
 scoring=(
-"roc_auc"
+"roc_auc"
+if purpose == "classification"
+else "neg_mean_squared_error"
 ),
 cv=cv,
 n_jobs=n_jobs,
 verbose=verbose,
 )

-gs.fit(x_train, y_train)
+gs.fit(x_train, y_train)
 best_clf = gs.best_estimator_
 # make sure x_train and x_test has the same name
 x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
@@ -2924,7 +2995,9 @@ def predict(
 y_pred_proba = best_clf.predict_proba(x_true)
 print("Shape of predicted probabilities:", y_pred_proba.shape)
 if y_pred_proba.shape[1] == 1:
-y_pred_proba = np.hstack(
+y_pred_proba = np.hstack(
+[1 - y_pred_proba, y_pred_proba]
+) # Add missing class probabilities
 y_pred_proba = y_pred_proba[:, 1]
 elif hasattr(best_clf, "decision_function"):
 # If predict_proba is not available, use decision_function (e.g., for SVM)
@@ -2940,7 +3013,9 @@ def predict(
 clf,
 param_grid=param_grids.get(name, {}),
 scoring=(
-"roc_auc_ovr"
+"roc_auc_ovr"
+if purpose == "classification"
+else "neg_mean_squared_error"
 ),
 cv=cv,
 n_jobs=n_jobs,
@@ -2948,7 +3023,7 @@ def predict(
 )

 # Fit GridSearchCV
-gs.fit(x_train, y_train)
+gs.fit(x_train, y_train)
 best_clf = gs.best_estimator_

 # Ensure x_true aligns with x_train columns
@@ -2960,14 +3035,18 @@ def predict(
 y_pred_proba = best_clf.predict_proba(x_true)
 elif hasattr(best_clf, "decision_function"):
 y_pred_proba = best_clf.decision_function(x_true)
-
+
 # Normalize for multiclass if necessary
 if y_pred_proba.ndim == 2:
-y_pred_proba = (
-
+y_pred_proba = (
+y_pred_proba - y_pred_proba.min(axis=1, keepdims=True)
+) / (
+y_pred_proba.max(axis=1, keepdims=True)
+- y_pred_proba.min(axis=1, keepdims=True)
+)
 else:
 y_pred_proba = None # No probability output for certain models
-
+
 validation_scores = {}

 if y_true is not None and y_pred_proba is not None:
@@ -2985,7 +3064,9 @@ def predict(
 if y_pred_proba is not None:
 # fpr, tpr, roc_auc = dict(), dict(), dict()
 fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
-lower_ci, upper_ci = cal_auc_ci(
+lower_ci, upper_ci = cal_auc_ci(
+y_true, y_pred_proba, verbose=False, is_binary=is_binary
+)
 roc_auc = auc(fpr, tpr)
 roc_info = {
 "fpr": fpr.tolist(),
@@ -3030,11 +3111,13 @@ def predict(
 y_pred_proba.tolist() if y_pred_proba is not None else None
 ),
 }
-else:
+else: # multi-classes
 if y_pred_proba is not None:
 # fpr, tpr, roc_auc = dict(), dict(), dict()
 # fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
-confidence_intervals = cal_auc_ci(
+confidence_intervals = cal_auc_ci(
+y_true, y_pred_proba, verbose=False, is_binary=is_binary
+)
 roc_info = {
 "fpr": validation_scores["fpr"],
 "tpr": validation_scores["tpr"],
@@ -3042,7 +3125,9 @@ def predict(
 "ci95": confidence_intervals,
 }
 # precision-recall curve
-precision_, recall_, avg_precision_ = cal_precision_recall(
+precision_, recall_, avg_precision_ = cal_precision_recall(
+y_true, y_pred_proba, is_binary=is_binary
+)
 pr_info = {
 "precision": precision_,
 "recall": recall_,
@@ -3080,14 +3165,17 @@ def predict(
 }

 else:
-
-
-
-
-
-
-
-
+if not y_true:
+validation_scores = []
+else:
+validation_scores = cal_metrics(
+y_true,
+y_pred,
+y_pred_proba=y_pred_proba,
+is_binary=is_binary,
+purpose=purpose,
+average="weighted",
+)
 results[name] = {
 "best_clf": gs.best_estimator_,
 "best_params": gs.best_params_,
@@ -3096,8 +3184,8 @@ def predict(
 "predictions_proba": (
 y_pred_proba.tolist() if y_pred_proba is not None else None
 ),
-"y_train":y_train if y_train is not None else [],
-"y_true": y_true if y_true is not None else []
+"y_train": y_train if y_train is not None else [],
+"y_true": y_true if y_true is not None else [],
 }

 # Convert results to DataFrame
@@ -3118,8 +3206,8 @@ def predict(
 plot.figsets(xangle=30)
 if dir_save:
 ips.figsave(dir_save + f"scores_sorted_heatmap{now_}.pdf")
-
-df_scores=df_scores.select_dtypes(include=np.number)
+
+df_scores = df_scores.select_dtypes(include=np.number)

 if df_scores.shape[0] > 1: # draw cluster
 plot.heatmap(df_scores, kind="direct", cluster=True)
@@ -3129,7 +3217,7 @@ def predict(
 if all([plot_, y_true is not None, purpose == "classification"]):
 # try:
 if len(models) > 3:
-plot_validate_features(df_results,is_binary=is_binary)
+plot_validate_features(df_results, is_binary=is_binary)
 else:
 plot_validate_features_single(df_results, is_binary=is_binary)
 if dir_save:
@@ -3140,7 +3228,12 @@ def predict(


 def cal_metrics(
-y_true,
+y_true,
+y_pred,
+y_pred_proba=None,
+is_binary=True,
+purpose="regression",
+average="weighted",
 ):
 """
 Calculate regression or classification metrics based on the purpose.
@@ -3216,33 +3309,38 @@ def cal_metrics(
 tn, fp, fn, tp = cm.ravel()
 else:
 # Handle single-class predictions
-tn, fp, fn, tp = 0, 0, 0, 0
+tn, fp, fn, tp = 0, 0, 0, 0
 print("Warning: Only one class found in y_pred or y_true.")

 # Specificity calculation
-validation_scores["specificity"] = (
-tn / (tn + fp) if (tn + fp) > 0 else 0
-)
+validation_scores["specificity"] = tn / (tn + fp) if (tn + fp) > 0 else 0
 if y_pred_proba is not None:
 # Calculate ROC-AUC
 validation_scores["roc_auc"] = roc_auc_score(y_true, y_pred_proba)
 # PR-AUC (Precision-Recall AUC) calculation
-validation_scores["pr_auc"] = average_precision_score(
-
-
+validation_scores["pr_auc"] = average_precision_score(
+y_true, y_pred_proba
+)
+
+else: # multi-class
 from sklearn.preprocessing import label_binarize
-
-
+
+# * Multi-class ROC calculation
+y_pred_proba = np.asarray(y_pred_proba)
 classes = np.unique(y_true)
 y_true_bin = label_binarize(y_true, classes=classes)
 if isinstance(y_true, np.ndarray):
-y_true = ips.df_encoder(
+y_true = ips.df_encoder(
+data=pd.DataFrame(y_true), method="dum", prefix="Label"
+)
 # Initialize dictionaries to store FPR, TPR, and AUC for each class
 fpr = dict()
 tpr = dict()
-roc_auc = dict()
+roc_auc = dict()
 for i, class_label in enumerate(classes):
-fpr[class_label], tpr[class_label], _ = roc_curve(
+fpr[class_label], tpr[class_label], _ = roc_curve(
+y_true_bin[:, i], y_pred_proba[:, i]
+)
 roc_auc[class_label] = auc(fpr[class_label], tpr[class_label])

 # Store the mean ROC AUC
@@ -3267,6 +3365,7 @@ def cal_metrics(

 return validation_scores

+
 def plot_trees(
 X, y, cls, max_trees=500, test_size=0.2, random_state=42, early_stopping_rounds=None
 ):
@@ -3303,6 +3402,7 @@ def plot_trees(
 ExtraTreesClassifier,
 )
 from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
+
 # Split data for training and testing error calculation
 x_train, x_test, y_train, y_test = train_test_split(
 X, y, test_size=test_size, random_state=random_state
@@ -3361,7 +3461,9 @@ def plot_trees(
 if validation_error[-early_stopping_rounds:] == sorted(
 validation_error[-early_stopping_rounds:]
 ):
-print(
+print(
+f"Early stopping at tree {i} due to lack of improvement in validation error."
+)
 break

 # Plot results
@@ -3407,16 +3509,17 @@ def plot_trees(
 plt.grid(True)
 plt.show()

+
 def img_datasets_preprocessing(
 data: pd.DataFrame,
 x_col: str,
-y_col: str=None,
+y_col: str = None,
 target_size: tuple = (224, 224),
 batch_size: int = 128,
 class_mode: str = "raw",
 shuffle: bool = False,
 augment: bool = False,
-scaler: str =
+scaler: str = "normalize", # 'normalize', 'standardize', 'clahe', 'raw'
 grayscale: bool = False,
 encoder: str = "label", # Options: 'label', 'onehot', 'binary'
 label_encoder=None,
@@ -3461,16 +3564,29 @@ def img_datasets_preprocessing(
 x_col in data.columns and y_col in data.columns
 ), "Missing required columns in DataFrame."
 if y_col is None:
-class_mode=None
+class_mode = None
 # 输出格式
-output = ips.strcmp(
-
-
-
+output = ips.strcmp(
+output,
+[
+"generator",
+"tf",
+"iterator",
+"transform",
+"transformer",
+"dataframe",
+"df",
+"pd",
+"pandas",
+],
+)[0]
+
 # Handle missing file paths
 if drop_missing:
 data = data[
-data[x_col].apply(
+data[x_col].apply(
+lambda path: os.path.exists(path) and os.path.isfile(path)
+)
 ]

 # Encoding labels if necessary
@@ -3502,11 +3618,11 @@ def img_datasets_preprocessing(
 aug_params.update(kws_augmentation)
 dat = ImageDataGenerator(rescale=scaler, **aug_params)
 dat = ImageDataGenerator(
-
+rescale=1.0 / 255 if scaler == "normalize" else None, **aug_params
+)

 else:
-dat = ImageDataGenerator(
-rescale=1.0 / 255 if scaler == 'normalize' else None)
+dat = ImageDataGenerator(rescale=1.0 / 255 if scaler == "normalize" else None)

 # Create DataFrameIterator
 data_iterator = dat.flow_from_dataframe(
@@ -3529,14 +3645,14 @@ def img_datasets_preprocessing(

 # Load, resize, and process images in batches
 for i, (batch_images, batch_labels) in enumerate(data_iterator):
-for img, label in zip(batch_images, batch_labels):
-if scaler == [
+for img, label in zip(batch_images, batch_labels):
+if scaler == ["normalize", "raw"]:
 # Already rescaled by 1.0/255 in ImageDataGenerator
 pass
-elif scaler ==
+elif scaler == "standardize":
 # Standardize by subtracting mean and dividing by std
 img = (img - np.mean(img)) / np.std(img)
-elif scaler ==
+elif scaler == "clahe":
 # Apply CLAHE to the image
 img = apply_clahe(img)
 flat_img = img.flatten()
@@ -3561,11 +3677,13 @@ def img_datasets_preprocessing(
 return df_img


-def backward_regression(
+def backward_regression(
+X: pd.DataFrame, y: pd.Series, initial_list=[], threshold_out=0.05, verbose=True
+):
 """
 # awesome bit of code from https://www.kaggle.com/code/adibouayjan/house-price-step-by-step-modeling
-
-Evaluates the p-values of all features, which represent the probability of observing a coefficient
+
+Evaluates the p-values of all features, which represent the probability of observing a coefficient
 as extreme as the one calculated if the feature had no true effect on the target.

 Args:
@@ -3576,9 +3694,10 @@ def backward_regression(X:pd.DataFrame, y:pd.Series, initial_list=[], threshold_
 verbose -- true to produce lots of logging output

 Returns:
-list of selected features for modeling
+list of selected features for modeling
 """
 import statsmodels.api as sm
+
 if isinstance(y, str) and y in X.columns:
 y_col_name = y
 y = X[y]
@@ -3600,15 +3719,16 @@ def backward_regression(X:pd.DataFrame, y:pd.Series, initial_list=[], threshold_
 break
 print(f"\nSelected Features:\n{included}")
 return included # Returns the list of selected features
-
+

 # Function to apply CLAHE (Contrast Limited Adaptive Histogram Equalization)
 def apply_clahe(img):
 import cv2
+
 lab = cv2.cvtColor(img, cv2.COLOR_RGB2LAB) # Convert to LAB color space
 l, a, b = cv2.split(lab) # Split into channels
 clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
 cl = clahe.apply(l) # Apply CLAHE to the L channel
 limg = cv2.merge((cl, a, b)) # Merge back the channels
 img_clahe = cv2.cvtColor(limg, cv2.COLOR_LAB2RGB) # Convert back to RGB
-return img_clahe
+return img_clahe
@@ -236,7 +236,7 @@ py2ls/freqanalysis.py,sha256=F4218VSPbgL5tnngh6xNCYuNnfR-F_QjECUUxrPYZss,32594
 py2ls/ich2ls.py,sha256=3E9R8oVpyYZXH5PiIQgT3CN5NxLe4Dwtm2LwaeacE6I,21381
 py2ls/ips.py,sha256=2TWuOSFquwhmPdxkmmvU_pcIbE5M0S9aRPtuQgs5B7A,297706
 py2ls/ml2ls copy.py,sha256=iZJrFLIrdfTieAY2BDsxQFTm29smwnJh0aC4hRB9VGM,113314
-py2ls/ml2ls.py,sha256=
+py2ls/ml2ls.py,sha256=wvQkhcOsBiysgfaRmeT2KAR5C8uFOaX3HeyDA2Oy1LI,146065
 py2ls/mol.py,sha256=AZnHzarIk_MjueKdChqn1V6e4tUle3X1NnHSFA6n3Nw,10645
 py2ls/netfinder.py,sha256=R70NkrnO8LlXjT1y7bf2TN-yE4yOeAYhb0jDBiNp8XA,57536
 py2ls/ocr.py,sha256=5lhUbJufIKRSOL6wAWVLEo8TqMYSjoI_Q-IO-_4u3DE,31419
@@ -246,6 +246,6 @@ py2ls/sleep_events_detectors.py,sha256=bQA3HJqv5qnYKJJEIhCyhlDtkXQfIzqksnD0YRXso
 py2ls/stats.py,sha256=qBn2rJmNa_QLLUqjwYqXUlGzqmW94sgA1bxJU2FC3r0,39175
 py2ls/translator.py,sha256=77Tp_GjmiiwFbEIJD_q3VYpQ43XL9ZeJo6Mhl44mvh8,34284
 py2ls/wb_detector.py,sha256=7y6TmBUj9exCZeIgBAJ_9hwuhkDh1x_-yg4dvNY1_GQ,6284
-py2ls-0.2.4.
-py2ls-0.2.4.
-py2ls-0.2.4.
+py2ls-0.2.4.20.dist-info/METADATA,sha256=iCOFX-A3J17xwkEB2UdDpS5A7kQuRcBCJVq5x8BtqPg,20078
+py2ls-0.2.4.20.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
+py2ls-0.2.4.20.dist-info/RECORD,,

File without changes