py2ls 0.2.4.15__py3-none-any.whl → 0.2.4.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py2ls/.git/index +0 -0
- py2ls/ips.py +724 -12
- py2ls/ml2ls copy.py +2906 -0
- py2ls/ml2ls.py +411 -16
- py2ls/plot.py +409 -24
- {py2ls-0.2.4.15.dist-info → py2ls-0.2.4.17.dist-info}/METADATA +2 -1
- {py2ls-0.2.4.15.dist-info → py2ls-0.2.4.17.dist-info}/RECORD +8 -7
- {py2ls-0.2.4.15.dist-info → py2ls-0.2.4.17.dist-info}/WHEEL +0 -0
py2ls/ml2ls.py
CHANGED
@@ -506,7 +506,7 @@ def get_models(
         "Support Vector Machine(svm)",
         "naive bayes",
         "Linear Discriminant Analysis (lda)",
-        "
+        "AdaBoost",
         "DecisionTree",
         "KNeighbors",
         "Bagging",
@@ -585,7 +585,7 @@ def get_features(
         "Support Vector Machine(svm)",
         "naive bayes",
         "Linear Discriminant Analysis (lda)",
-        "
+        "AdaBoost",
         "DecisionTree",
         "KNeighbors",
         "Bagging",
@@ -699,9 +699,11 @@ def get_features(
         "Support Vector Machine(svm)",
         "Naive Bayes",
         "Linear Discriminant Analysis (lda)",
-        "
+        "AdaBoost",
     ]
     cls = [ips.strcmp(i, cls_)[0] for i in cls]
+
+    feature_importances = {}
 
     # Lasso Feature Selection
     lasso_importances = (
@@ -712,6 +714,7 @@ def get_features(
     lasso_selected_features = (
         lasso_importances.head(n_features)["feature"].values if "lasso" in cls else []
     )
+    feature_importances['lasso']=lasso_importances.head(n_features)
     # Ridge
     ridge_importances = (
         features_ridge(x_train, y_train, ridge_params)
@@ -721,6 +724,7 @@ def get_features(
     selected_ridge_features = (
         ridge_importances.head(n_features)["feature"].values if "ridge" in cls else []
     )
+    feature_importances['ridge']=ridge_importances.head(n_features)
     # Elastic Net
     enet_importances = (
         features_enet(x_train, y_train, enet_params)
@@ -730,6 +734,7 @@ def get_features(
     selected_enet_features = (
         enet_importances.head(n_features)["feature"].values if "Enet" in cls else []
     )
+    feature_importances['Enet']=enet_importances.head(n_features)
     # Random Forest Feature Importance
     rf_importances = (
         features_rf(x_train, y_train, rf_params)
@@ -741,6 +746,7 @@ def get_features(
         if "Random Forest" in cls
         else []
     )
+    feature_importances['Random Forest']=rf_importances.head(n_features)
     # Gradient Boosting Feature Importance
     gb_importances = (
         features_gradient_boosting(x_train, y_train, gb_params)
@@ -752,6 +758,7 @@ def get_features(
         if "Gradient Boosting" in cls
         else []
     )
+    feature_importances['Gradient Boosting']=gb_importances.head(n_features)
     # xgb
     xgb_importances = (
         features_xgb(x_train, y_train, xgb_params) if "xgb" in cls else pd.DataFrame()
@@ -759,6 +766,7 @@ def get_features(
     top_xgb_features = (
         xgb_importances.head(n_features)["feature"].values if "xgb" in cls else []
     )
+    feature_importances['xgb']=xgb_importances.head(n_features)
 
     # SVM with RFE
     selected_svm_features = (
@@ -773,6 +781,7 @@ def get_features(
     selected_lda_features = (
         lda_importances.head(n_features)["feature"].values if "lda" in cls else []
     )
+    feature_importances['lda']=lda_importances.head(n_features)
     # AdaBoost Feature Importance
     adaboost_importances = (
         features_adaboost(x_train, y_train, adaboost_params)
@@ -784,6 +793,7 @@ def get_features(
         if "AdaBoost" in cls
         else []
     )
+    feature_importances['AdaBoost']=adaboost_importances.head(n_features)
     # Decision Tree Feature Importance
     dt_importances = (
         features_decision_tree(x_train, y_train, dt_params)
@@ -794,7 +804,8 @@ def get_features(
         dt_importances.head(n_features)["feature"].values
         if "Decision Tree" in cls
         else []
-    )
+    )
+    feature_importances['Decision Tree']=dt_importances.head(n_features)
     # Bagging Feature Importance
     bagging_importances = (
         features_bagging(x_train, y_train, bagging_params)
@@ -806,6 +817,7 @@ def get_features(
         if "Bagging" in cls
         else []
     )
+    feature_importances['Bagging']=bagging_importances.head(n_features)
     # KNN Feature Importance via Permutation
     knn_importances = (
         features_knn(x_train, y_train, knn_params) if "KNN" in cls else pd.DataFrame()
@@ -813,6 +825,7 @@ def get_features(
     top_knn_features = (
         knn_importances.head(n_features)["feature"].values if "KNN" in cls else []
     )
+    feature_importances['KNN']=knn_importances.head(n_features)
 
     #! Find common features
     common_features = ips.shared(
@@ -915,6 +928,7 @@ def get_features(
         "cv_train_scores": cv_train_results_df,
         "cv_test_scores": rank_models(cv_test_results_df, plot_=plot_),
         "common_features": list(common_features),
+        "feature_importances":feature_importances
     }
     if all([plot_, dir_save]):
         from datetime import datetime
@@ -927,6 +941,7 @@ def get_features(
         "cv_train_scores": pd.DataFrame(),
         "cv_test_scores": pd.DataFrame(),
         "common_features": [],
+        "feature_importances":{}
     }
     print(f"Warning: 没有找到共同的genes, when n_shared={n_shared}")
     return results
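Taken together, the get_features() hunks above collect each selector's top-n importance table into a new feature_importances dict and return it alongside the existing keys ("cv_train_scores", "cv_test_scores", "common_features"); the fallback branch returns an empty dict. A minimal sketch of reading that new entry follows; the dict is hand-built to mirror the structure visible in the diff, since get_features()' full call signature is not part of this diff:

import pandas as pd

# Structure mirrors the "feature_importances" entry added above; real values
# would come from a py2ls.ml2ls.get_features() call.
results = {
    "common_features": ["gene_a", "gene_b"],
    "feature_importances": {
        "lasso": pd.DataFrame({"feature": ["gene_a", "gene_b"], "importance": [0.91, 0.42]}),
        "Random Forest": pd.DataFrame({"feature": ["gene_b", "gene_a"], "importance": [0.30, 0.22]}),
    },
}

for model_name, imp in results["feature_importances"].items():
    # each value is the head(n_features) importance table kept for that selector
    print(model_name, "->", imp["feature"].tolist())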
@@ -2033,6 +2048,7 @@ def predict(
     y_train: pd.Series,
     x_true: pd.DataFrame = None,
     y_true: Optional[pd.Series] = None,
+    backward:bool=False, # backward_regression
     common_features: set = None,
     purpose: str = "classification", # 'classification' or 'regression'
     cls: Optional[Dict[str, Any]] = None,
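The new backward flag defaults to False; when enabled, predict() later calls the backward_regression() helper added at the end of this diff and keeps only the surviving columns (see the @@ -2227 hunk below). A hedged sketch of the call shape only, since the first positional parameter of predict() is not shown in this hunk:

# Sketch of the call shape only; x_train as the first positional argument is
# an assumption, while the keywords below all appear in this diff's hunks.
# from py2ls import ml2ls
# results = ml2ls.predict(
#     x_train, y_train,
#     x_true=x_test, y_true=y_test,
#     backward=True,               # drop high p-value features before fitting
#     purpose="classification",
# )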
@@ -2227,11 +2243,21 @@ def predict(
     # else:
     #     y_train = ips.df_encoder(pd.DataFrame(y_train), method="dummy").values.ravel()
     y_train=pd.DataFrame(y_train)
-
-
-
-
-
+    if y_train.select_dtypes(include=np.number).empty:
+        y_train_=ips.df_encoder(y_train, method="dummy",drop=None)
+        is_binary = False if y_train_.shape[1] >2 else True
+    else:
+        y_train_=ips.flatten(y_train.values)
+        is_binary = False if len(y_train_)>2 else True
+
+    if is_binary:
+        y_train = ips.df_encoder(pd.DataFrame(y_train), method="label")
+    print('is_binary:',is_binary)
+
+    # Perform backward feature selection
+    if backward:
+        selected_features = backward_regression(x_train, y_train, threshold_out=0.05)
+        x_train=x_train[selected_features]
 
     if x_true is None:
         x_train, x_true, y_train, y_true = train_test_split(
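The block above decides whether the task is binary: a non-numeric target is dummy-encoded and the resulting column count is compared against 2, while a numeric target is flattened and its length compared against 2 (the numeric branch counts samples rather than unique values, which may be unintended). A standalone approximation using plain pandas in place of the py2ls-internal ips.df_encoder / ips.flatten helpers, whose exact behaviour is assumed here; the helper name is illustrative:

import pandas as pd

def looks_binary(y) -> bool:
    # Rough equivalent of the check above: dummy-encode non-numeric targets
    # and count columns, otherwise count unique numeric values.
    y = pd.Series(y)
    if not pd.api.types.is_numeric_dtype(y):
        return pd.get_dummies(y).shape[1] <= 2
    return y.nunique() <= 2

print(looks_binary(pd.Series(["healthy", "disease", "healthy"])))  # True
print(looks_binary(pd.Series([0, 1, 2, 1])))                       # False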
@@ -2267,10 +2293,12 @@ def predict(
 
     # y_train=y_train.values.ravel() if y_train is not None else None
     # y_true=y_true.values.ravel() if y_true is not None else None
-    y_train
-    y_train
-
-
+    if y_train is not None:
+        y_train = (
+            y_train.ravel() if isinstance(y_train, np.ndarray) else y_train.values.ravel()
+        )
+    if y_true is not None:
+        y_true = y_true.ravel() if isinstance(y_true, np.ndarray) else y_true.values.ravel()
     # Ensure common features are selected
     if common_features is not None:
         x_train, x_true = x_train[common_features], x_true[common_features]
@@ -2893,7 +2921,11 @@ def predict(
         x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
         y_pred = best_clf.predict(x_true)
         if hasattr(best_clf, "predict_proba"):
-            y_pred_proba = best_clf.predict_proba(x_true)
+            y_pred_proba = best_clf.predict_proba(x_true)
+            print("Shape of predicted probabilities:", y_pred_proba.shape)
+            if y_pred_proba.shape[1] == 1:
+                y_pred_proba = np.hstack([1 - y_pred_proba, y_pred_proba])  # Add missing class probabilities
+            y_pred_proba = y_pred_proba[:, 1]
         elif hasattr(best_clf, "decision_function"):
             # If predict_proba is not available, use decision_function (e.g., for SVM)
             y_pred_proba = best_clf.decision_function(x_true)
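The added guard pads a one-column predict_proba output so that a positive-class column exists before [:, 1] is taken; a classifier fitted on a single class can return shape (n_samples, 1). The same idea in isolation, outside predict(), with a synthetic probability array:

import numpy as np

# A classifier trained on one class can return shape (n_samples, 1).
y_pred_proba = np.array([[0.8], [0.3], [0.6]])

if y_pred_proba.ndim == 2 and y_pred_proba.shape[1] == 1:
    # Pad with the complementary column so column 1 (positive class) exists.
    y_pred_proba = np.hstack([1 - y_pred_proba, y_pred_proba])

positive_class_proba = y_pred_proba[:, 1]
print(positive_class_proba)  # [0.8 0.3 0.6]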
@@ -3048,6 +3080,14 @@ def predict(
             }
 
         else:
+            validation_scores = cal_metrics(
+                y_true,
+                y_pred,
+                y_pred_proba=y_pred_proba,
+                is_binary=is_binary,
+                purpose=purpose,
+                average="weighted",
+            )
             results[name] = {
                 "best_clf": gs.best_estimator_,
                 "best_params": gs.best_params_,
@@ -3056,6 +3096,8 @@ def predict(
                 "predictions_proba": (
                     y_pred_proba.tolist() if y_pred_proba is not None else None
                 ),
+                "y_train":y_train if y_train is not None else [],
+                "y_true": y_true if y_true is not None else []
             }
 
     # Convert results to DataFrame
@@ -3078,7 +3120,7 @@ def predict(
         ips.figsave(dir_save + f"scores_sorted_heatmap{now_}.pdf")
 
     df_scores=df_scores.select_dtypes(include=np.number)
-
+
     if df_scores.shape[0] > 1: # draw cluster
         plot.heatmap(df_scores, kind="direct", cluster=True)
         plot.figsets(xangle=30)
@@ -3169,7 +3211,14 @@ def cal_metrics(
 
     # Confusion matrix to calculate specificity
     if is_binary:
-
+        cm = confusion_matrix(y_true, y_pred)
+        if cm.size == 4:
+            tn, fp, fn, tp = cm.ravel()
+        else:
+            # Handle single-class predictions
+            tn, fp, fn, tp = 0, 0, 0, 0
+            print("Warning: Only one class found in y_pred or y_true.")
+
         # Specificity calculation
         validation_scores["specificity"] = (
             tn / (tn + fp) if (tn + fp) > 0 else 0
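The new block derives specificity from a full 2x2 confusion matrix and falls back to zeros when only one class appears (cm.size != 4). The same calculation in isolation with scikit-learn:

import numpy as np
from sklearn.metrics import confusion_matrix

y_true = np.array([0, 0, 1, 1, 1, 0])
y_pred = np.array([0, 1, 1, 1, 0, 0])

cm = confusion_matrix(y_true, y_pred)
if cm.size == 4:
    tn, fp, fn, tp = cm.ravel()
else:
    # Only one class present in y_true/y_pred; a 2x2 matrix cannot be formed.
    tn = fp = fn = tp = 0

specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
print(f"specificity = {specificity:.2f}")  # 2 / (2 + 1) = 0.67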
@@ -3217,3 +3266,349 @@ def cal_metrics(
     )
 
     return validation_scores
+
+def plot_trees(
+    X, y, cls, max_trees=500, test_size=0.2, random_state=42, early_stopping_rounds=None
+):
+    """
+    # # Example usage:
+    # X = np.random.rand(100, 10)  # Example data with 100 samples and 10 features
+    # y = np.random.randint(0, 2, 100)  # Example binary target
+    # # Using the function with different classifiers
+    # # Random Forest example
+    # plot_trees(X, y, RandomForestClassifier(), max_trees=100)
+    # # Gradient Boosting with early stopping example
+    # plot_trees(X, y, GradientBoostingClassifier(), max_trees=100, early_stopping_rounds=10)
+    # # Extra Trees example
+    # plot_trees(X, y, ExtraTreesClassifier(), max_trees=100)
+    Master function to plot error rates (OOB, training, and testing) for different tree-based ensemble classifiers.
+
+    Parameters:
+    - X (array-like): Feature matrix.
+    - y (array-like): Target labels.
+    - cls (object): Tree-based ensemble classifier instance (e.g., RandomForestClassifier()).
+    - max_trees (int): Maximum number of trees to evaluate. Default is 500.
+    - test_size (float): Proportion of data to use as test set for testing error. Default is 0.2.
+    - random_state (int): Random state for reproducibility. Default is 42.
+    - early_stopping_rounds (int): For boosting models only, stops training if validation error doesn't improve after specified rounds.
+
+    Returns:
+    - None
+    """
+    from sklearn.model_selection import train_test_split
+    from sklearn.metrics import accuracy_score
+    from sklearn.ensemble import (
+        RandomForestClassifier,
+        BaggingClassifier,
+        ExtraTreesClassifier,
+    )
+    from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
+    # Split data for training and testing error calculation
+    x_train, x_test, y_train, y_test = train_test_split(
+        X, y, test_size=test_size, random_state=random_state
+    )
+
+    # Initialize lists to store error rates
+    oob_error_rate = []
+    train_error_rate = []
+    test_error_rate = []
+    validation_error = None
+
+    # Configure classifier based on type
+    oob_enabled = False  # Default to no OOB error unless explicitly set
+
+    if isinstance(cls, (RandomForestClassifier, ExtraTreesClassifier)):
+        # Enable OOB if cls supports it and is using bootstrapping
+        cls.set_params(warm_start=True, n_estimators=1)
+        if hasattr(cls, "oob_score"):
+            cls.set_params(bootstrap=True, oob_score=True)
+            oob_enabled = True
+    elif isinstance(cls, BaggingClassifier):
+        cls.set_params(warm_start=True, bootstrap=True, oob_score=True, n_estimators=1)
+        oob_enabled = True
+    elif isinstance(cls, (AdaBoostClassifier, GradientBoostingClassifier)):
+        cls.set_params(n_estimators=1)
+        oob_enabled = False
+        if early_stopping_rounds:
+            validation_error = []
+
+    # Train and evaluate with an increasing number of trees
+    for i in range(1, max_trees + 1):
+        cls.set_params(n_estimators=i)
+        cls.fit(x_train, y_train)
+
+        # Calculate OOB error (for models that support it)
+        if oob_enabled and hasattr(cls, "oob_score_") and cls.oob_score:
+            oob_error = 1 - cls.oob_score_
+            oob_error_rate.append(oob_error)
+
+        # Calculate training error
+        train_error = 1 - accuracy_score(y_train, cls.predict(x_train))
+        train_error_rate.append(train_error)
+
+        # Calculate testing error
+        test_error = 1 - accuracy_score(y_test, cls.predict(x_test))
+        test_error_rate.append(test_error)
+
+        # For boosting models, use validation error with early stopping
+        if early_stopping_rounds and isinstance(
+            cls, (AdaBoostClassifier, GradientBoostingClassifier)
+        ):
+            val_error = 1 - accuracy_score(y_test, cls.predict(x_test))
+            validation_error.append(val_error)
+            if len(validation_error) > early_stopping_rounds:
+                # Stop if validation error has not improved in early_stopping_rounds
+                if validation_error[-early_stopping_rounds:] == sorted(
+                    validation_error[-early_stopping_rounds:]
+                ):
+                    print(f"Early stopping at tree {i} due to lack of improvement in validation error.")
+                    break
+
+    # Plot results
+    plt.figure(figsize=(10, 6))
+    if oob_error_rate:
+        plt.plot(
+            range(1, len(oob_error_rate) + 1),
+            oob_error_rate,
+            color="black",
+            label="OOB Error Rate",
+            linewidth=2,
+        )
+    if train_error_rate:
+        plt.plot(
+            range(1, len(train_error_rate) + 1),
+            train_error_rate,
+            linestyle="dotted",
+            color="green",
+            label="Training Error Rate",
+        )
+    if test_error_rate:
+        plt.plot(
+            range(1, len(test_error_rate) + 1),
+            test_error_rate,
+            linestyle="dashed",
+            color="red",
+            label="Testing Error Rate",
+        )
+    if validation_error:
+        plt.plot(
+            range(1, len(validation_error) + 1),
+            validation_error,
+            linestyle="solid",
+            color="blue",
+            label="Validation Error (Boosting)",
+        )
+
+    # Customize plot
+    plt.xlabel("Number of Trees")
+    plt.ylabel("Error Rate")
+    plt.title(f"Error Rate Analysis for {cls.__class__.__name__}")
+    plt.legend(loc="upper right")
+    plt.grid(True)
+    plt.show()
+
+def img_datasets_preprocessing(
+    data: pd.DataFrame,
+    x_col: str,
+    y_col: str=None,
+    target_size: tuple = (224, 224),
+    batch_size: int = 128,
+    class_mode: str = "raw",
+    shuffle: bool = False,
+    augment: bool = False,
+    scaler: str = 'normalize', # 'normalize', 'standardize', 'clahe', 'raw'
+    grayscale: bool = False,
+    encoder: str = "label", # Options: 'label', 'onehot', 'binary'
+    label_encoder=None,
+    kws_augmentation: dict = None,
+    verbose: bool = True,
+    drop_missing: bool = True,
+    output="df", # "iterator":data_iterator,'df':return DataFrame
+):
+    """
+    Enhanced preprocessing function for loading and preparing image data from a DataFrame.
+
+    Parameters:
+    - df (pd.DataFrame): Input DataFrame with image paths and labels.
+    - x_col (str): Column in `df` containing image file paths.
+    - y_col (str): Column in `df` containing image labels.
+    - target_size (tuple): Desired image size in (height, width).
+    - batch_size (int): Number of images per batch.
+    - class_mode (str): Mode of label ('raw', 'categorical', 'binary').
+    - shuffle (bool): Shuffle the images in the DataFrame.
+    - augment (bool): Apply data augmentation.
+    - scaler (str): 'normalize', # 'normalize', 'standardize', 'clahe', 'raw'
+    - grayscale (bool): Convert images to grayscale.
+    - normalize (bool): Normalize image data to [0, 1] range.
+    - encoder (str): Label encoder method ('label', 'onehot', 'binary').
+    - label_encoder: Optional pre-defined label encoder.
+    - kws_augmentation (dict): Parameters for data augmentation.
+    - verbose (bool): Print status messages.
+    - drop_missing (bool): Drop rows with missing or invalid image paths.
+
+    Returns:
+    - pd.DataFrame: DataFrame with flattened image pixels and 'Label' column.
+    """
+    from tensorflow.keras.preprocessing.image import ImageDataGenerator
+    from tensorflow.keras.utils import to_categorical
+    from sklearn.preprocessing import LabelEncoder
+    from PIL import Image
+    import os
+
+    # Validate input DataFrame for required columns
+    if y_col:
+        assert (
+            x_col in data.columns and y_col in data.columns
+        ), "Missing required columns in DataFrame."
+    if y_col is None:
+        class_mode=None
+    # 输出格式
+    output = ips.strcmp(output,[
+        "generator","tf","iterator","transform","transformer","dataframe",
+        "df","pd","pandas"])[0]
+
+    # Handle missing file paths
+    if drop_missing:
+        data = data[
+            data[x_col].apply(lambda path: os.path.exists(path) and os.path.isfile(path))
+        ]
+
+    # Encoding labels if necessary
+    if encoder and y_col is not None:
+        if encoder == "binary":
+            data[y_col] = (data[y_col] == data[y_col].unique()[0]).astype(int)
+        elif encoder == "onehot":
+            if not label_encoder:
+                label_encoder = LabelEncoder()
+            data[y_col] = label_encoder.fit_transform(data[y_col])
+            data[y_col] = to_categorical(data[y_col])
+        elif encoder == "label":
+            if not label_encoder:
+                label_encoder = LabelEncoder()
+            data[y_col] = label_encoder.fit_transform(data[y_col])
+
+    # Set up data augmentation
+    if augment:
+        aug_params = {
+            "rotation_range": 20,
+            "width_shift_range": 0.2,
+            "height_shift_range": 0.2,
+            "shear_range": 0.2,
+            "zoom_range": 0.2,
+            "horizontal_flip": True,
+            "fill_mode": "nearest",
+        }
+        if kws_augmentation:
+            aug_params.update(kws_augmentation)
+        dat = ImageDataGenerator(rescale=scaler, **aug_params)
+        dat = ImageDataGenerator(
+            rescale=1.0 / 255 if scaler == 'normalize' else None, **aug_params)
+
+    else:
+        dat = ImageDataGenerator(
+            rescale=1.0 / 255 if scaler == 'normalize' else None)
+
+    # Create DataFrameIterator
+    data_iterator = dat.flow_from_dataframe(
+        dataframe=data,
+        x_col=x_col,
+        y_col=y_col,
+        target_size=target_size,
+        color_mode="grayscale" if grayscale else "rgb",
+        batch_size=batch_size,
+        class_mode=class_mode,
+        shuffle=shuffle,
+    )
+    print(f"target_size:{target_size}")
+    if output.lower() in ["generator", "tf", "iterator", "transform", "transformer"]:
+        return data_iterator
+    elif output.lower() in ["dataframe", "df", "pd", "pandas"]:
+        # Initialize list to collect processed data
+        data_list = []
+        total_batches = data_iterator.n // batch_size
+
+        # Load, resize, and process images in batches
+        for i, (batch_images, batch_labels) in enumerate(data_iterator):
+            for img, label in zip(batch_images, batch_labels):
+                if scaler == ['normalize','raw']:
+                    # Already rescaled by 1.0/255 in ImageDataGenerator
+                    pass
+                elif scaler == 'standardize':
+                    # Standardize by subtracting mean and dividing by std
+                    img = (img - np.mean(img)) / np.std(img)
+                elif scaler == 'clahe':
+                    # Apply CLAHE to the image
+                    img = apply_clahe(img)
+                flat_img = img.flatten()
+                data_list.append(np.append(flat_img, label))
+
+            # Stop when all images have been processed
+            if i >= total_batches:
+                break
+
+        # Define column names for flattened image data
+        pixel_count = target_size[0] * target_size[1] * (1 if grayscale else 3)
+        column_names = [f"pixel_{i}" for i in range(pixel_count)] + ["Label"]
+
+        # Create DataFrame from flattened data
+        df_img = pd.DataFrame(data_list, columns=column_names)
+
+        if verbose:
+            print("Processed images:", len(df_img))
+            print("Final DataFrame shape:", df_img.shape)
+            display(df_img.head())
+
+        return df_img
+
+
+def backward_regression(X:pd.DataFrame, y:pd.Series, initial_list=[], threshold_out=0.05, verbose=True):
+    """
+    # awesome bit of code from https://www.kaggle.com/code/adibouayjan/house-price-step-by-step-modeling
+
+    Evaluates the p-values of all features, which represent the probability of observing a coefficient
+    as extreme as the one calculated if the feature had no true effect on the target.
+
+    Args:
+    X -- features values
+    y -- target variable
+    initial_list -- features header
+    threshold_out -- pvalue threshold of features to drop
+    verbose -- true to produce lots of logging output
+
+    Returns:
+    list of selected features for modeling
+    """
+    import statsmodels.api as sm
+    if isinstance(y, str) and y in X.columns:
+        y_col_name = y
+        y = X[y]
+        X = X.drop(y_col_name, axis=1)
+    included = list(X.columns)
+    while True:
+        changed = False
+        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
+        # exclude the intercept for p-value checking
+        pvalues = model.pvalues.iloc[1:]
+        worst_pval = pvalues.max()
+        if worst_pval > threshold_out:
+            changed = True
+            worst_feature = pvalues.idxmax()
+            included.remove(worst_feature)
+            if verbose:
+                print(f"Removing feature '{worst_feature}' with p-value {worst_pval}")
+        if not changed:
+            break
+    print(f"\nSelected Features:\n{included}")
+    return included  # Returns the list of selected features
+
+
+# Function to apply CLAHE (Contrast Limited Adaptive Histogram Equalization)
+def apply_clahe(img):
+    import cv2
+    lab = cv2.cvtColor(img, cv2.COLOR_RGB2LAB)  # Convert to LAB color space
+    l, a, b = cv2.split(lab)  # Split into channels
+    clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
+    cl = clahe.apply(l)  # Apply CLAHE to the L channel
+    limg = cv2.merge((cl, a, b))  # Merge back the channels
+    img_clahe = cv2.cvtColor(limg, cv2.COLOR_LAB2RGB)  # Convert back to RGB
+    return img_clahe