py2ls 0.2.4.15__py3-none-any.whl → 0.2.4.17__py3-none-any.whl
- py2ls/.git/index +0 -0
- py2ls/ips.py +724 -12
- py2ls/ml2ls copy.py +2906 -0
- py2ls/ml2ls.py +411 -16
- py2ls/plot.py +409 -24
- {py2ls-0.2.4.15.dist-info → py2ls-0.2.4.17.dist-info}/METADATA +2 -1
- {py2ls-0.2.4.15.dist-info → py2ls-0.2.4.17.dist-info}/RECORD +8 -7
- {py2ls-0.2.4.15.dist-info → py2ls-0.2.4.17.dist-info}/WHEEL +0 -0
py2ls/ml2ls.py
CHANGED
@@ -506,7 +506,7 @@ def get_models(
         "Support Vector Machine(svm)",
         "naive bayes",
         "Linear Discriminant Analysis (lda)",
-        "
+        "AdaBoost",
         "DecisionTree",
         "KNeighbors",
         "Bagging",
@@ -585,7 +585,7 @@ def get_features(
         "Support Vector Machine(svm)",
         "naive bayes",
         "Linear Discriminant Analysis (lda)",
-        "
+        "AdaBoost",
         "DecisionTree",
         "KNeighbors",
         "Bagging",
@@ -699,9 +699,11 @@ def get_features(
         "Support Vector Machine(svm)",
         "Naive Bayes",
         "Linear Discriminant Analysis (lda)",
-        "
+        "AdaBoost",
     ]
     cls = [ips.strcmp(i, cls_)[0] for i in cls]
+
+    feature_importances = {}
 
     # Lasso Feature Selection
     lasso_importances = (
@@ -712,6 +714,7 @@ def get_features(
     lasso_selected_features = (
         lasso_importances.head(n_features)["feature"].values if "lasso" in cls else []
     )
+    feature_importances['lasso']=lasso_importances.head(n_features)
     # Ridge
     ridge_importances = (
         features_ridge(x_train, y_train, ridge_params)
@@ -721,6 +724,7 @@ def get_features(
     selected_ridge_features = (
         ridge_importances.head(n_features)["feature"].values if "ridge" in cls else []
     )
+    feature_importances['ridge']=ridge_importances.head(n_features)
     # Elastic Net
     enet_importances = (
         features_enet(x_train, y_train, enet_params)
@@ -730,6 +734,7 @@ def get_features(
     selected_enet_features = (
         enet_importances.head(n_features)["feature"].values if "Enet" in cls else []
     )
+    feature_importances['Enet']=enet_importances.head(n_features)
     # Random Forest Feature Importance
     rf_importances = (
         features_rf(x_train, y_train, rf_params)
@@ -741,6 +746,7 @@ def get_features(
         if "Random Forest" in cls
         else []
     )
+    feature_importances['Random Forest']=rf_importances.head(n_features)
     # Gradient Boosting Feature Importance
     gb_importances = (
         features_gradient_boosting(x_train, y_train, gb_params)
@@ -752,6 +758,7 @@ def get_features(
         if "Gradient Boosting" in cls
         else []
     )
+    feature_importances['Gradient Boosting']=gb_importances.head(n_features)
     # xgb
     xgb_importances = (
         features_xgb(x_train, y_train, xgb_params) if "xgb" in cls else pd.DataFrame()
@@ -759,6 +766,7 @@ def get_features(
     top_xgb_features = (
         xgb_importances.head(n_features)["feature"].values if "xgb" in cls else []
     )
+    feature_importances['xgb']=xgb_importances.head(n_features)
 
     # SVM with RFE
     selected_svm_features = (
@@ -773,6 +781,7 @@ def get_features(
     selected_lda_features = (
         lda_importances.head(n_features)["feature"].values if "lda" in cls else []
     )
+    feature_importances['lda']=lda_importances.head(n_features)
     # AdaBoost Feature Importance
     adaboost_importances = (
         features_adaboost(x_train, y_train, adaboost_params)
@@ -784,6 +793,7 @@ def get_features(
         if "AdaBoost" in cls
         else []
     )
+    feature_importances['AdaBoost']=adaboost_importances.head(n_features)
     # Decision Tree Feature Importance
     dt_importances = (
         features_decision_tree(x_train, y_train, dt_params)
@@ -794,7 +804,8 @@ def get_features(
         dt_importances.head(n_features)["feature"].values
         if "Decision Tree" in cls
         else []
-        )
+    )
+    feature_importances['Decision Tree']=dt_importances.head(n_features)
     # Bagging Feature Importance
     bagging_importances = (
         features_bagging(x_train, y_train, bagging_params)
@@ -806,6 +817,7 @@ def get_features(
         if "Bagging" in cls
         else []
     )
+    feature_importances['Bagging']=bagging_importances.head(n_features)
     # KNN Feature Importance via Permutation
     knn_importances = (
         features_knn(x_train, y_train, knn_params) if "KNN" in cls else pd.DataFrame()
@@ -813,6 +825,7 @@ def get_features(
     top_knn_features = (
         knn_importances.head(n_features)["feature"].values if "KNN" in cls else []
     )
+    feature_importances['KNN']=knn_importances.head(n_features)
 
     #! Find common features
     common_features = ips.shared(
@@ -915,6 +928,7 @@ def get_features(
         "cv_train_scores": cv_train_results_df,
         "cv_test_scores": rank_models(cv_test_results_df, plot_=plot_),
         "common_features": list(common_features),
+        "feature_importances":feature_importances
     }
     if all([plot_, dir_save]):
         from datetime import datetime
@@ -927,6 +941,7 @@ def get_features(
         "cv_train_scores": pd.DataFrame(),
         "cv_test_scores": pd.DataFrame(),
         "common_features": [],
+        "feature_importances":{}
     }
     print(f"Warning: 没有找到共同的genes, when n_shared={n_shared}")
     return results
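
Note on the get_features() changes above: the per-model importance tables collected in feature_importances are now returned alongside the existing result keys. A minimal sketch of how a caller might read them — the call shown is illustrative and assumes default arguments, and the "feature" column name is taken from the diff rather than documented API:

    res = get_features(x_train, y_train)               # illustrative call; the real signature has many more options
    fi = res["feature_importances"]                     # dict: model name -> head(n_features) importance DataFrame
    for model_name, df_imp in fi.items():
        print(model_name, list(df_imp["feature"])[:5])  # top-ranked features per model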
@@ -2033,6 +2048,7 @@ def predict(
     y_train: pd.Series,
     x_true: pd.DataFrame = None,
     y_true: Optional[pd.Series] = None,
+    backward:bool=False, # backward_regression
     common_features: set = None,
     purpose: str = "classification",  # 'classification' or 'regression'
     cls: Optional[Dict[str, Any]] = None,
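
The new backward flag (default False) makes predict() run backward_regression() on x_train before fitting, dropping features whose OLS p-values exceed threshold_out=0.05. A hedged usage sketch; the other keyword arguments shown are illustrative, not the full signature:

    results = predict(
        x_train, y_train,
        x_true=x_test, y_true=y_test,
        backward=True,                    # new in 0.2.4.17: prune high p-value features first
        purpose="classification",
    )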
@@ -2227,11 +2243,21 @@ def predict(
     # else:
     #     y_train = ips.df_encoder(pd.DataFrame(y_train), method="dummy").values.ravel()
     y_train=pd.DataFrame(y_train)
-
-
-
-
-
+    if y_train.select_dtypes(include=np.number).empty:
+        y_train_=ips.df_encoder(y_train, method="dummy",drop=None)
+        is_binary = False if y_train_.shape[1] >2 else True
+    else:
+        y_train_=ips.flatten(y_train.values)
+        is_binary = False if len(y_train_)>2 else True
+
+    if is_binary:
+        y_train = ips.df_encoder(pd.DataFrame(y_train), method="label")
+    print('is_binary:',is_binary)
+
+    # Perform backward feature selection
+    if backward:
+        selected_features = backward_regression(x_train, y_train, threshold_out=0.05)
+        x_train=x_train[selected_features]
 
     if x_true is None:
         x_train, x_true, y_train, y_true = train_test_split(
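
The binary/multiclass check above keys off whether y_train contains numeric columns: non-numeric labels are dummy-encoded and the number of dummy columns inspected, otherwise the raw values are flattened. A tiny standalone pandas sketch of the select_dtypes test only (it does not reproduce the full branch logic):

    import numpy as np
    import pandas as pd
    labels = pd.DataFrame({"y": ["ctrl", "case", "case"]})
    print(labels.select_dtypes(include=np.number).empty)   # True -> labels are strings, so dummy-encode before counting classes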
@@ -2267,10 +2293,12 @@ def predict(
 
     # y_train=y_train.values.ravel() if y_train is not None else None
     # y_true=y_true.values.ravel() if y_true is not None else None
-    y_train
-    y_train
-
-
+    if y_train is not None:
+        y_train = (
+            y_train.ravel() if isinstance(y_train, np.ndarray) else y_train.values.ravel()
+        )
+    if y_true is not None:
+        y_true = y_true.ravel() if isinstance(y_true, np.ndarray) else y_true.values.ravel()
     # Ensure common features are selected
     if common_features is not None:
         x_train, x_true = x_train[common_features], x_true[common_features]
@@ -2893,7 +2921,11 @@ def predict(
         x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
         y_pred = best_clf.predict(x_true)
         if hasattr(best_clf, "predict_proba"):
-            y_pred_proba = best_clf.predict_proba(x_true)
+            y_pred_proba = best_clf.predict_proba(x_true)
+            print("Shape of predicted probabilities:", y_pred_proba.shape)
+            if y_pred_proba.shape[1] == 1:
+                y_pred_proba = np.hstack([1 - y_pred_proba, y_pred_proba])  # Add missing class probabilities
+            y_pred_proba = y_pred_proba[:, 1]
         elif hasattr(best_clf, "decision_function"):
             # If predict_proba is not available, use decision_function (e.g., for SVM)
             y_pred_proba = best_clf.decision_function(x_true)
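
The padding added above guards against estimators whose predict_proba returns a single column (e.g. when only one class was seen in training); np.hstack rebuilds a two-column layout so y_pred_proba[:, 1] always indexes the second column. A self-contained numpy illustration of just that array manipulation:

    import numpy as np
    p = np.array([[0.9], [0.2]])     # single-column predict_proba output
    p = np.hstack([1 - p, p])        # -> [[0.1, 0.9], [0.8, 0.2]]
    positive = p[:, 1]               # [0.9, 0.2]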
@@ -3048,6 +3080,14 @@ def predict(
         }
 
     else:
+        validation_scores = cal_metrics(
+            y_true,
+            y_pred,
+            y_pred_proba=y_pred_proba,
+            is_binary=is_binary,
+            purpose=purpose,
+            average="weighted",
+        )
         results[name] = {
             "best_clf": gs.best_estimator_,
             "best_params": gs.best_params_,
@@ -3056,6 +3096,8 @@ def predict(
             "predictions_proba": (
                 y_pred_proba.tolist() if y_pred_proba is not None else None
             ),
+            "y_train":y_train if y_train is not None else [],
+            "y_true": y_true if y_true is not None else []
         }
 
     # Convert results to DataFrame
@@ -3078,7 +3120,7 @@ def predict(
         ips.figsave(dir_save + f"scores_sorted_heatmap{now_}.pdf")
 
         df_scores=df_scores.select_dtypes(include=np.number)
-
+
         if df_scores.shape[0] > 1:  # draw cluster
             plot.heatmap(df_scores, kind="direct", cluster=True)
             plot.figsets(xangle=30)
@@ -3169,7 +3211,14 @@ def cal_metrics(
 
     # Confusion matrix to calculate specificity
     if is_binary:
-
+        cm = confusion_matrix(y_true, y_pred)
+        if cm.size == 4:
+            tn, fp, fn, tp = cm.ravel()
+        else:
+            # Handle single-class predictions
+            tn, fp, fn, tp = 0, 0, 0, 0
+            print("Warning: Only one class found in y_pred or y_true.")
+
         # Specificity calculation
         validation_scores["specificity"] = (
             tn / (tn + fp) if (tn + fp) > 0 else 0
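
Context for the new guard in cal_metrics(): sklearn's confusion_matrix collapses to a 1x1 array when y_true and y_pred contain only a single class, so unpacking four values would raise; the added branch zeroes tn/fp/fn/tp and warns instead. A quick standalone check of that edge case:

    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix([1, 1, 1], [1, 1, 1])
    print(cm.shape, cm.size)          # (1, 1) 1 -> cannot unpack tn, fp, fn, tp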
@@ -3217,3 +3266,349 @@ def cal_metrics(
     )
 
     return validation_scores
+
+def plot_trees(
+    X, y, cls, max_trees=500, test_size=0.2, random_state=42, early_stopping_rounds=None
+):
+    """
+    # # Example usage:
+    # X = np.random.rand(100, 10)  # Example data with 100 samples and 10 features
+    # y = np.random.randint(0, 2, 100)  # Example binary target
+    # # Using the function with different classifiers
+    # # Random Forest example
+    # plot_trees(X, y, RandomForestClassifier(), max_trees=100)
+    # # Gradient Boosting with early stopping example
+    # plot_trees(X, y, GradientBoostingClassifier(), max_trees=100, early_stopping_rounds=10)
+    # # Extra Trees example
+    # plot_trees(X, y, ExtraTreesClassifier(), max_trees=100)
+    Master function to plot error rates (OOB, training, and testing) for different tree-based ensemble classifiers.
+
+    Parameters:
+    - X (array-like): Feature matrix.
+    - y (array-like): Target labels.
+    - cls (object): Tree-based ensemble classifier instance (e.g., RandomForestClassifier()).
+    - max_trees (int): Maximum number of trees to evaluate. Default is 500.
+    - test_size (float): Proportion of data to use as test set for testing error. Default is 0.2.
+    - random_state (int): Random state for reproducibility. Default is 42.
+    - early_stopping_rounds (int): For boosting models only, stops training if validation error doesn't improve after specified rounds.
+
+    Returns:
+    - None
+    """
+    from sklearn.model_selection import train_test_split
+    from sklearn.metrics import accuracy_score
+    from sklearn.ensemble import (
+        RandomForestClassifier,
+        BaggingClassifier,
+        ExtraTreesClassifier,
+    )
+    from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
+    # Split data for training and testing error calculation
+    x_train, x_test, y_train, y_test = train_test_split(
+        X, y, test_size=test_size, random_state=random_state
+    )
+
+    # Initialize lists to store error rates
+    oob_error_rate = []
+    train_error_rate = []
+    test_error_rate = []
+    validation_error = None
+
+    # Configure classifier based on type
+    oob_enabled = False  # Default to no OOB error unless explicitly set
+
+    if isinstance(cls, (RandomForestClassifier, ExtraTreesClassifier)):
+        # Enable OOB if cls supports it and is using bootstrapping
+        cls.set_params(warm_start=True, n_estimators=1)
+        if hasattr(cls, "oob_score"):
+            cls.set_params(bootstrap=True, oob_score=True)
+            oob_enabled = True
+    elif isinstance(cls, BaggingClassifier):
+        cls.set_params(warm_start=True, bootstrap=True, oob_score=True, n_estimators=1)
+        oob_enabled = True
+    elif isinstance(cls, (AdaBoostClassifier, GradientBoostingClassifier)):
+        cls.set_params(n_estimators=1)
+        oob_enabled = False
+        if early_stopping_rounds:
+            validation_error = []
+
+    # Train and evaluate with an increasing number of trees
+    for i in range(1, max_trees + 1):
+        cls.set_params(n_estimators=i)
+        cls.fit(x_train, y_train)
+
+        # Calculate OOB error (for models that support it)
+        if oob_enabled and hasattr(cls, "oob_score_") and cls.oob_score:
+            oob_error = 1 - cls.oob_score_
+            oob_error_rate.append(oob_error)
+
+        # Calculate training error
+        train_error = 1 - accuracy_score(y_train, cls.predict(x_train))
+        train_error_rate.append(train_error)
+
+        # Calculate testing error
+        test_error = 1 - accuracy_score(y_test, cls.predict(x_test))
+        test_error_rate.append(test_error)
+
+        # For boosting models, use validation error with early stopping
+        if early_stopping_rounds and isinstance(
+            cls, (AdaBoostClassifier, GradientBoostingClassifier)
+        ):
+            val_error = 1 - accuracy_score(y_test, cls.predict(x_test))
+            validation_error.append(val_error)
+            if len(validation_error) > early_stopping_rounds:
+                # Stop if validation error has not improved in early_stopping_rounds
+                if validation_error[-early_stopping_rounds:] == sorted(
+                    validation_error[-early_stopping_rounds:]
+                ):
+                    print(f"Early stopping at tree {i} due to lack of improvement in validation error.")
+                    break
+
+    # Plot results
+    plt.figure(figsize=(10, 6))
+    if oob_error_rate:
+        plt.plot(
+            range(1, len(oob_error_rate) + 1),
+            oob_error_rate,
+            color="black",
+            label="OOB Error Rate",
+            linewidth=2,
+        )
+    if train_error_rate:
+        plt.plot(
+            range(1, len(train_error_rate) + 1),
+            train_error_rate,
+            linestyle="dotted",
+            color="green",
+            label="Training Error Rate",
+        )
+    if test_error_rate:
+        plt.plot(
+            range(1, len(test_error_rate) + 1),
+            test_error_rate,
+            linestyle="dashed",
+            color="red",
+            label="Testing Error Rate",
+        )
+    if validation_error:
+        plt.plot(
+            range(1, len(validation_error) + 1),
+            validation_error,
+            linestyle="solid",
+            color="blue",
+            label="Validation Error (Boosting)",
+        )
+
+    # Customize plot
+    plt.xlabel("Number of Trees")
+    plt.ylabel("Error Rate")
+    plt.title(f"Error Rate Analysis for {cls.__class__.__name__}")
+    plt.legend(loc="upper right")
+    plt.grid(True)
+    plt.show()
+
+def img_datasets_preprocessing(
+    data: pd.DataFrame,
+    x_col: str,
+    y_col: str=None,
+    target_size: tuple = (224, 224),
+    batch_size: int = 128,
+    class_mode: str = "raw",
+    shuffle: bool = False,
+    augment: bool = False,
+    scaler: str = 'normalize',  # 'normalize', 'standardize', 'clahe', 'raw'
+    grayscale: bool = False,
+    encoder: str = "label",  # Options: 'label', 'onehot', 'binary'
+    label_encoder=None,
+    kws_augmentation: dict = None,
+    verbose: bool = True,
+    drop_missing: bool = True,
+    output="df",  # "iterator":data_iterator,'df':return DataFrame
+):
+    """
+    Enhanced preprocessing function for loading and preparing image data from a DataFrame.
+
+    Parameters:
+    - df (pd.DataFrame): Input DataFrame with image paths and labels.
+    - x_col (str): Column in `df` containing image file paths.
+    - y_col (str): Column in `df` containing image labels.
+    - target_size (tuple): Desired image size in (height, width).
+    - batch_size (int): Number of images per batch.
+    - class_mode (str): Mode of label ('raw', 'categorical', 'binary').
+    - shuffle (bool): Shuffle the images in the DataFrame.
+    - augment (bool): Apply data augmentation.
+    - scaler (str): 'normalize', # 'normalize', 'standardize', 'clahe', 'raw'
+    - grayscale (bool): Convert images to grayscale.
+    - normalize (bool): Normalize image data to [0, 1] range.
+    - encoder (str): Label encoder method ('label', 'onehot', 'binary').
+    - label_encoder: Optional pre-defined label encoder.
+    - kws_augmentation (dict): Parameters for data augmentation.
+    - verbose (bool): Print status messages.
+    - drop_missing (bool): Drop rows with missing or invalid image paths.
+
+    Returns:
+    - pd.DataFrame: DataFrame with flattened image pixels and 'Label' column.
+    """
+    from tensorflow.keras.preprocessing.image import ImageDataGenerator
+    from tensorflow.keras.utils import to_categorical
+    from sklearn.preprocessing import LabelEncoder
+    from PIL import Image
+    import os
+
+    # Validate input DataFrame for required columns
+    if y_col:
+        assert (
+            x_col in data.columns and y_col in data.columns
+        ), "Missing required columns in DataFrame."
+    if y_col is None:
+        class_mode=None
+    # 输出格式
+    output = ips.strcmp(output,[
+        "generator","tf","iterator","transform","transformer","dataframe",
+        "df","pd","pandas"])[0]
+
+    # Handle missing file paths
+    if drop_missing:
+        data = data[
+            data[x_col].apply(lambda path: os.path.exists(path) and os.path.isfile(path))
+        ]
+
+    # Encoding labels if necessary
+    if encoder and y_col is not None:
+        if encoder == "binary":
+            data[y_col] = (data[y_col] == data[y_col].unique()[0]).astype(int)
+        elif encoder == "onehot":
+            if not label_encoder:
+                label_encoder = LabelEncoder()
+            data[y_col] = label_encoder.fit_transform(data[y_col])
+            data[y_col] = to_categorical(data[y_col])
+        elif encoder == "label":
+            if not label_encoder:
+                label_encoder = LabelEncoder()
+            data[y_col] = label_encoder.fit_transform(data[y_col])
+
+    # Set up data augmentation
+    if augment:
+        aug_params = {
+            "rotation_range": 20,
+            "width_shift_range": 0.2,
+            "height_shift_range": 0.2,
+            "shear_range": 0.2,
+            "zoom_range": 0.2,
+            "horizontal_flip": True,
+            "fill_mode": "nearest",
+        }
+        if kws_augmentation:
+            aug_params.update(kws_augmentation)
+        dat = ImageDataGenerator(rescale=scaler, **aug_params)
+        dat = ImageDataGenerator(
+            rescale=1.0 / 255 if scaler == 'normalize' else None, **aug_params)
+
+    else:
+        dat = ImageDataGenerator(
+            rescale=1.0 / 255 if scaler == 'normalize' else None)
+
+    # Create DataFrameIterator
+    data_iterator = dat.flow_from_dataframe(
+        dataframe=data,
+        x_col=x_col,
+        y_col=y_col,
+        target_size=target_size,
+        color_mode="grayscale" if grayscale else "rgb",
+        batch_size=batch_size,
+        class_mode=class_mode,
+        shuffle=shuffle,
+    )
+    print(f"target_size:{target_size}")
+    if output.lower() in ["generator", "tf", "iterator", "transform", "transformer"]:
+        return data_iterator
+    elif output.lower() in ["dataframe", "df", "pd", "pandas"]:
+        # Initialize list to collect processed data
+        data_list = []
+        total_batches = data_iterator.n // batch_size
+
+        # Load, resize, and process images in batches
+        for i, (batch_images, batch_labels) in enumerate(data_iterator):
+            for img, label in zip(batch_images, batch_labels):
+                if scaler == ['normalize','raw']:
+                    # Already rescaled by 1.0/255 in ImageDataGenerator
+                    pass
+                elif scaler == 'standardize':
+                    # Standardize by subtracting mean and dividing by std
+                    img = (img - np.mean(img)) / np.std(img)
+                elif scaler == 'clahe':
+                    # Apply CLAHE to the image
+                    img = apply_clahe(img)
+                flat_img = img.flatten()
+                data_list.append(np.append(flat_img, label))
+
+            # Stop when all images have been processed
+            if i >= total_batches:
+                break
+
+        # Define column names for flattened image data
+        pixel_count = target_size[0] * target_size[1] * (1 if grayscale else 3)
+        column_names = [f"pixel_{i}" for i in range(pixel_count)] + ["Label"]
+
+        # Create DataFrame from flattened data
+        df_img = pd.DataFrame(data_list, columns=column_names)
+
+        if verbose:
+            print("Processed images:", len(df_img))
+            print("Final DataFrame shape:", df_img.shape)
+            display(df_img.head())
+
+        return df_img
+
+
+def backward_regression(X:pd.DataFrame, y:pd.Series, initial_list=[], threshold_out=0.05, verbose=True):
+    """
+    # awesome bit of code from https://www.kaggle.com/code/adibouayjan/house-price-step-by-step-modeling
+
+    Evaluates the p-values of all features, which represent the probability of observing a coefficient
+    as extreme as the one calculated if the feature had no true effect on the target.
+
+    Args:
+    X -- features values
+    y -- target variable
+    initial_list -- features header
+    threshold_out -- pvalue threshold of features to drop
+    verbose -- true to produce lots of logging output
+
+    Returns:
+    list of selected features for modeling
+    """
+    import statsmodels.api as sm
+    if isinstance(y, str) and y in X.columns:
+        y_col_name = y
+        y = X[y]
+        X = X.drop(y_col_name, axis=1)
+    included = list(X.columns)
+    while True:
+        changed = False
+        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
+        # exclude the intercept for p-value checking
+        pvalues = model.pvalues.iloc[1:]
+        worst_pval = pvalues.max()
+        if worst_pval > threshold_out:
+            changed = True
+            worst_feature = pvalues.idxmax()
+            included.remove(worst_feature)
+            if verbose:
+                print(f"Removing feature '{worst_feature}' with p-value {worst_pval}")
+        if not changed:
+            break
+    print(f"\nSelected Features:\n{included}")
+    return included  # Returns the list of selected features
+
+
+# Function to apply CLAHE (Contrast Limited Adaptive Histogram Equalization)
+def apply_clahe(img):
+    import cv2
+    lab = cv2.cvtColor(img, cv2.COLOR_RGB2LAB)  # Convert to LAB color space
+    l, a, b = cv2.split(lab)  # Split into channels
+    clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
+    cl = clahe.apply(l)  # Apply CLAHE to the L channel
+    limg = cv2.merge((cl, a, b))  # Merge back the channels
+    img_clahe = cv2.cvtColor(limg, cv2.COLOR_LAB2RGB)  # Convert back to RGB
+    return img_clahe