py2ls 0.2.4.15__py3-none-any.whl → 0.2.4.17__py3-none-any.whl

py2ls/ml2ls.py CHANGED
@@ -506,7 +506,7 @@ def get_models(
  "Support Vector Machine(svm)",
  "naive bayes",
  "Linear Discriminant Analysis (lda)",
- "adaboost",
+ "AdaBoost",
  "DecisionTree",
  "KNeighbors",
  "Bagging",
@@ -585,7 +585,7 @@ def get_features(
  "Support Vector Machine(svm)",
  "naive bayes",
  "Linear Discriminant Analysis (lda)",
- "adaboost",
+ "AdaBoost",
  "DecisionTree",
  "KNeighbors",
  "Bagging",
@@ -699,9 +699,11 @@ def get_features(
  "Support Vector Machine(svm)",
  "Naive Bayes",
  "Linear Discriminant Analysis (lda)",
- "adaboost",
+ "AdaBoost",
  ]
  cls = [ips.strcmp(i, cls_)[0] for i in cls]
+
+ feature_importances = {}

  # Lasso Feature Selection
  lasso_importances = (
@@ -712,6 +714,7 @@ def get_features(
  lasso_selected_features = (
  lasso_importances.head(n_features)["feature"].values if "lasso" in cls else []
  )
+ feature_importances['lasso']=lasso_importances.head(n_features)
  # Ridge
  ridge_importances = (
  features_ridge(x_train, y_train, ridge_params)
@@ -721,6 +724,7 @@ def get_features(
  selected_ridge_features = (
  ridge_importances.head(n_features)["feature"].values if "ridge" in cls else []
  )
+ feature_importances['ridge']=ridge_importances.head(n_features)
  # Elastic Net
  enet_importances = (
  features_enet(x_train, y_train, enet_params)
@@ -730,6 +734,7 @@ def get_features(
  selected_enet_features = (
  enet_importances.head(n_features)["feature"].values if "Enet" in cls else []
  )
+ feature_importances['Enet']=enet_importances.head(n_features)
  # Random Forest Feature Importance
  rf_importances = (
  features_rf(x_train, y_train, rf_params)
@@ -741,6 +746,7 @@ def get_features(
  if "Random Forest" in cls
  else []
  )
+ feature_importances['Random Forest']=rf_importances.head(n_features)
  # Gradient Boosting Feature Importance
  gb_importances = (
  features_gradient_boosting(x_train, y_train, gb_params)
@@ -752,6 +758,7 @@ def get_features(
  if "Gradient Boosting" in cls
  else []
  )
+ feature_importances['Gradient Boosting']=gb_importances.head(n_features)
  # xgb
  xgb_importances = (
  features_xgb(x_train, y_train, xgb_params) if "xgb" in cls else pd.DataFrame()
@@ -759,6 +766,7 @@ def get_features(
  top_xgb_features = (
  xgb_importances.head(n_features)["feature"].values if "xgb" in cls else []
  )
+ feature_importances['xgb']=xgb_importances.head(n_features)

  # SVM with RFE
  selected_svm_features = (
@@ -773,6 +781,7 @@ def get_features(
  selected_lda_features = (
  lda_importances.head(n_features)["feature"].values if "lda" in cls else []
  )
+ feature_importances['lda']=lda_importances.head(n_features)
  # AdaBoost Feature Importance
  adaboost_importances = (
  features_adaboost(x_train, y_train, adaboost_params)
@@ -784,6 +793,7 @@ def get_features(
  if "AdaBoost" in cls
  else []
  )
+ feature_importances['AdaBoost']=adaboost_importances.head(n_features)
  # Decision Tree Feature Importance
  dt_importances = (
  features_decision_tree(x_train, y_train, dt_params)
@@ -794,7 +804,8 @@ def get_features(
  dt_importances.head(n_features)["feature"].values
  if "Decision Tree" in cls
  else []
- )
+ )
+ feature_importances['Decision Tree']=dt_importances.head(n_features)
  # Bagging Feature Importance
  bagging_importances = (
  features_bagging(x_train, y_train, bagging_params)
@@ -806,6 +817,7 @@ def get_features(
  if "Bagging" in cls
  else []
  )
+ feature_importances['Bagging']=bagging_importances.head(n_features)
  # KNN Feature Importance via Permutation
  knn_importances = (
  features_knn(x_train, y_train, knn_params) if "KNN" in cls else pd.DataFrame()
@@ -813,6 +825,7 @@ def get_features(
  top_knn_features = (
  knn_importances.head(n_features)["feature"].values if "KNN" in cls else []
  )
+ feature_importances['KNN']=knn_importances.head(n_features)

  #! Find common features
  common_features = ips.shared(
@@ -915,6 +928,7 @@ def get_features(
  "cv_train_scores": cv_train_results_df,
  "cv_test_scores": rank_models(cv_test_results_df, plot_=plot_),
  "common_features": list(common_features),
+ "feature_importances":feature_importances
  }
  if all([plot_, dir_save]):
  from datetime import datetime
@@ -927,6 +941,7 @@ def get_features(
  "cv_train_scores": pd.DataFrame(),
  "cv_test_scores": pd.DataFrame(),
  "common_features": [],
+ "feature_importances":{}
  }
  print(f"Warning: 没有找到共同的genes, when n_shared={n_shared}")
  return results
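A minimal sketch of how the new "feature_importances" entry can be consumed once get_features returns; the call below is abbreviated and the argument names are illustrative, not part of this diff:

    # Each value is the head(n_features) importance table collected above.
    res = get_features(x_train, y_train, n_features=10)
    for model_name, imp_df in res["feature_importances"].items():
        print(model_name, imp_df["feature"].tolist())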
@@ -2033,6 +2048,7 @@ def predict(
  y_train: pd.Series,
  x_true: pd.DataFrame = None,
  y_true: Optional[pd.Series] = None,
+ backward:bool=False, # backward_regression
  common_features: set = None,
  purpose: str = "classification", # 'classification' or 'regression'
  cls: Optional[Dict[str, Any]] = None,
@@ -2227,11 +2243,21 @@ def predict(
  # else:
  # y_train = ips.df_encoder(pd.DataFrame(y_train), method="dummy").values.ravel()
  y_train=pd.DataFrame(y_train)
- y_train_=ips.df_encoder(y_train, method="dummy",drop=None)
- is_binary = False if y_train_.shape[1] >2 else True
-
- # if is_binary:
- # y_train = ips.df_encoder(pd.DataFrame(y_train), method="label").values.ravel()
+ if y_train.select_dtypes(include=np.number).empty:
+ y_train_=ips.df_encoder(y_train, method="dummy",drop=None)
+ is_binary = False if y_train_.shape[1] >2 else True
+ else:
+ y_train_=ips.flatten(y_train.values)
+ is_binary = False if len(y_train_)>2 else True
+
+ if is_binary:
+ y_train = ips.df_encoder(pd.DataFrame(y_train), method="label")
+ print('is_binary:',is_binary)
+
+ # Perform backward feature selection
+ if backward:
+ selected_features = backward_regression(x_train, y_train, threshold_out=0.05)
+ x_train=x_train[selected_features]

  if x_true is None:
  x_train, x_true, y_train, y_true = train_test_split(
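A hedged sketch of calling predict() with the new backward flag; only arguments visible in this diff are used, everything else is left at its defaults, and x_test/y_test are illustrative names:

    # backward=True runs backward_regression(threshold_out=0.05) on x_train
    # before any model fitting.
    res = predict(
        x_train, y_train,
        x_true=x_test, y_true=y_test,
        backward=True,
        purpose="classification",
    )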
@@ -2267,10 +2293,12 @@ def predict(

  # y_train=y_train.values.ravel() if y_train is not None else None
  # y_true=y_true.values.ravel() if y_true is not None else None
- y_train = (
- y_train.ravel() if isinstance(y_train, np.ndarray) else y_train.values.ravel()
- )
- y_true = y_true.ravel() if isinstance(y_true, np.ndarray) else y_true.values.ravel()
+ if y_train is not None:
+ y_train = (
+ y_train.ravel() if isinstance(y_train, np.ndarray) else y_train.values.ravel()
+ )
+ if y_true is not None:
+ y_true = y_true.ravel() if isinstance(y_true, np.ndarray) else y_true.values.ravel()
  # Ensure common features are selected
  if common_features is not None:
  x_train, x_true = x_train[common_features], x_true[common_features]
@@ -2893,7 +2921,11 @@ def predict(
  x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
  y_pred = best_clf.predict(x_true)
  if hasattr(best_clf, "predict_proba"):
- y_pred_proba = best_clf.predict_proba(x_true)[:, 1]
+ y_pred_proba = best_clf.predict_proba(x_true)
+ print("Shape of predicted probabilities:", y_pred_proba.shape)
+ if y_pred_proba.shape[1] == 1:
+ y_pred_proba = np.hstack([1 - y_pred_proba, y_pred_proba]) # Add missing class probabilities
+ y_pred_proba = y_pred_proba[:, 1]
  elif hasattr(best_clf, "decision_function"):
  # If predict_proba is not available, use decision_function (e.g., for SVM)
  y_pred_proba = best_clf.decision_function(x_true)
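The padding above matters when a fitted classifier has only seen one class, so predict_proba returns a single column; a minimal illustration of the np.hstack step:

    import numpy as np
    p = np.array([[0.9], [0.2]])   # shape (n_samples, 1): only one class column
    p = np.hstack([1 - p, p])      # -> [[0.1, 0.9], [0.8, 0.2]]
    pos = p[:, 1]                  # positive-class probability, as used above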
@@ -3048,6 +3080,14 @@ def predict(
  }

  else:
+ validation_scores = cal_metrics(
+ y_true,
+ y_pred,
+ y_pred_proba=y_pred_proba,
+ is_binary=is_binary,
+ purpose=purpose,
+ average="weighted",
+ )
  results[name] = {
  "best_clf": gs.best_estimator_,
  "best_params": gs.best_params_,
@@ -3056,6 +3096,8 @@ def predict(
  "predictions_proba": (
  y_pred_proba.tolist() if y_pred_proba is not None else None
  ),
+ "y_train":y_train if y_train is not None else [],
+ "y_true": y_true if y_true is not None else []
  }

  # Convert results to DataFrame
@@ -3078,7 +3120,7 @@ def predict(
  ips.figsave(dir_save + f"scores_sorted_heatmap{now_}.pdf")

  df_scores=df_scores.select_dtypes(include=np.number)
- display(df_scores)
+
  if df_scores.shape[0] > 1: # draw cluster
  plot.heatmap(df_scores, kind="direct", cluster=True)
  plot.figsets(xangle=30)
@@ -3169,7 +3211,14 @@ def cal_metrics(

  # Confusion matrix to calculate specificity
  if is_binary:
- tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
+ cm = confusion_matrix(y_true, y_pred)
+ if cm.size == 4:
+ tn, fp, fn, tp = cm.ravel()
+ else:
+ # Handle single-class predictions
+ tn, fp, fn, tp = 0, 0, 0, 0
+ print("Warning: Only one class found in y_pred or y_true.")
+
  # Specificity calculation
  validation_scores["specificity"] = (
  tn / (tn + fp) if (tn + fp) > 0 else 0
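The cm.size check guards against sklearn returning a 1x1 matrix when y_true and y_pred contain only one class, in which case .ravel() cannot unpack into four values; a small demonstration (passing labels= is a possible alternative, not what the code above does):

    from sklearn.metrics import confusion_matrix
    confusion_matrix([0, 0, 0], [0, 0, 0])                 # array([[3]]) -> size 1
    confusion_matrix([0, 0, 0], [0, 0, 0], labels=[0, 1])  # always 2x2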
@@ -3217,3 +3266,349 @@ def cal_metrics(
  )

  return validation_scores
+
+ def plot_trees(
+ X, y, cls, max_trees=500, test_size=0.2, random_state=42, early_stopping_rounds=None
+ ):
+ """
+ # # Example usage:
+ # X = np.random.rand(100, 10) # Example data with 100 samples and 10 features
+ # y = np.random.randint(0, 2, 100) # Example binary target
+ # # Using the function with different classifiers
+ # # Random Forest example
+ # plot_trees(X, y, RandomForestClassifier(), max_trees=100)
+ # # Gradient Boosting with early stopping example
+ # plot_trees(X, y, GradientBoostingClassifier(), max_trees=100, early_stopping_rounds=10)
+ # # Extra Trees example
+ # plot_trees(X, y, ExtraTreesClassifier(), max_trees=100)
+ Master function to plot error rates (OOB, training, and testing) for different tree-based ensemble classifiers.
+
+ Parameters:
+ - X (array-like): Feature matrix.
+ - y (array-like): Target labels.
+ - cls (object): Tree-based ensemble classifier instance (e.g., RandomForestClassifier()).
+ - max_trees (int): Maximum number of trees to evaluate. Default is 500.
+ - test_size (float): Proportion of data to use as test set for testing error. Default is 0.2.
+ - random_state (int): Random state for reproducibility. Default is 42.
+ - early_stopping_rounds (int): For boosting models only, stops training if validation error doesn't improve after specified rounds.
+
+ Returns:
+ - None
+ """
+ from sklearn.model_selection import train_test_split
+ from sklearn.metrics import accuracy_score
+ from sklearn.ensemble import (
+ RandomForestClassifier,
+ BaggingClassifier,
+ ExtraTreesClassifier,
+ )
+ from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
+ # Split data for training and testing error calculation
+ x_train, x_test, y_train, y_test = train_test_split(
+ X, y, test_size=test_size, random_state=random_state
+ )
+
+ # Initialize lists to store error rates
+ oob_error_rate = []
+ train_error_rate = []
+ test_error_rate = []
+ validation_error = None
+
+ # Configure classifier based on type
+ oob_enabled = False # Default to no OOB error unless explicitly set
+
+ if isinstance(cls, (RandomForestClassifier, ExtraTreesClassifier)):
+ # Enable OOB if cls supports it and is using bootstrapping
+ cls.set_params(warm_start=True, n_estimators=1)
+ if hasattr(cls, "oob_score"):
+ cls.set_params(bootstrap=True, oob_score=True)
+ oob_enabled = True
+ elif isinstance(cls, BaggingClassifier):
+ cls.set_params(warm_start=True, bootstrap=True, oob_score=True, n_estimators=1)
+ oob_enabled = True
+ elif isinstance(cls, (AdaBoostClassifier, GradientBoostingClassifier)):
+ cls.set_params(n_estimators=1)
+ oob_enabled = False
+ if early_stopping_rounds:
+ validation_error = []
+
+ # Train and evaluate with an increasing number of trees
+ for i in range(1, max_trees + 1):
+ cls.set_params(n_estimators=i)
+ cls.fit(x_train, y_train)
+
+ # Calculate OOB error (for models that support it)
+ if oob_enabled and hasattr(cls, "oob_score_") and cls.oob_score:
+ oob_error = 1 - cls.oob_score_
+ oob_error_rate.append(oob_error)
+
+ # Calculate training error
+ train_error = 1 - accuracy_score(y_train, cls.predict(x_train))
+ train_error_rate.append(train_error)
+
+ # Calculate testing error
+ test_error = 1 - accuracy_score(y_test, cls.predict(x_test))
+ test_error_rate.append(test_error)
+
+ # For boosting models, use validation error with early stopping
+ if early_stopping_rounds and isinstance(
+ cls, (AdaBoostClassifier, GradientBoostingClassifier)
+ ):
+ val_error = 1 - accuracy_score(y_test, cls.predict(x_test))
+ validation_error.append(val_error)
+ if len(validation_error) > early_stopping_rounds:
+ # Stop if validation error has not improved in early_stopping_rounds
+ if validation_error[-early_stopping_rounds:] == sorted(
+ validation_error[-early_stopping_rounds:]
+ ):
+ print(f"Early stopping at tree {i} due to lack of improvement in validation error.")
+ break
+
+ # Plot results
+ plt.figure(figsize=(10, 6))
+ if oob_error_rate:
+ plt.plot(
+ range(1, len(oob_error_rate) + 1),
+ oob_error_rate,
+ color="black",
+ label="OOB Error Rate",
+ linewidth=2,
+ )
+ if train_error_rate:
+ plt.plot(
+ range(1, len(train_error_rate) + 1),
+ train_error_rate,
+ linestyle="dotted",
+ color="green",
+ label="Training Error Rate",
+ )
+ if test_error_rate:
+ plt.plot(
+ range(1, len(test_error_rate) + 1),
+ test_error_rate,
+ linestyle="dashed",
+ color="red",
+ label="Testing Error Rate",
+ )
+ if validation_error:
+ plt.plot(
+ range(1, len(validation_error) + 1),
+ validation_error,
+ linestyle="solid",
+ color="blue",
+ label="Validation Error (Boosting)",
+ )
+
+ # Customize plot
+ plt.xlabel("Number of Trees")
+ plt.ylabel("Error Rate")
+ plt.title(f"Error Rate Analysis for {cls.__class__.__name__}")
+ plt.legend(loc="upper right")
+ plt.grid(True)
+ plt.show()
+
+ def img_datasets_preprocessing(
+ data: pd.DataFrame,
+ x_col: str,
+ y_col: str=None,
+ target_size: tuple = (224, 224),
+ batch_size: int = 128,
+ class_mode: str = "raw",
+ shuffle: bool = False,
+ augment: bool = False,
+ scaler: str = 'normalize', # 'normalize', 'standardize', 'clahe', 'raw'
+ grayscale: bool = False,
+ encoder: str = "label", # Options: 'label', 'onehot', 'binary'
+ label_encoder=None,
+ kws_augmentation: dict = None,
+ verbose: bool = True,
+ drop_missing: bool = True,
+ output="df", # "iterator":data_iterator,'df':return DataFrame
+ ):
+ """
+ Enhanced preprocessing function for loading and preparing image data from a DataFrame.
+
+ Parameters:
+ - df (pd.DataFrame): Input DataFrame with image paths and labels.
+ - x_col (str): Column in `df` containing image file paths.
+ - y_col (str): Column in `df` containing image labels.
+ - target_size (tuple): Desired image size in (height, width).
+ - batch_size (int): Number of images per batch.
+ - class_mode (str): Mode of label ('raw', 'categorical', 'binary').
+ - shuffle (bool): Shuffle the images in the DataFrame.
+ - augment (bool): Apply data augmentation.
+ - scaler (str): 'normalize', # 'normalize', 'standardize', 'clahe', 'raw'
+ - grayscale (bool): Convert images to grayscale.
+ - normalize (bool): Normalize image data to [0, 1] range.
+ - encoder (str): Label encoder method ('label', 'onehot', 'binary').
+ - label_encoder: Optional pre-defined label encoder.
+ - kws_augmentation (dict): Parameters for data augmentation.
+ - verbose (bool): Print status messages.
+ - drop_missing (bool): Drop rows with missing or invalid image paths.
+
+ Returns:
+ - pd.DataFrame: DataFrame with flattened image pixels and 'Label' column.
+ """
+ from tensorflow.keras.preprocessing.image import ImageDataGenerator
+ from tensorflow.keras.utils import to_categorical
+ from sklearn.preprocessing import LabelEncoder
+ from PIL import Image
+ import os
+
+ # Validate input DataFrame for required columns
+ if y_col:
+ assert (
+ x_col in data.columns and y_col in data.columns
+ ), "Missing required columns in DataFrame."
+ if y_col is None:
+ class_mode=None
+ # 输出格式
+ output = ips.strcmp(output,[
+ "generator","tf","iterator","transform","transformer","dataframe",
+ "df","pd","pandas"])[0]
+
+ # Handle missing file paths
+ if drop_missing:
+ data = data[
+ data[x_col].apply(lambda path: os.path.exists(path) and os.path.isfile(path))
+ ]
+
+ # Encoding labels if necessary
+ if encoder and y_col is not None:
+ if encoder == "binary":
+ data[y_col] = (data[y_col] == data[y_col].unique()[0]).astype(int)
+ elif encoder == "onehot":
+ if not label_encoder:
+ label_encoder = LabelEncoder()
+ data[y_col] = label_encoder.fit_transform(data[y_col])
+ data[y_col] = to_categorical(data[y_col])
+ elif encoder == "label":
+ if not label_encoder:
+ label_encoder = LabelEncoder()
+ data[y_col] = label_encoder.fit_transform(data[y_col])
+
+ # Set up data augmentation
+ if augment:
+ aug_params = {
+ "rotation_range": 20,
+ "width_shift_range": 0.2,
+ "height_shift_range": 0.2,
+ "shear_range": 0.2,
+ "zoom_range": 0.2,
+ "horizontal_flip": True,
+ "fill_mode": "nearest",
+ }
+ if kws_augmentation:
+ aug_params.update(kws_augmentation)
+ dat = ImageDataGenerator(rescale=scaler, **aug_params)
+ dat = ImageDataGenerator(
+ rescale=1.0 / 255 if scaler == 'normalize' else None, **aug_params)
+
+ else:
+ dat = ImageDataGenerator(
+ rescale=1.0 / 255 if scaler == 'normalize' else None)
+
+ # Create DataFrameIterator
+ data_iterator = dat.flow_from_dataframe(
+ dataframe=data,
+ x_col=x_col,
+ y_col=y_col,
+ target_size=target_size,
+ color_mode="grayscale" if grayscale else "rgb",
+ batch_size=batch_size,
+ class_mode=class_mode,
+ shuffle=shuffle,
+ )
+ print(f"target_size:{target_size}")
+ if output.lower() in ["generator", "tf", "iterator", "transform", "transformer"]:
+ return data_iterator
+ elif output.lower() in ["dataframe", "df", "pd", "pandas"]:
+ # Initialize list to collect processed data
+ data_list = []
+ total_batches = data_iterator.n // batch_size
+
+ # Load, resize, and process images in batches
+ for i, (batch_images, batch_labels) in enumerate(data_iterator):
+ for img, label in zip(batch_images, batch_labels):
+ if scaler == ['normalize','raw']:
+ # Already rescaled by 1.0/255 in ImageDataGenerator
+ pass
+ elif scaler == 'standardize':
+ # Standardize by subtracting mean and dividing by std
+ img = (img - np.mean(img)) / np.std(img)
+ elif scaler == 'clahe':
+ # Apply CLAHE to the image
+ img = apply_clahe(img)
+ flat_img = img.flatten()
+ data_list.append(np.append(flat_img, label))
+
+ # Stop when all images have been processed
+ if i >= total_batches:
+ break
+
+ # Define column names for flattened image data
+ pixel_count = target_size[0] * target_size[1] * (1 if grayscale else 3)
+ column_names = [f"pixel_{i}" for i in range(pixel_count)] + ["Label"]
+
+ # Create DataFrame from flattened data
+ df_img = pd.DataFrame(data_list, columns=column_names)
+
+ if verbose:
+ print("Processed images:", len(df_img))
+ print("Final DataFrame shape:", df_img.shape)
+ display(df_img.head())
+
+ return df_img
+
+
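A hedged usage sketch for img_datasets_preprocessing; the DataFrame columns ("path", "label") and the image files are purely illustrative:

    df = pd.DataFrame({"path": ["imgs/cat_001.png", "imgs/dog_001.png"],
                       "label": ["cat", "dog"]})
    df_img = img_datasets_preprocessing(
        df, x_col="path", y_col="label",
        target_size=(64, 64), batch_size=32,
        scaler="normalize", grayscale=True, output="df",
    )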
+ def backward_regression(X:pd.DataFrame, y:pd.Series, initial_list=[], threshold_out=0.05, verbose=True):
+ """
+ # awesome bit of code from https://www.kaggle.com/code/adibouayjan/house-price-step-by-step-modeling
+
+ Evaluates the p-values of all features, which represent the probability of observing a coefficient
+ as extreme as the one calculated if the feature had no true effect on the target.
+
+ Args:
+ X -- features values
+ y -- target variable
+ initial_list -- features header
+ threshold_out -- pvalue threshold of features to drop
+ verbose -- true to produce lots of logging output
+
+ Returns:
+ list of selected features for modeling
+ """
+ import statsmodels.api as sm
+ if isinstance(y, str) and y in X.columns:
+ y_col_name = y
+ y = X[y]
+ X = X.drop(y_col_name, axis=1)
+ included = list(X.columns)
+ while True:
+ changed = False
+ model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
+ # exclude the intercept for p-value checking
+ pvalues = model.pvalues.iloc[1:]
+ worst_pval = pvalues.max()
+ if worst_pval > threshold_out:
+ changed = True
+ worst_feature = pvalues.idxmax()
+ included.remove(worst_feature)
+ if verbose:
+ print(f"Removing feature '{worst_feature}' with p-value {worst_pval}")
+ if not changed:
+ break
+ print(f"\nSelected Features:\n{included}")
+ return included # Returns the list of selected features
+
+
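backward_regression can also be used on its own, outside predict(); a minimal sketch assuming X is a numeric feature DataFrame and y the matching target Series:

    kept = backward_regression(X, y, threshold_out=0.05, verbose=True)
    X_reduced = X[kept]   # keep only the features whose p-values survived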
+ # Function to apply CLAHE (Contrast Limited Adaptive Histogram Equalization)
+ def apply_clahe(img):
+ import cv2
+ lab = cv2.cvtColor(img, cv2.COLOR_RGB2LAB) # Convert to LAB color space
+ l, a, b = cv2.split(lab) # Split into channels
+ clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
+ cl = clahe.apply(l) # Apply CLAHE to the L channel
+ limg = cv2.merge((cl, a, b)) # Merge back the channels
+ img_clahe = cv2.cvtColor(limg, cv2.COLOR_LAB2RGB) # Convert back to RGB
+ return img_clahe