py2ls 0.2.4.15__py3-none-any.whl → 0.2.4.17__py3-none-any.whl

This diff compares two publicly released versions of the package as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in the public registry.
py2ls/ml2ls.py CHANGED
@@ -506,7 +506,7 @@ def get_models(
         "Support Vector Machine(svm)",
         "naive bayes",
         "Linear Discriminant Analysis (lda)",
-        "adaboost",
+        "AdaBoost",
         "DecisionTree",
         "KNeighbors",
         "Bagging",
@@ -585,7 +585,7 @@ def get_features(
         "Support Vector Machine(svm)",
         "naive bayes",
         "Linear Discriminant Analysis (lda)",
-        "adaboost",
+        "AdaBoost",
         "DecisionTree",
         "KNeighbors",
         "Bagging",
@@ -699,9 +699,11 @@ def get_features(
         "Support Vector Machine(svm)",
         "Naive Bayes",
         "Linear Discriminant Analysis (lda)",
-        "adaboost",
+        "AdaBoost",
     ]
     cls = [ips.strcmp(i, cls_)[0] for i in cls]
+
+    feature_importances = {}
 
     # Lasso Feature Selection
     lasso_importances = (
@@ -712,6 +714,7 @@ def get_features(
     lasso_selected_features = (
         lasso_importances.head(n_features)["feature"].values if "lasso" in cls else []
     )
+    feature_importances['lasso']=lasso_importances.head(n_features)
     # Ridge
     ridge_importances = (
         features_ridge(x_train, y_train, ridge_params)
@@ -721,6 +724,7 @@ def get_features(
     selected_ridge_features = (
         ridge_importances.head(n_features)["feature"].values if "ridge" in cls else []
     )
+    feature_importances['ridge']=ridge_importances.head(n_features)
     # Elastic Net
     enet_importances = (
         features_enet(x_train, y_train, enet_params)
@@ -730,6 +734,7 @@ def get_features(
     selected_enet_features = (
         enet_importances.head(n_features)["feature"].values if "Enet" in cls else []
     )
+    feature_importances['Enet']=enet_importances.head(n_features)
     # Random Forest Feature Importance
     rf_importances = (
         features_rf(x_train, y_train, rf_params)
@@ -741,6 +746,7 @@ def get_features(
         if "Random Forest" in cls
         else []
     )
+    feature_importances['Random Forest']=rf_importances.head(n_features)
     # Gradient Boosting Feature Importance
     gb_importances = (
         features_gradient_boosting(x_train, y_train, gb_params)
@@ -752,6 +758,7 @@ def get_features(
         if "Gradient Boosting" in cls
         else []
     )
+    feature_importances['Gradient Boosting']=gb_importances.head(n_features)
     # xgb
     xgb_importances = (
         features_xgb(x_train, y_train, xgb_params) if "xgb" in cls else pd.DataFrame()
@@ -759,6 +766,7 @@ def get_features(
     top_xgb_features = (
         xgb_importances.head(n_features)["feature"].values if "xgb" in cls else []
     )
+    feature_importances['xgb']=xgb_importances.head(n_features)
 
     # SVM with RFE
     selected_svm_features = (
@@ -773,6 +781,7 @@ def get_features(
     selected_lda_features = (
         lda_importances.head(n_features)["feature"].values if "lda" in cls else []
     )
+    feature_importances['lda']=lda_importances.head(n_features)
     # AdaBoost Feature Importance
     adaboost_importances = (
         features_adaboost(x_train, y_train, adaboost_params)
@@ -784,6 +793,7 @@ def get_features(
         if "AdaBoost" in cls
         else []
     )
+    feature_importances['AdaBoost']=adaboost_importances.head(n_features)
     # Decision Tree Feature Importance
     dt_importances = (
         features_decision_tree(x_train, y_train, dt_params)
@@ -794,7 +804,8 @@ def get_features(
         dt_importances.head(n_features)["feature"].values
         if "Decision Tree" in cls
         else []
-    )
+    )
+    feature_importances['Decision Tree']=dt_importances.head(n_features)
     # Bagging Feature Importance
     bagging_importances = (
         features_bagging(x_train, y_train, bagging_params)
@@ -806,6 +817,7 @@ def get_features(
         if "Bagging" in cls
         else []
     )
+    feature_importances['Bagging']=bagging_importances.head(n_features)
     # KNN Feature Importance via Permutation
     knn_importances = (
         features_knn(x_train, y_train, knn_params) if "KNN" in cls else pd.DataFrame()
@@ -813,6 +825,7 @@ def get_features(
     top_knn_features = (
         knn_importances.head(n_features)["feature"].values if "KNN" in cls else []
     )
+    feature_importances['KNN']=knn_importances.head(n_features)
 
     #! Find common features
     common_features = ips.shared(
@@ -915,6 +928,7 @@ def get_features(
         "cv_train_scores": cv_train_results_df,
         "cv_test_scores": rank_models(cv_test_results_df, plot_=plot_),
         "common_features": list(common_features),
+        "feature_importances":feature_importances
     }
     if all([plot_, dir_save]):
         from datetime import datetime
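Note: with this change, get_features() also returns the per-model importance tables under the new "feature_importances" key. A minimal, hypothetical sketch of how a caller might inspect it (the call arguments are omitted and the exact DataFrame columns are an assumption based on this diff, not confirmed by it):

    # hypothetical usage; assumes res["feature_importances"] maps model name -> DataFrame
    res = get_features(x_train, y_train)
    for model_name, imp_df in res["feature_importances"].items():
        print(model_name)
        print(imp_df.head())  # the top n_features rows kept by each selector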
@@ -927,6 +941,7 @@ def get_features(
             "cv_train_scores": pd.DataFrame(),
             "cv_test_scores": pd.DataFrame(),
             "common_features": [],
+            "feature_importances":{}
         }
         print(f"Warning: no common genes were found when n_shared={n_shared}")
         return results
@@ -2033,6 +2048,7 @@ def predict(
     y_train: pd.Series,
     x_true: pd.DataFrame = None,
     y_true: Optional[pd.Series] = None,
+    backward:bool=False, # backward_regression
     common_features: set = None,
     purpose: str = "classification",  # 'classification' or 'regression'
     cls: Optional[Dict[str, Any]] = None,
@@ -2227,11 +2243,21 @@ def predict(
     # else:
     #     y_train = ips.df_encoder(pd.DataFrame(y_train), method="dummy").values.ravel()
     y_train=pd.DataFrame(y_train)
-    y_train_=ips.df_encoder(y_train, method="dummy",drop=None)
-    is_binary = False if y_train_.shape[1] >2 else True
-
-    # if is_binary:
-    #     y_train = ips.df_encoder(pd.DataFrame(y_train), method="label").values.ravel()
+    if y_train.select_dtypes(include=np.number).empty:
+        y_train_=ips.df_encoder(y_train, method="dummy",drop=None)
+        is_binary = False if y_train_.shape[1] >2 else True
+    else:
+        y_train_=ips.flatten(y_train.values)
+        is_binary = False if len(y_train_)>2 else True
+
+    if is_binary:
+        y_train = ips.df_encoder(pd.DataFrame(y_train), method="label")
+    print('is_binary:',is_binary)
+
+    # Perform backward feature selection
+    if backward:
+        selected_features = backward_regression(x_train, y_train, threshold_out=0.05)
+        x_train=x_train[selected_features]
 
     if x_true is None:
         x_train, x_true, y_train, y_true = train_test_split(
@@ -2267,10 +2293,12 @@ def predict(
 
     # y_train=y_train.values.ravel() if y_train is not None else None
     # y_true=y_true.values.ravel() if y_true is not None else None
-    y_train = (
-        y_train.ravel() if isinstance(y_train, np.ndarray) else y_train.values.ravel()
-    )
-    y_true = y_true.ravel() if isinstance(y_true, np.ndarray) else y_true.values.ravel()
+    if y_train is not None:
+        y_train = (
+            y_train.ravel() if isinstance(y_train, np.ndarray) else y_train.values.ravel()
+        )
+    if y_true is not None:
+        y_true = y_true.ravel() if isinstance(y_true, np.ndarray) else y_true.values.ravel()
     # Ensure common features are selected
     if common_features is not None:
         x_train, x_true = x_train[common_features], x_true[common_features]
@@ -2893,7 +2921,11 @@ def predict(
         x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
         y_pred = best_clf.predict(x_true)
         if hasattr(best_clf, "predict_proba"):
-            y_pred_proba = best_clf.predict_proba(x_true)[:, 1]
+            y_pred_proba = best_clf.predict_proba(x_true)
+            print("Shape of predicted probabilities:", y_pred_proba.shape)
+            if y_pred_proba.shape[1] == 1:
+                y_pred_proba = np.hstack([1 - y_pred_proba, y_pred_proba])  # Add missing class probabilities
+            y_pred_proba = y_pred_proba[:, 1]
         elif hasattr(best_clf, "decision_function"):
             # If predict_proba is not available, use decision_function (e.g., for SVM)
             y_pred_proba = best_clf.decision_function(x_true)
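Note: some estimators return a single-column probability matrix when only one class was present during fitting, so indexing column 1 directly would fail; the new branch pads the matrix first. A self-contained sketch of the same idea on synthetic values (not package code):

    import numpy as np

    proba = np.array([[0.9], [0.2], [0.7]])    # shape (3, 1): only one class was seen
    if proba.shape[1] == 1:
        proba = np.hstack([1 - proba, proba])  # shape (3, 2): [P(class 0), P(class 1)]
    positive = proba[:, 1]                     # column 1 is now guaranteed to exist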
@@ -3048,6 +3080,14 @@ def predict(
             }
 
         else:
+            validation_scores = cal_metrics(
+                y_true,
+                y_pred,
+                y_pred_proba=y_pred_proba,
+                is_binary=is_binary,
+                purpose=purpose,
+                average="weighted",
+            )
             results[name] = {
                 "best_clf": gs.best_estimator_,
                 "best_params": gs.best_params_,
@@ -3056,6 +3096,8 @@ def predict(
                 "predictions_proba": (
                     y_pred_proba.tolist() if y_pred_proba is not None else None
                 ),
+                "y_train":y_train if y_train is not None else [],
+                "y_true": y_true if y_true is not None else []
             }
 
     # Convert results to DataFrame
@@ -3078,7 +3120,7 @@ def predict(
         ips.figsave(dir_save + f"scores_sorted_heatmap{now_}.pdf")
 
     df_scores=df_scores.select_dtypes(include=np.number)
-    display(df_scores)
+
     if df_scores.shape[0] > 1:  # draw cluster
         plot.heatmap(df_scores, kind="direct", cluster=True)
         plot.figsets(xangle=30)
@@ -3169,7 +3211,14 @@ def cal_metrics(
 
     # Confusion matrix to calculate specificity
     if is_binary:
-        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
+        cm = confusion_matrix(y_true, y_pred)
+        if cm.size == 4:
+            tn, fp, fn, tp = cm.ravel()
+        else:
+            # Handle single-class predictions
+            tn, fp, fn, tp = 0, 0, 0, 0
+            print("Warning: Only one class found in y_pred or y_true.")
+
         # Specificity calculation
         validation_scores["specificity"] = (
             tn / (tn + fp) if (tn + fp) > 0 else 0
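Note: confusion_matrix() returns a 1x1 array when y_true and y_pred contain only a single class, so unpacking four values from .ravel() raises a ValueError; the cm.size check above guards against that. A standalone illustration (not package code):

    from sklearn.metrics import confusion_matrix

    cm = confusion_matrix([0, 0, 1, 1], [0, 1, 1, 1])      # 2x2 matrix, cm.size == 4
    tn, fp, fn, tp = cm.ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0   # here 1 / (1 + 1) = 0.5

    cm_single = confusion_matrix([1, 1], [1, 1])           # 1x1 matrix, cm_single.size == 1
    # cm_single.ravel() has only one element, so four-way unpacking would fail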
@@ -3217,3 +3266,349 @@ def cal_metrics(
         )
 
     return validation_scores
+
+def plot_trees(
+    X, y, cls, max_trees=500, test_size=0.2, random_state=42, early_stopping_rounds=None
+):
+    """
+    # # Example usage:
+    # X = np.random.rand(100, 10) # Example data with 100 samples and 10 features
+    # y = np.random.randint(0, 2, 100) # Example binary target
+    # # Using the function with different classifiers
+    # # Random Forest example
+    # plot_trees(X, y, RandomForestClassifier(), max_trees=100)
+    # # Gradient Boosting with early stopping example
+    # plot_trees(X, y, GradientBoostingClassifier(), max_trees=100, early_stopping_rounds=10)
+    # # Extra Trees example
+    # plot_trees(X, y, ExtraTreesClassifier(), max_trees=100)
+    Master function to plot error rates (OOB, training, and testing) for different tree-based ensemble classifiers.
+
+    Parameters:
+    - X (array-like): Feature matrix.
+    - y (array-like): Target labels.
+    - cls (object): Tree-based ensemble classifier instance (e.g., RandomForestClassifier()).
+    - max_trees (int): Maximum number of trees to evaluate. Default is 500.
+    - test_size (float): Proportion of data to use as test set for testing error. Default is 0.2.
+    - random_state (int): Random state for reproducibility. Default is 42.
+    - early_stopping_rounds (int): For boosting models only, stops training if validation error doesn't improve after specified rounds.
+
+    Returns:
+    - None
+    """
+    from sklearn.model_selection import train_test_split
+    from sklearn.metrics import accuracy_score
+    from sklearn.ensemble import (
+        RandomForestClassifier,
+        BaggingClassifier,
+        ExtraTreesClassifier,
+    )
+    from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
+    # Split data for training and testing error calculation
+    x_train, x_test, y_train, y_test = train_test_split(
+        X, y, test_size=test_size, random_state=random_state
+    )
+
+    # Initialize lists to store error rates
+    oob_error_rate = []
+    train_error_rate = []
+    test_error_rate = []
+    validation_error = None
+
+    # Configure classifier based on type
+    oob_enabled = False # Default to no OOB error unless explicitly set
+
+    if isinstance(cls, (RandomForestClassifier, ExtraTreesClassifier)):
+        # Enable OOB if cls supports it and is using bootstrapping
+        cls.set_params(warm_start=True, n_estimators=1)
+        if hasattr(cls, "oob_score"):
+            cls.set_params(bootstrap=True, oob_score=True)
+            oob_enabled = True
+    elif isinstance(cls, BaggingClassifier):
+        cls.set_params(warm_start=True, bootstrap=True, oob_score=True, n_estimators=1)
+        oob_enabled = True
+    elif isinstance(cls, (AdaBoostClassifier, GradientBoostingClassifier)):
+        cls.set_params(n_estimators=1)
+        oob_enabled = False
+        if early_stopping_rounds:
+            validation_error = []
+
+    # Train and evaluate with an increasing number of trees
+    for i in range(1, max_trees + 1):
+        cls.set_params(n_estimators=i)
+        cls.fit(x_train, y_train)
+
+        # Calculate OOB error (for models that support it)
+        if oob_enabled and hasattr(cls, "oob_score_") and cls.oob_score:
+            oob_error = 1 - cls.oob_score_
+            oob_error_rate.append(oob_error)
+
+        # Calculate training error
+        train_error = 1 - accuracy_score(y_train, cls.predict(x_train))
+        train_error_rate.append(train_error)
+
+        # Calculate testing error
+        test_error = 1 - accuracy_score(y_test, cls.predict(x_test))
+        test_error_rate.append(test_error)
+
+        # For boosting models, use validation error with early stopping
+        if early_stopping_rounds and isinstance(
+            cls, (AdaBoostClassifier, GradientBoostingClassifier)
+        ):
+            val_error = 1 - accuracy_score(y_test, cls.predict(x_test))
+            validation_error.append(val_error)
+            if len(validation_error) > early_stopping_rounds:
+                # Stop if validation error has not improved in early_stopping_rounds
+                if validation_error[-early_stopping_rounds:] == sorted(
+                    validation_error[-early_stopping_rounds:]
+                ):
+                    print(f"Early stopping at tree {i} due to lack of improvement in validation error.")
+                    break
+
+    # Plot results
+    plt.figure(figsize=(10, 6))
+    if oob_error_rate:
+        plt.plot(
+            range(1, len(oob_error_rate) + 1),
+            oob_error_rate,
+            color="black",
+            label="OOB Error Rate",
+            linewidth=2,
+        )
+    if train_error_rate:
+        plt.plot(
+            range(1, len(train_error_rate) + 1),
+            train_error_rate,
+            linestyle="dotted",
+            color="green",
+            label="Training Error Rate",
+        )
+    if test_error_rate:
+        plt.plot(
+            range(1, len(test_error_rate) + 1),
+            test_error_rate,
+            linestyle="dashed",
+            color="red",
+            label="Testing Error Rate",
+        )
+    if validation_error:
+        plt.plot(
+            range(1, len(validation_error) + 1),
+            validation_error,
+            linestyle="solid",
+            color="blue",
+            label="Validation Error (Boosting)",
+        )
+
+    # Customize plot
+    plt.xlabel("Number of Trees")
+    plt.ylabel("Error Rate")
+    plt.title(f"Error Rate Analysis for {cls.__class__.__name__}")
+    plt.legend(loc="upper right")
+    plt.grid(True)
+    plt.show()
+
+def img_datasets_preprocessing(
+    data: pd.DataFrame,
+    x_col: str,
+    y_col: str = None,
+    target_size: tuple = (224, 224),
+    batch_size: int = 128,
+    class_mode: str = "raw",
+    shuffle: bool = False,
+    augment: bool = False,
+    scaler: str = 'normalize',  # 'normalize', 'standardize', 'clahe', 'raw'
+    grayscale: bool = False,
+    encoder: str = "label",  # Options: 'label', 'onehot', 'binary'
+    label_encoder=None,
+    kws_augmentation: dict = None,
+    verbose: bool = True,
+    drop_missing: bool = True,
+    output="df",  # "iterator": return the data iterator, "df": return a DataFrame
+):
+    """
+    Enhanced preprocessing function for loading and preparing image data from a DataFrame.
+
+    Parameters:
+    - data (pd.DataFrame): Input DataFrame with image paths and labels.
+    - x_col (str): Column in `data` containing image file paths.
+    - y_col (str): Column in `data` containing image labels.
+    - target_size (tuple): Desired image size in (height, width).
+    - batch_size (int): Number of images per batch.
+    - class_mode (str): Mode of label ('raw', 'categorical', 'binary').
+    - shuffle (bool): Shuffle the images in the DataFrame.
+    - augment (bool): Apply data augmentation.
+    - scaler (str): Pixel scaling method: 'normalize', 'standardize', 'clahe', or 'raw'.
+    - grayscale (bool): Convert images to grayscale.
+    - encoder (str): Label encoder method ('label', 'onehot', 'binary').
+    - label_encoder: Optional pre-defined label encoder.
+    - kws_augmentation (dict): Parameters for data augmentation.
+    - verbose (bool): Print status messages.
+    - drop_missing (bool): Drop rows with missing or invalid image paths.
+    - output (str): "iterator" to return the data iterator, "df" to return a DataFrame.
+
+    Returns:
+    - pd.DataFrame: DataFrame with flattened image pixels and 'Label' column.
+    """
+    from tensorflow.keras.preprocessing.image import ImageDataGenerator
+    from tensorflow.keras.utils import to_categorical
+    from sklearn.preprocessing import LabelEncoder
+    from PIL import Image
+    import os
+
+    # Validate input DataFrame for required columns
+    if y_col:
+        assert (
+            x_col in data.columns and y_col in data.columns
+        ), "Missing required columns in DataFrame."
+    if y_col is None:
+        class_mode = None
+    # output format
+    output = ips.strcmp(output, [
+        "generator", "tf", "iterator", "transform", "transformer", "dataframe",
+        "df", "pd", "pandas"])[0]
+
+    # Handle missing file paths
+    if drop_missing:
+        data = data[
+            data[x_col].apply(lambda path: os.path.exists(path) and os.path.isfile(path))
+        ]
+
+    # Encoding labels if necessary
+    if encoder and y_col is not None:
+        if encoder == "binary":
+            data[y_col] = (data[y_col] == data[y_col].unique()[0]).astype(int)
+        elif encoder == "onehot":
+            if not label_encoder:
+                label_encoder = LabelEncoder()
+            data[y_col] = label_encoder.fit_transform(data[y_col])
+            data[y_col] = to_categorical(data[y_col])
+        elif encoder == "label":
+            if not label_encoder:
+                label_encoder = LabelEncoder()
+            data[y_col] = label_encoder.fit_transform(data[y_col])
+
+    # Set up data augmentation
+    if augment:
+        aug_params = {
+            "rotation_range": 20,
+            "width_shift_range": 0.2,
+            "height_shift_range": 0.2,
+            "shear_range": 0.2,
+            "zoom_range": 0.2,
+            "horizontal_flip": True,
+            "fill_mode": "nearest",
+        }
+        if kws_augmentation:
+            aug_params.update(kws_augmentation)
+        dat = ImageDataGenerator(
+            rescale=1.0 / 255 if scaler == 'normalize' else None, **aug_params)
+    else:
+        dat = ImageDataGenerator(
+            rescale=1.0 / 255 if scaler == 'normalize' else None)
+
+    # Create DataFrameIterator
+    data_iterator = dat.flow_from_dataframe(
+        dataframe=data,
+        x_col=x_col,
+        y_col=y_col,
+        target_size=target_size,
+        color_mode="grayscale" if grayscale else "rgb",
+        batch_size=batch_size,
+        class_mode=class_mode,
+        shuffle=shuffle,
+    )
+    print(f"target_size:{target_size}")
+    if output.lower() in ["generator", "tf", "iterator", "transform", "transformer"]:
+        return data_iterator
+    elif output.lower() in ["dataframe", "df", "pd", "pandas"]:
+        # Initialize list to collect processed data
+        data_list = []
+        total_batches = data_iterator.n // batch_size
+
+        # Load, resize, and process images in batches
+        for i, (batch_images, batch_labels) in enumerate(data_iterator):
+            for img, label in zip(batch_images, batch_labels):
+                if scaler in ['normalize', 'raw']:
+                    # Already rescaled by 1.0/255 in ImageDataGenerator
+                    pass
+                elif scaler == 'standardize':
+                    # Standardize by subtracting mean and dividing by std
+                    img = (img - np.mean(img)) / np.std(img)
+                elif scaler == 'clahe':
+                    # Apply CLAHE to the image
+                    img = apply_clahe(img)
+                flat_img = img.flatten()
+                data_list.append(np.append(flat_img, label))
+
+            # Stop when all images have been processed
+            if i >= total_batches:
+                break
+
+        # Define column names for flattened image data
+        pixel_count = target_size[0] * target_size[1] * (1 if grayscale else 3)
+        column_names = [f"pixel_{i}" for i in range(pixel_count)] + ["Label"]
+
+        # Create DataFrame from flattened data
+        df_img = pd.DataFrame(data_list, columns=column_names)
+
+        if verbose:
+            print("Processed images:", len(df_img))
+            print("Final DataFrame shape:", df_img.shape)
+            display(df_img.head())
+
+        return df_img
+
+
+def backward_regression(X:pd.DataFrame, y:pd.Series, initial_list=[], threshold_out=0.05, verbose=True):
+    """
+    # awesome bit of code from https://www.kaggle.com/code/adibouayjan/house-price-step-by-step-modeling
+
+    Evaluates the p-values of all features, which represent the probability of observing a coefficient
+    as extreme as the one calculated if the feature had no true effect on the target.
+
+    Args:
+        X -- features values
+        y -- target variable
+        initial_list -- features header
+        threshold_out -- pvalue threshold of features to drop
+        verbose -- true to produce lots of logging output
+
+    Returns:
+        list of selected features for modeling
+    """
+    import statsmodels.api as sm
+    if isinstance(y, str) and y in X.columns:
+        y_col_name = y
+        y = X[y]
+        X = X.drop(y_col_name, axis=1)
+    included = list(X.columns)
+    while True:
+        changed = False
+        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
+        # exclude the intercept for p-value checking
+        pvalues = model.pvalues.iloc[1:]
+        worst_pval = pvalues.max()
+        if worst_pval > threshold_out:
+            changed = True
+            worst_feature = pvalues.idxmax()
+            included.remove(worst_feature)
+            if verbose:
+                print(f"Removing feature '{worst_feature}' with p-value {worst_pval}")
+        if not changed:
+            break
+    print(f"\nSelected Features:\n{included}")
+    return included  # Returns the list of selected features
+
+
+# Function to apply CLAHE (Contrast Limited Adaptive Histogram Equalization)
+def apply_clahe(img):
+    import cv2
+    lab = cv2.cvtColor(img, cv2.COLOR_RGB2LAB)  # Convert to LAB color space
+    l, a, b = cv2.split(lab)  # Split into channels
+    clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
+    cl = clahe.apply(l)  # Apply CLAHE to the L channel
+    limg = cv2.merge((cl, a, b))  # Merge back the channels
+    img_clahe = cv2.cvtColor(limg, cv2.COLOR_LAB2RGB)  # Convert back to RGB
+    return img_clahe
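Note: backward_regression() above is what predict() calls when backward=True (with threshold_out=0.05). A minimal usage sketch on made-up data, just to show the expected inputs and output (the random data and column names are illustrative only, not from the package):

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(0)
    X = pd.DataFrame(rng.normal(size=(200, 5)), columns=[f"f{i}" for i in range(5)])
    y = 2 * X["f0"] - 3 * X["f2"] + rng.normal(scale=0.1, size=200)

    kept = backward_regression(X, y, threshold_out=0.05)
    # expected to keep roughly ["f0", "f2"] and drop the pure-noise columns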