py2ls 0.2.4.15__py3-none-any.whl → 0.2.4.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/ml2ls.py CHANGED
@@ -506,7 +506,7 @@ def get_models(
  "Support Vector Machine(svm)",
  "naive bayes",
  "Linear Discriminant Analysis (lda)",
- "adaboost",
+ "AdaBoost",
  "DecisionTree",
  "KNeighbors",
  "Bagging",
@@ -585,7 +585,7 @@ def get_features(
  "Support Vector Machine(svm)",
  "naive bayes",
  "Linear Discriminant Analysis (lda)",
- "adaboost",
+ "AdaBoost",
  "DecisionTree",
  "KNeighbors",
  "Bagging",
@@ -699,9 +699,11 @@ def get_features(
  "Support Vector Machine(svm)",
  "Naive Bayes",
  "Linear Discriminant Analysis (lda)",
- "adaboost",
+ "AdaBoost",
  ]
  cls = [ips.strcmp(i, cls_)[0] for i in cls]
+
+ feature_importances = {}

  # Lasso Feature Selection
  lasso_importances = (
@@ -712,6 +714,7 @@ def get_features(
  lasso_selected_features = (
  lasso_importances.head(n_features)["feature"].values if "lasso" in cls else []
  )
+ feature_importances['lasso']=lasso_importances.head(n_features)
  # Ridge
  ridge_importances = (
  features_ridge(x_train, y_train, ridge_params)
@@ -721,6 +724,7 @@ def get_features(
  selected_ridge_features = (
  ridge_importances.head(n_features)["feature"].values if "ridge" in cls else []
  )
+ feature_importances['ridge']=ridge_importances.head(n_features)
  # Elastic Net
  enet_importances = (
  features_enet(x_train, y_train, enet_params)
@@ -730,6 +734,7 @@ def get_features(
  selected_enet_features = (
  enet_importances.head(n_features)["feature"].values if "Enet" in cls else []
  )
+ feature_importances['Enet']=enet_importances.head(n_features)
  # Random Forest Feature Importance
  rf_importances = (
  features_rf(x_train, y_train, rf_params)
@@ -741,6 +746,7 @@ def get_features(
  if "Random Forest" in cls
  else []
  )
+ feature_importances['Random Forest']=rf_importances.head(n_features)
  # Gradient Boosting Feature Importance
  gb_importances = (
  features_gradient_boosting(x_train, y_train, gb_params)
@@ -752,6 +758,7 @@ def get_features(
  if "Gradient Boosting" in cls
  else []
  )
+ feature_importances['Gradient Boosting']=gb_importances.head(n_features)
  # xgb
  xgb_importances = (
  features_xgb(x_train, y_train, xgb_params) if "xgb" in cls else pd.DataFrame()
@@ -759,6 +766,7 @@ def get_features(
  top_xgb_features = (
  xgb_importances.head(n_features)["feature"].values if "xgb" in cls else []
  )
+ feature_importances['xgb']=xgb_importances.head(n_features)

  # SVM with RFE
  selected_svm_features = (
@@ -773,6 +781,7 @@ def get_features(
  selected_lda_features = (
  lda_importances.head(n_features)["feature"].values if "lda" in cls else []
  )
+ feature_importances['lda']=lda_importances.head(n_features)
  # AdaBoost Feature Importance
  adaboost_importances = (
  features_adaboost(x_train, y_train, adaboost_params)
@@ -784,6 +793,7 @@ def get_features(
  if "AdaBoost" in cls
  else []
  )
+ feature_importances['AdaBoost']=adaboost_importances.head(n_features)
  # Decision Tree Feature Importance
  dt_importances = (
  features_decision_tree(x_train, y_train, dt_params)
@@ -794,7 +804,8 @@ def get_features(
  dt_importances.head(n_features)["feature"].values
  if "Decision Tree" in cls
  else []
- )
+ )
+ feature_importances['Decision Tree']=dt_importances.head(n_features)
  # Bagging Feature Importance
  bagging_importances = (
  features_bagging(x_train, y_train, bagging_params)
@@ -806,6 +817,7 @@ def get_features(
  if "Bagging" in cls
  else []
  )
+ feature_importances['Bagging']=bagging_importances.head(n_features)
  # KNN Feature Importance via Permutation
  knn_importances = (
  features_knn(x_train, y_train, knn_params) if "KNN" in cls else pd.DataFrame()
@@ -813,6 +825,7 @@ def get_features(
  top_knn_features = (
  knn_importances.head(n_features)["feature"].values if "KNN" in cls else []
  )
+ feature_importances['KNN']=knn_importances.head(n_features)

  #! Find common features
  common_features = ips.shared(
@@ -915,6 +928,7 @@ def get_features(
  "cv_train_scores": cv_train_results_df,
  "cv_test_scores": rank_models(cv_test_results_df, plot_=plot_),
  "common_features": list(common_features),
+ "feature_importances":feature_importances
  }
  if all([plot_, dir_save]):
  from datetime import datetime
@@ -927,6 +941,7 @@ def get_features(
  "cv_train_scores": pd.DataFrame(),
  "cv_test_scores": pd.DataFrame(),
  "common_features": [],
+ "feature_importances":{}
  }
  print(f"Warning: 没有找到共同的genes, when n_shared={n_shared}")
  return results
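Taken together, the hunks above build up a new feature_importances dict inside get_features() and return it alongside the existing results. A minimal sketch (not part of the package) of how a caller might read the new entry, assuming `results` is the dict returned by get_features() and each value is the head(n_features) importance table with a "feature" column, as the added lines indicate:

    # illustrative only: iterate the new "feature_importances" entry
    for method, imp in results["feature_importances"].items():
        if not imp.empty:
            print(f"{method}: top features -> {imp['feature'].tolist()}")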
@@ -2227,12 +2242,16 @@ def predict(
  # else:
  # y_train = ips.df_encoder(pd.DataFrame(y_train), method="dummy").values.ravel()
  y_train=pd.DataFrame(y_train)
- y_train_=ips.df_encoder(y_train, method="dummy",drop=None)
- is_binary = False if y_train_.shape[1] >2 else True
-
- # if is_binary:
- # y_train = ips.df_encoder(pd.DataFrame(y_train), method="label").values.ravel()
+ if y_train.select_dtypes(include=np.number).empty:
+ y_train_=ips.df_encoder(y_train, method="dummy",drop=None)
+ is_binary = False if y_train_.shape[1] >2 else True
+ else:
+ y_train_=ips.flatten(y_train.values)
+ is_binary = False if len(y_train_)>2 else True

+ if is_binary:
+ y_train = ips.df_encoder(pd.DataFrame(y_train), method="label")
+ print('is_binary:',is_binary)
  if x_true is None:
  x_train, x_true, y_train, y_true = train_test_split(
  x_train,
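The new branch decides whether to dummy-encode the targets by checking whether y_train contains any numeric column. A small sketch of that check with plain pandas (pd.get_dummies stands in here for ips.df_encoder(method="dummy"), which is an assumption about its behaviour):

    import numpy as np
    import pandas as pd

    y = pd.DataFrame({"target": ["a", "b", "a", "c"]})
    print(y.select_dtypes(include=np.number).empty)   # True -> no numeric columns, labels need encoding
    n_classes = pd.get_dummies(y["target"]).shape[1]  # width of the dummy encoding
    print(n_classes > 2)                              # True -> treated as multi-class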
@@ -2893,7 +2912,11 @@ def predict(
  x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
  y_pred = best_clf.predict(x_true)
  if hasattr(best_clf, "predict_proba"):
- y_pred_proba = best_clf.predict_proba(x_true)[:, 1]
+ y_pred_proba = best_clf.predict_proba(x_true)
+ print("Shape of predicted probabilities:", y_pred_proba.shape)
+ if y_pred_proba.shape[1] == 1:
+ y_pred_proba = np.hstack([1 - y_pred_proba, y_pred_proba]) # Add missing class probabilities
+ y_pred_proba = y_pred_proba[:, 1]
  elif hasattr(best_clf, "decision_function"):
  # If predict_proba is not available, use decision_function (e.g., for SVM)
  y_pred_proba = best_clf.decision_function(x_true)
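The added padding handles classifiers that were fitted on a single class and therefore return predict_proba with only one column. A stand-alone sketch of the same numpy step:

    import numpy as np

    proba = np.array([[0.9], [0.4]])           # shape (n_samples, 1): only one class seen during fit
    if proba.shape[1] == 1:
        proba = np.hstack([1 - proba, proba])  # pad so column 1 (positive class) always exists
    y_pred_proba = proba[:, 1]                 # now safe to index the positive-class column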
@@ -3078,7 +3101,7 @@ def predict(
  ips.figsave(dir_save + f"scores_sorted_heatmap{now_}.pdf")

  df_scores=df_scores.select_dtypes(include=np.number)
- display(df_scores)
+
  if df_scores.shape[0] > 1: # draw cluster
  plot.heatmap(df_scores, kind="direct", cluster=True)
  plot.figsets(xangle=30)
@@ -3169,7 +3192,14 @@ def cal_metrics(

  # Confusion matrix to calculate specificity
  if is_binary:
- tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
+ cm = confusion_matrix(y_true, y_pred)
+ if cm.size == 4:
+ tn, fp, fn, tp = cm.ravel()
+ else:
+ # Handle single-class predictions
+ tn, fp, fn, tp = 0, 0, 0, 0
+ print("Warning: Only one class found in y_pred or y_true.")
+
  # Specificity calculation
  validation_scores["specificity"] = (
  tn / (tn + fp) if (tn + fp) > 0 else 0
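This guard exists because sklearn's confusion_matrix collapses to a 1x1 array when y_true and y_pred contain a single class, so the previous 4-way unpack of ravel() would raise. A quick sketch:

    from sklearn.metrics import confusion_matrix

    cm = confusion_matrix([1, 1, 1], [1, 1, 1])   # only one class present
    print(cm.shape)                               # (1, 1)
    tn, fp, fn, tp = cm.ravel() if cm.size == 4 else (0, 0, 0, 0)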
@@ -3217,3 +3247,306 @@ def cal_metrics(
  )

  return validation_scores
+
+ def plot_trees(
+ X, y, cls, max_trees=500, test_size=0.2, random_state=42, early_stopping_rounds=None
+ ):
+ """
+ # # Example usage:
+ # X = np.random.rand(100, 10) # Example data with 100 samples and 10 features
+ # y = np.random.randint(0, 2, 100) # Example binary target
+ # # Using the function with different classifiers
+ # # Random Forest example
+ # plot_trees(X, y, RandomForestClassifier(), max_trees=100)
+ # # Gradient Boosting with early stopping example
+ # plot_trees(X, y, GradientBoostingClassifier(), max_trees=100, early_stopping_rounds=10)
+ # # Extra Trees example
+ # plot_trees(X, y, ExtraTreesClassifier(), max_trees=100)
+ Master function to plot error rates (OOB, training, and testing) for different tree-based ensemble classifiers.
+
+ Parameters:
+ - X (array-like): Feature matrix.
+ - y (array-like): Target labels.
+ - cls (object): Tree-based ensemble classifier instance (e.g., RandomForestClassifier()).
+ - max_trees (int): Maximum number of trees to evaluate. Default is 500.
+ - test_size (float): Proportion of data to use as test set for testing error. Default is 0.2.
+ - random_state (int): Random state for reproducibility. Default is 42.
+ - early_stopping_rounds (int): For boosting models only, stops training if validation error doesn't improve after specified rounds.
+
+ Returns:
+ - None
+ """
+ from sklearn.model_selection import train_test_split
+ from sklearn.metrics import accuracy_score
+ from sklearn.ensemble import (
+ RandomForestClassifier,
+ BaggingClassifier,
+ ExtraTreesClassifier,
+ )
+ from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
+ # Split data for training and testing error calculation
+ x_train, x_test, y_train, y_test = train_test_split(
+ X, y, test_size=test_size, random_state=random_state
+ )
+
+ # Initialize lists to store error rates
+ oob_error_rate = []
+ train_error_rate = []
+ test_error_rate = []
+ validation_error = None
+
+ # Configure classifier based on type
+ oob_enabled = False # Default to no OOB error unless explicitly set
+
+ if isinstance(cls, (RandomForestClassifier, ExtraTreesClassifier)):
+ # Enable OOB if cls supports it and is using bootstrapping
+ cls.set_params(warm_start=True, n_estimators=1)
+ if hasattr(cls, "oob_score"):
+ cls.set_params(bootstrap=True, oob_score=True)
+ oob_enabled = True
+ elif isinstance(cls, BaggingClassifier):
+ cls.set_params(warm_start=True, bootstrap=True, oob_score=True, n_estimators=1)
+ oob_enabled = True
+ elif isinstance(cls, (AdaBoostClassifier, GradientBoostingClassifier)):
+ cls.set_params(n_estimators=1)
+ oob_enabled = False
+ if early_stopping_rounds:
+ validation_error = []
+
+ # Train and evaluate with an increasing number of trees
+ for i in range(1, max_trees + 1):
+ cls.set_params(n_estimators=i)
+ cls.fit(x_train, y_train)
+
+ # Calculate OOB error (for models that support it)
+ if oob_enabled and hasattr(cls, "oob_score_") and cls.oob_score:
+ oob_error = 1 - cls.oob_score_
+ oob_error_rate.append(oob_error)
+
+ # Calculate training error
+ train_error = 1 - accuracy_score(y_train, cls.predict(x_train))
+ train_error_rate.append(train_error)
+
+ # Calculate testing error
+ test_error = 1 - accuracy_score(y_test, cls.predict(x_test))
+ test_error_rate.append(test_error)
+
+ # For boosting models, use validation error with early stopping
+ if early_stopping_rounds and isinstance(
+ cls, (AdaBoostClassifier, GradientBoostingClassifier)
+ ):
+ val_error = 1 - accuracy_score(y_test, cls.predict(x_test))
+ validation_error.append(val_error)
+ if len(validation_error) > early_stopping_rounds:
+ # Stop if validation error has not improved in early_stopping_rounds
+ if validation_error[-early_stopping_rounds:] == sorted(
+ validation_error[-early_stopping_rounds:]
+ ):
+ print(f"Early stopping at tree {i} due to lack of improvement in validation error.")
+ break
+
+ # Plot results
+ plt.figure(figsize=(10, 6))
+ if oob_error_rate:
+ plt.plot(
+ range(1, len(oob_error_rate) + 1),
+ oob_error_rate,
+ color="black",
+ label="OOB Error Rate",
+ linewidth=2,
+ )
+ if train_error_rate:
+ plt.plot(
+ range(1, len(train_error_rate) + 1),
+ train_error_rate,
+ linestyle="dotted",
+ color="green",
+ label="Training Error Rate",
+ )
+ if test_error_rate:
+ plt.plot(
+ range(1, len(test_error_rate) + 1),
+ test_error_rate,
+ linestyle="dashed",
+ color="red",
+ label="Testing Error Rate",
+ )
+ if validation_error:
+ plt.plot(
+ range(1, len(validation_error) + 1),
+ validation_error,
+ linestyle="solid",
+ color="blue",
+ label="Validation Error (Boosting)",
+ )
+
+ # Customize plot
+ plt.xlabel("Number of Trees")
+ plt.ylabel("Error Rate")
+ plt.title(f"Error Rate Analysis for {cls.__class__.__name__}")
+ plt.legend(loc="upper right")
+ plt.grid(True)
+ plt.show()
+
+ def img_datasets_preprocessing(
+ data: pd.DataFrame,
+ x_col: str,
+ y_col: str=None,
+ target_size: tuple = (224, 224),
+ batch_size: int = 128,
+ class_mode: str = "raw",
+ shuffle: bool = False,
+ augment: bool = False,
+ scaler: str = 'normalize', # 'normalize', 'standardize', 'clahe', 'raw'
+ grayscale: bool = False,
+ encoder: str = "label", # Options: 'label', 'onehot', 'binary'
+ label_encoder=None,
+ kws_augmentation: dict = None,
+ verbose: bool = True,
+ drop_missing: bool = True,
+ output="df", # "iterator":data_iterator,'df':return DataFrame
+ ):
+ """
+ Enhanced preprocessing function for loading and preparing image data from a DataFrame.
+
+ Parameters:
+ - df (pd.DataFrame): Input DataFrame with image paths and labels.
+ - x_col (str): Column in `df` containing image file paths.
+ - y_col (str): Column in `df` containing image labels.
+ - target_size (tuple): Desired image size in (height, width).
+ - batch_size (int): Number of images per batch.
+ - class_mode (str): Mode of label ('raw', 'categorical', 'binary').
+ - shuffle (bool): Shuffle the images in the DataFrame.
+ - augment (bool): Apply data augmentation.
+ - scaler (str): 'normalize', # 'normalize', 'standardize', 'clahe', 'raw'
+ - grayscale (bool): Convert images to grayscale.
+ - normalize (bool): Normalize image data to [0, 1] range.
+ - encoder (str): Label encoder method ('label', 'onehot', 'binary').
+ - label_encoder: Optional pre-defined label encoder.
+ - kws_augmentation (dict): Parameters for data augmentation.
+ - verbose (bool): Print status messages.
+ - drop_missing (bool): Drop rows with missing or invalid image paths.
+
+ Returns:
+ - pd.DataFrame: DataFrame with flattened image pixels and 'Label' column.
+ """
+ from tensorflow.keras.preprocessing.image import ImageDataGenerator
+ from tensorflow.keras.utils import to_categorical
+ from sklearn.preprocessing import LabelEncoder
+ from PIL import Image
+ import os
+
+ # Validate input DataFrame for required columns
+ if y_col:
+ assert (
+ x_col in data.columns and y_col in data.columns
+ ), "Missing required columns in DataFrame."
+ if y_col is None:
+ class_mode=None
+ # 输出格式
+ output = ips.strcmp(output,[
+ "generator","tf","iterator","transform","transformer","dataframe",
+ "df","pd","pandas"])[0]
+
+ # Handle missing file paths
+ if drop_missing:
+ data = data[
+ data[x_col].apply(lambda path: os.path.exists(path) and os.path.isfile(path))
+ ]
+
+ # Encoding labels if necessary
+ if encoder and y_col is not None:
+ if encoder == "binary":
+ data[y_col] = (data[y_col] == data[y_col].unique()[0]).astype(int)
+ elif encoder == "onehot":
+ if not label_encoder:
+ label_encoder = LabelEncoder()
+ data[y_col] = label_encoder.fit_transform(data[y_col])
+ data[y_col] = to_categorical(data[y_col])
+ elif encoder == "label":
+ if not label_encoder:
+ label_encoder = LabelEncoder()
+ data[y_col] = label_encoder.fit_transform(data[y_col])
+
+ # Set up data augmentation
+ if augment:
+ aug_params = {
+ "rotation_range": 20,
+ "width_shift_range": 0.2,
+ "height_shift_range": 0.2,
+ "shear_range": 0.2,
+ "zoom_range": 0.2,
+ "horizontal_flip": True,
+ "fill_mode": "nearest",
+ }
+ if kws_augmentation:
+ aug_params.update(kws_augmentation)
+ dat = ImageDataGenerator(rescale=scaler, **aug_params)
+ dat = ImageDataGenerator(
+ rescale=1.0 / 255 if scaler == 'normalize' else None, **aug_params)
+
+ else:
+ dat = ImageDataGenerator(
+ rescale=1.0 / 255 if scaler == 'normalize' else None)
+
+ # Create DataFrameIterator
+ data_iterator = dat.flow_from_dataframe(
+ dataframe=data,
+ x_col=x_col,
+ y_col=y_col,
+ target_size=target_size,
+ color_mode="grayscale" if grayscale else "rgb",
+ batch_size=batch_size,
+ class_mode=class_mode,
+ shuffle=shuffle,
+ )
+ print(f"target_size:{target_size}")
+ if output.lower() in ["generator", "tf", "iterator", "transform", "transformer"]:
+ return data_iterator
+ elif output.lower() in ["dataframe", "df", "pd", "pandas"]:
+ # Initialize list to collect processed data
+ data_list = []
+ total_batches = data_iterator.n // batch_size
+
+ # Load, resize, and process images in batches
+ for i, (batch_images, batch_labels) in enumerate(data_iterator):
+ for img, label in zip(batch_images, batch_labels):
+ if scaler == ['normalize','raw']:
+ # Already rescaled by 1.0/255 in ImageDataGenerator
+ pass
+ elif scaler == 'standardize':
+ # Standardize by subtracting mean and dividing by std
+ img = (img - np.mean(img)) / np.std(img)
+ elif scaler == 'clahe':
+ # Apply CLAHE to the image
+ img = apply_clahe(img)
+ flat_img = img.flatten()
+ data_list.append(np.append(flat_img, label))
+
+ # Stop when all images have been processed
+ if i >= total_batches:
+ break
+
+ # Define column names for flattened image data
+ pixel_count = target_size[0] * target_size[1] * (1 if grayscale else 3)
+ column_names = [f"pixel_{i}" for i in range(pixel_count)] + ["Label"]
+
+ # Create DataFrame from flattened data
+ df_img = pd.DataFrame(data_list, columns=column_names)
+
+ if verbose:
+ print("Processed images:", len(df_img))
+ print("Final DataFrame shape:", df_img.shape)
+ display(df_img.head())
+
+ return df_img
+ # Function to apply CLAHE (Contrast Limited Adaptive Histogram Equalization)
+ def apply_clahe(img):
+ import cv2
+ lab = cv2.cvtColor(img, cv2.COLOR_RGB2LAB) # Convert to LAB color space
+ l, a, b = cv2.split(lab) # Split into channels
+ clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
+ cl = clahe.apply(l) # Apply CLAHE to the L channel
+ limg = cv2.merge((cl, a, b)) # Merge back the channels
+ img_clahe = cv2.cvtColor(limg, cv2.COLOR_LAB2RGB) # Convert back to RGB
+ return img_clahe
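A hedged usage sketch for the new img_datasets_preprocessing helper (file paths, column names, and the import path are invented for illustration; the real call depends on how the module exposes the function and on the images actually being present on disk, since drop_missing=True filters out missing paths):

    import pandas as pd
    # from py2ls.ml2ls import img_datasets_preprocessing   # assumed import path

    df = pd.DataFrame({
        "path": ["images/cat_01.png", "images/dog_01.png"],  # hypothetical files on disk
        "label": ["cat", "dog"],
    })
    df_img = img_datasets_preprocessing(
        df, x_col="path", y_col="label",
        target_size=(64, 64), batch_size=2, grayscale=False, output="df",
    )
    # df_img: one row per image, columns pixel_0 ... pixel_N plus "Label"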