PyPI - py2ls - Versions diffs - 0.2.4.14__py3-none-any.whl → 0.2.4.16__py3-none-any.whl - Mend

py2ls 0.2.4.14__py3-none-any.whl → 0.2.4.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

py2ls/.git/index +0 -0
py2ls/ips.py +722 -12
py2ls/ml2ls copy.py +2906 -0
py2ls/ml2ls.py +898 -243
py2ls/plot.py +409 -24
py2ls/translator.py +2 -0
{py2ls-0.2.4.14.dist-info → py2ls-0.2.4.16.dist-info}/METADATA +1 -1
{py2ls-0.2.4.14.dist-info → py2ls-0.2.4.16.dist-info}/RECORD +9 -8
{py2ls-0.2.4.14.dist-info → py2ls-0.2.4.16.dist-info}/WHEEL +0 -0

py2ls/ml2ls.py CHANGED Viewed

@@ -506,7 +506,7 @@ def get_models(
         "Support Vector Machine(svm)",
         "naive bayes",
         "Linear Discriminant Analysis (lda)",
-        "adaboost",
+        "AdaBoost",
         "DecisionTree",
         "KNeighbors",
         "Bagging",
@@ -585,7 +585,7 @@ def get_features(
         "Support Vector Machine(svm)",
         "naive bayes",
         "Linear Discriminant Analysis (lda)",
-        "adaboost",
+        "AdaBoost",
         "DecisionTree",
         "KNeighbors",
         "Bagging",
@@ -616,10 +616,10 @@ def get_features(
     if isinstance(y, str) and y in X.columns:
         y_col_name = y
         y = X[y]
-        y = ips.df_encoder(pd.DataFrame(y), method="dummy")
+        y = ips.df_encoder(pd.DataFrame(y), method="label")
         X = X.drop(y_col_name, axis=1)
     else:
-        y = ips.df_encoder(pd.DataFrame(y), method="dummy").values.ravel()
+        y = ips.df_encoder(pd.DataFrame(y), method="label").values.ravel()
     y = y.loc[X.index]  # Align y with X after dropping rows with missing values in X
     y = y.ravel() if isinstance(y, np.ndarray) else y.values.ravel()
@@ -699,9 +699,11 @@ def get_features(
         "Support Vector Machine(svm)",
         "Naive Bayes",
         "Linear Discriminant Analysis (lda)",
-        "adaboost",
+        "AdaBoost",
     ]
     cls = [ips.strcmp(i, cls_)[0] for i in cls]
+    feature_importances = {}
     # Lasso Feature Selection
     lasso_importances = (
@@ -712,6 +714,7 @@ def get_features(
     lasso_selected_features = (
         lasso_importances.head(n_features)["feature"].values if "lasso" in cls else []
     )
+    feature_importances['lasso']=lasso_importances.head(n_features)
     # Ridge
     ridge_importances = (
         features_ridge(x_train, y_train, ridge_params)
@@ -721,6 +724,7 @@ def get_features(
     selected_ridge_features = (
         ridge_importances.head(n_features)["feature"].values if "ridge" in cls else []
     )
+    feature_importances['ridge']=ridge_importances.head(n_features)
     # Elastic Net
     enet_importances = (
         features_enet(x_train, y_train, enet_params)
@@ -730,6 +734,7 @@ def get_features(
     selected_enet_features = (
         enet_importances.head(n_features)["feature"].values if "Enet" in cls else []
     )
+    feature_importances['Enet']=enet_importances.head(n_features)
     # Random Forest Feature Importance
     rf_importances = (
         features_rf(x_train, y_train, rf_params)
@@ -741,6 +746,7 @@ def get_features(
         if "Random Forest" in cls
         else []
     )
+    feature_importances['Random Forest']=rf_importances.head(n_features)
     # Gradient Boosting Feature Importance
     gb_importances = (
         features_gradient_boosting(x_train, y_train, gb_params)
@@ -752,6 +758,7 @@ def get_features(
         if "Gradient Boosting" in cls
         else []
     )
+    feature_importances['Gradient Boosting']=gb_importances.head(n_features)
     # xgb
     xgb_importances = (
         features_xgb(x_train, y_train, xgb_params) if "xgb" in cls else pd.DataFrame()
@@ -759,6 +766,7 @@ def get_features(
     top_xgb_features = (
         xgb_importances.head(n_features)["feature"].values if "xgb" in cls else []
     )
+    feature_importances['xgb']=xgb_importances.head(n_features)
     # SVM with RFE
     selected_svm_features = (
@@ -773,6 +781,7 @@ def get_features(
     selected_lda_features = (
         lda_importances.head(n_features)["feature"].values if "lda" in cls else []
     )
+    feature_importances['lda']=lda_importances.head(n_features)
     # AdaBoost Feature Importance
     adaboost_importances = (
         features_adaboost(x_train, y_train, adaboost_params)
@@ -784,6 +793,7 @@ def get_features(
         if "AdaBoost" in cls
         else []
     )
+    feature_importances['AdaBoost']=adaboost_importances.head(n_features)
     # Decision Tree Feature Importance
     dt_importances = (
         features_decision_tree(x_train, y_train, dt_params)
@@ -794,7 +804,8 @@ def get_features(
         dt_importances.head(n_features)["feature"].values
         if "Decision Tree" in cls
         else []
-    )
+    )
+    feature_importances['Decision Tree']=dt_importances.head(n_features)
     # Bagging Feature Importance
     bagging_importances = (
         features_bagging(x_train, y_train, bagging_params)
@@ -806,6 +817,7 @@ def get_features(
         if "Bagging" in cls
         else []
     )
+    feature_importances['Bagging']=bagging_importances.head(n_features)
     # KNN Feature Importance via Permutation
     knn_importances = (
         features_knn(x_train, y_train, knn_params) if "KNN" in cls else pd.DataFrame()
@@ -813,6 +825,7 @@ def get_features(
     top_knn_features = (
         knn_importances.head(n_features)["feature"].values if "KNN" in cls else []
     )
+    feature_importances['KNN']=knn_importances.head(n_features)
     #! Find common features
     common_features = ips.shared(
@@ -915,6 +928,7 @@ def get_features(
             "cv_train_scores": cv_train_results_df,
             "cv_test_scores": rank_models(cv_test_results_df, plot_=plot_),
             "common_features": list(common_features),
+            "feature_importances":feature_importances
         }
         if all([plot_, dir_save]):
             from datetime import datetime
@@ -927,6 +941,7 @@ def get_features(
             "cv_train_scores": pd.DataFrame(),
             "cv_test_scores": pd.DataFrame(),
             "common_features": [],
+            "feature_importances":{}
         }
         print(f"Warning: 没有找到共同的genes, when n_shared={n_shared}")
     return results
@@ -1217,142 +1232,335 @@ def validate_features(
 # # If you want to access validation scores
 # print(validation_results)
-def plot_validate_features(res_val):
+def plot_validate_features(res_val,is_binary=True,figsize=None):
     """
     plot the results of 'validate_features()'
     """
-    colors = plot.get_color(len(ips.flatten(res_val["pr_curve"].index)))
-    if res_val.shape[0] > 5:
-        alpha = 0
-        figsize = [8, 10]
-        subplot_layout = [1, 2]
-        ncols = 2
-        bbox_to_anchor = [1.5, 0.6]
-    else:
-        alpha = 0.03
-        figsize = [10, 6]
-        subplot_layout = [1, 1]
-        ncols = 1
-        bbox_to_anchor = [1, 1]
-    nexttile = plot.subplot(figsize=figsize)
-    ax = nexttile(subplot_layout[0], subplot_layout[1])
-    for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-        fpr = res_val["roc_curve"][model_name]["fpr"]
-        tpr = res_val["roc_curve"][model_name]["tpr"]
-        (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
-        mean_auc = res_val["roc_curve"][model_name]["auc"]
-        plot_roc_curve(
-            fpr,
-            tpr,
-            mean_auc,
-            lower_ci,
-            upper_ci,
-            model_name=model_name,
-            lw=1.5,
-            color=colors[i],
-            alpha=alpha,
-            ax=ax,
+    if is_binary:
+        colors = plot.get_color(len(ips.flatten(res_val["pr_curve"].index)))
+        if res_val.shape[0] > 5:
+            alpha = 0
+            figsize = [8, 10] if figsize is None else figsize
+            subplot_layout = [1, 2]
+            ncols = 2
+            bbox_to_anchor = [1.5, 0.6]
+        else:
+            alpha = 0.03
+            figsize = [10, 6] if figsize is None else figsize
+            subplot_layout = [1, 1]
+            ncols = 1
+            bbox_to_anchor = [1, 1]
+        nexttile = plot.subplot(figsize=figsize)
+        ax = nexttile(subplot_layout[0], subplot_layout[1])
+        for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
+            fpr = res_val["roc_curve"][model_name]["fpr"]
+            tpr = res_val["roc_curve"][model_name]["tpr"]
+            (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
+            mean_auc = res_val["roc_curve"][model_name]["auc"]
+            plot_roc_curve(
+                fpr,
+                tpr,
+                mean_auc,
+                lower_ci,
+                upper_ci,
+                model_name=model_name,
+                lw=1.5,
+                color=colors[i],
+                alpha=alpha,
+                ax=ax,
+            )
+        plot.figsets(
+            sp=2,
+            legend=dict(
+                loc="upper right",
+                ncols=ncols,
+                fontsize=8,
+                bbox_to_anchor=[1.5, 0.6],
+                markerscale=0.8,
+            ),
         )
-    plot.figsets(
-        sp=2,
-        legend=dict(
-            loc="upper right",
-            ncols=ncols,
-            fontsize=8,
-            bbox_to_anchor=[1.5, 0.6],
-            markerscale=0.8,
-        ),
-    )
-    # plot.split_legend(ax,n=2, loc=["upper left", "lower left"],bbox=[[1,0.5],[1,0.5]],ncols=2,labelcolor="k",fontsize=8)
-    ax = nexttile(subplot_layout[0], subplot_layout[1])
-    for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-        plot_pr_curve(
-            recall=res_val["pr_curve"][model_name]["recall"],
-            precision=res_val["pr_curve"][model_name]["precision"],
-            avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
-            model_name=model_name,
-            color=colors[i],
-            lw=1.5,
-            alpha=alpha,
-            ax=ax,
+        # plot.split_legend(ax,n=2, loc=["upper left", "lower left"],bbox=[[1,0.5],[1,0.5]],ncols=2,labelcolor="k",fontsize=8)
+        ax = nexttile(subplot_layout[0], subplot_layout[1])
+        for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
+            plot_pr_curve(
+                recall=res_val["pr_curve"][model_name]["recall"],
+                precision=res_val["pr_curve"][model_name]["precision"],
+                avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
+                model_name=model_name,
+                color=colors[i],
+                lw=1.5,
+                alpha=alpha,
+                ax=ax,
+            )
+        plot.figsets(
+            sp=2,
+            legend=dict(loc="upper right", ncols=1, fontsize=8, bbox_to_anchor=[1.5, 0.5]),
         )
-    plot.figsets(
-        sp=2,
-        legend=dict(loc="upper right", ncols=1, fontsize=8, bbox_to_anchor=[1.5, 0.5]),
-    )
-    # plot.split_legend(ax,n=2, loc=["upper left", "lower left"],bbox=[[1,0.5],[1,0.5]],ncols=2,labelcolor="k",fontsize=8)
+        # plot.split_legend(ax,n=2, loc=["upper left", "lower left"],bbox=[[1,0.5],[1,0.5]],ncols=2,labelcolor="k",fontsize=8)
+    else:
+        colors = plot.get_color(len(ips.flatten(res_val["pr_curve"].index)))
+        modname_tmp=ips.flatten(res_val["roc_curve"].index)[0]
+        classes=list(res_val["roc_curve"][modname_tmp]['fpr'].keys())
+        if res_val.shape[0] > 5:
+            alpha = 0
+            figsize = [8, 8*2*(len(classes))]  if figsize is None else figsize
+            subplot_layout = [1, 2]
+            ncols = 2
+            bbox_to_anchor = [1.5, 0.6]
+        else:
+            alpha = 0.03
+            figsize = [10, 6*(len(classes))] if figsize is None else figsize
+            subplot_layout = [1, 1]
+            ncols = 1
+            bbox_to_anchor = [1, 1]
+        nexttile = plot.subplot(2*(len(classes)),2,figsize=figsize)
+        for iclass, class_ in enumerate(classes):
+            ax = nexttile(subplot_layout[0], subplot_layout[1])
+            for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
+                fpr = res_val["roc_curve"][model_name]["fpr"][class_]
+                tpr = res_val["roc_curve"][model_name]["tpr"][class_]
+                (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"][iclass]
+                mean_auc = res_val["roc_curve"][model_name]["auc"][class_]
+                plot_roc_curve(
+                    fpr,
+                    tpr,
+                    mean_auc,
+                    lower_ci,
+                    upper_ci,
+                    model_name=model_name,
+                    lw=1.5,
+                    color=colors[i],
+                    alpha=alpha,
+                    ax=ax,
+                )
+            plot.figsets(
+                sp=2,
+                title=class_,
+                legend=dict(
+                    loc="upper right",
+                    ncols=ncols,
+                    fontsize=8,
+                    bbox_to_anchor=[1.5, 0.6],
+                    markerscale=0.8,
+                ),
+            )
+            # plot.split_legend(ax,n=2, loc=["upper left", "lower left"],bbox=[[1,0.5],[1,0.5]],ncols=2,labelcolor="k",fontsize=8)
+            ax = nexttile(subplot_layout[0], subplot_layout[1])
+            for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
+                plot_pr_curve(
+                    recall=res_val["pr_curve"][model_name]["recall"][iclass],
+                    precision=res_val["pr_curve"][model_name]["precision"][iclass],
+                    avg_precision=res_val["pr_curve"][model_name]["avg_precision"][iclass],
+                    model_name=model_name,
+                    color=colors[i],
+                    lw=1.5,
+                    alpha=alpha,
+                    ax=ax,
+                )
+            plot.figsets(
+                sp=2,
+                title=class_,
+                legend=dict(loc="upper right", ncols=1, fontsize=8, bbox_to_anchor=[1.5, 0.5]),
+            )
+def plot_validate_features_single(res_val, figsize=None,is_binary=True):
+    if is_binary:
+        if figsize is None:
+            nexttile = plot.subplot(len(ips.flatten(res_val["pr_curve"].index)), 3,figsize=[13,4*len(ips.flatten(res_val["pr_curve"].index))])
+        else:
+            nexttile = plot.subplot(
+                len(ips.flatten(res_val["pr_curve"].index)), 3, figsize=figsize
+            )
+        for model_name in ips.flatten(res_val["pr_curve"].index):
+            fpr = res_val["roc_curve"][model_name]["fpr"]
+            tpr = res_val["roc_curve"][model_name]["tpr"]
+            (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
+            mean_auc = res_val["roc_curve"][model_name]["auc"]
+            # Plotting
+            plot_roc_curve(fpr, tpr, mean_auc, lower_ci, upper_ci,
+                model_name=model_name, ax=nexttile())
+            plot.figsets(title=model_name, sp=2)
+            plot_pr_binary(
+                recall=res_val["pr_curve"][model_name]["recall"],
+                precision=res_val["pr_curve"][model_name]["precision"],
+                avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
+                model_name=model_name,
+                ax=nexttile(),
+            )
+            plot.figsets(title=model_name, sp=2)
-def plot_validate_features_single(res_val, figsize=None):
-    if figsize is None:
-        nexttile = plot.subplot(len(ips.flatten(res_val["pr_curve"].index)), 3)
+            # plot cm
+            plot_cm(res_val["confusion_matrix"][model_name], ax=nexttile(), normalize=False)
+            plot.figsets(title=model_name, sp=2)
     else:
-        nexttile = plot.subplot(
-            len(ips.flatten(res_val["pr_curve"].index)), 3, figsize=figsize
-        )
-    for model_name in ips.flatten(res_val["pr_curve"].index):
-        fpr = res_val["roc_curve"][model_name]["fpr"]
-        tpr = res_val["roc_curve"][model_name]["tpr"]
-        (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
-        mean_auc = res_val["roc_curve"][model_name]["auc"]
-        # Plotting
-        plot_roc_curve(fpr, tpr, mean_auc, lower_ci, upper_ci,
-            model_name=model_name, ax=nexttile())
-        plot.figsets(title=model_name, sp=2)
-        plot_pr_binary(
-            recall=res_val["pr_curve"][model_name]["recall"],
-            precision=res_val["pr_curve"][model_name]["precision"],
-            avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
-            model_name=model_name,
-            ax=nexttile(),
-        )
-        plot.figsets(title=model_name, sp=2)
+        modname_tmp=ips.flatten(res_val["roc_curve"].index)[0]
+        classes=list(res_val["roc_curve"][modname_tmp]['fpr'].keys())
+        if figsize is None:
+            nexttile = plot.subplot(len(modname_tmp), 3,figsize=[15,len(modname_tmp)*5])
+        else:
+            nexttile = plot.subplot(len(modname_tmp), 3, figsize=figsize)
+        colors = plot.get_color(len(classes))
+        for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
+            ax = nexttile()
+            for iclass, class_ in enumerate(classes):
+                fpr = res_val["roc_curve"][model_name]["fpr"][class_]
+                tpr = res_val["roc_curve"][model_name]["tpr"][class_]
+                (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"][iclass]
+                mean_auc = res_val["roc_curve"][model_name]["auc"][class_]
+                plot_roc_curve(
+                    fpr,
+                    tpr,
+                    mean_auc,
+                    lower_ci,
+                    upper_ci,
+                    model_name=class_,
+                    lw=1.5,
+                    color=colors[iclass],
+                    alpha=0.03,
+                    ax=ax,
+                )
+            plot.figsets(
+                sp=2,
+                title=model_name,
+                legend=dict(
+                    loc="best",
+                    fontsize=8,
+                ),
+            )
+            ax = nexttile()
+            for iclass, class_ in enumerate(classes):
+                plot_pr_curve(
+                    recall=res_val["pr_curve"][model_name]["recall"][iclass],
+                    precision=res_val["pr_curve"][model_name]["precision"][iclass],
+                    avg_precision=res_val["pr_curve"][model_name]["avg_precision"][iclass],
+                    model_name=class_,
+                    color=colors[iclass],
+                    lw=1.5,
+                    alpha=0.03,
+                    ax=ax,
+                )
+            plot.figsets(
+                sp=2,
+                title=class_,
+                legend=dict(loc="best", fontsize=8),
+            )
+            plot_cm(res_val["confusion_matrix"][model_name],labels_name=classes, ax=nexttile(), normalize=False)
+            plot.figsets(title=model_name, sp=2)
-        # plot cm
-        plot_cm(res_val["confusion_matrix"][model_name], ax=nexttile(), normalize=False)
-        plot.figsets(title=model_name, sp=2)
+def cal_precision_recall(
+    y_true, y_pred_proba, is_binary=True):
+    if is_binary:
+        precision_, recall_, _ = precision_recall_curve(y_true, y_pred_proba)
+        avg_precision_ = average_precision_score(y_true, y_pred_proba)
+        return precision_, recall_,avg_precision_
+    else:
+        n_classes = y_pred_proba.shape[1]  # Number of classes
+        precision_ = []
+        recall_ = []
+        # One-vs-rest approach for multi-class precision-recall curve
+        for class_idx in range(n_classes):
+            precision, recall, _ = precision_recall_curve(
+                (y_true == class_idx).astype(int),  # Binarize true labels for the current class
+                y_pred_proba[:, class_idx],  # Probabilities for the current class
+            )
+            precision_.append(precision)
+            recall_.append(recall)
+        # Optionally, you can compute average precision for each class
+        avg_precision_ = []
+        for class_idx in range(n_classes):
+            avg_precision = average_precision_score(
+                (y_true == class_idx).astype(int),  # Binarize true labels for the current class
+                y_pred_proba[:, class_idx],  # Probabilities for the current class
+            )
+            avg_precision_.append(avg_precision)
+        return precision_, recall_,avg_precision_
 def cal_auc_ci(
-    y_true, y_pred, n_bootstraps=1000, ci=0.95, random_state=1, verbose=True
+    y_true, y_pred, n_bootstraps=1000, ci=0.95, random_state=1,is_binary=True, verbose=True
 ):
-    y_true = np.asarray(y_true)
-    y_pred = np.asarray(y_pred)
-    bootstrapped_scores = []
-    if verbose:
-        print("auroc score:", roc_auc_score(y_true, y_pred))
-    rng = np.random.RandomState(random_state)
-    for i in range(n_bootstraps):
-        # bootstrap by sampling with replacement on the prediction indices
-        indices = rng.randint(0, len(y_pred), len(y_pred))
-        if len(np.unique(y_true[indices])) < 2:
-            # We need at least one positive and one negative sample for ROC AUC
-            # to be defined: reject the sample
-            continue
-        if isinstance(y_true, np.ndarray):
-            score = roc_auc_score(y_true[indices], y_pred[indices])
-        else:
-            score = roc_auc_score(y_true.iloc[indices], y_pred.iloc[indices])
-        bootstrapped_scores.append(score)
-        # print("Bootstrap #{} ROC area: {:0.3f}".format(i + 1, score))
-    sorted_scores = np.array(bootstrapped_scores)
-    sorted_scores.sort()
-    # Computing the lower and upper bound of the 90% confidence interval
-    # You can change the bounds percentiles to 0.025 and 0.975 to get
-    # a 95% confidence interval instead.
-    confidence_lower = sorted_scores[int((1 - ci) * len(sorted_scores))]
-    confidence_upper = sorted_scores[int(ci * len(sorted_scores))]
-    if verbose:
-        print(
-            "Confidence interval for the score: [{:0.3f} - {:0.3}]".format(
-                confidence_lower, confidence_upper
+    if is_binary:
+        y_true = np.asarray(y_true)
+        y_pred = np.asarray(y_pred)
+        bootstrapped_scores = []
+        if verbose:
+            print("auroc score:", roc_auc_score(y_true, y_pred))
+        rng = np.random.RandomState(random_state)
+        for i in range(n_bootstraps):
+            # bootstrap by sampling with replacement on the prediction indices
+            indices = rng.randint(0, len(y_pred), len(y_pred))
+            if len(np.unique(y_true[indices])) < 2:
+                # We need at least one positive and one negative sample for ROC AUC
+                # to be defined: reject the sample
+                continue
+            if isinstance(y_true, np.ndarray):
+                score = roc_auc_score(y_true[indices], y_pred[indices])
+            else:
+                score = roc_auc_score(y_true.iloc[indices], y_pred.iloc[indices])
+            bootstrapped_scores.append(score)
+            # print("Bootstrap #{} ROC area: {:0.3f}".format(i + 1, score))
+        sorted_scores = np.array(bootstrapped_scores)
+        sorted_scores.sort()
+        # Computing the lower and upper bound of the 90% confidence interval
+        # You can change the bounds percentiles to 0.025 and 0.975 to get
+        # a 95% confidence interval instead.
+        confidence_lower = sorted_scores[int((1 - ci) * len(sorted_scores))]
+        confidence_upper = sorted_scores[int(ci * len(sorted_scores))]
+        if verbose:
+            print(
+                "Confidence interval for the score: [{:0.3f} - {:0.3}]".format(
+                    confidence_lower, confidence_upper
+                )
             )
-        )
-    return confidence_lower, confidence_upper
+        return confidence_lower, confidence_upper
+    else:
+        from sklearn.preprocessing import label_binarize
+        # Multi-class classification case
+        y_true = np.asarray(y_true)
+        y_pred = np.asarray(y_pred)
+        # Binarize the multi-class labels for OvR computation
+        y_true_bin = label_binarize(y_true, classes=np.unique(y_true))  # One-vs-Rest transformation
+        n_classes = y_true_bin.shape[1]  # Number of classes
+        bootstrapped_scores = np.zeros((n_classes, n_bootstraps))  # Store scores for each class
+        if verbose:
+            print("AUROC scores for each class:")
+            for i in range(n_classes):
+                print(f"Class {i}: {roc_auc_score(y_true_bin[:, i], y_pred[:, i])}")
+        rng = np.random.RandomState(random_state)
+        for i in range(n_bootstraps):
+            indices = rng.randint(0, len(y_pred), len(y_pred))
+            for class_idx in range(n_classes):
+                if len(np.unique(y_true_bin[indices, class_idx])) < 2:
+                    continue  # Reject if the class doesn't have both positive and negative samples
+                score = roc_auc_score(y_true_bin[indices, class_idx], y_pred[indices, class_idx])
+                bootstrapped_scores[class_idx, i] = score
+        # Calculating the confidence intervals for each class
+        confidence_intervals = []
+        for class_idx in range(n_classes):
+            sorted_scores = np.sort(bootstrapped_scores[class_idx])
+            confidence_lower = sorted_scores[int((1 - ci) * len(sorted_scores))]
+            confidence_upper = sorted_scores[int(ci * len(sorted_scores))]
+            confidence_intervals.append((confidence_lower, confidence_upper))
+            if verbose:
+                print(f"Class {class_idx} - Confidence interval: [{confidence_lower:.3f} - {confidence_upper:.3f}]")
+        return confidence_intervals
 def plot_roc_curve(
@@ -1517,7 +1725,7 @@ def plot_pr_binary(
     pr_boundary = interp1d(recall, precision, kind="linear", fill_value="extrapolate")
     for f_score in f_scores:
-        x_vals = np.linspace(0.01, 1, 10000)
+        x_vals = np.linspace(0.01, 1, 20000)
         y_vals = f_score * x_vals / (2 * x_vals - f_score)
         y_vals_clipped = np.minimum(y_vals, pr_boundary(x_vals))
         y_vals_clipped = np.clip(y_vals_clipped, 1e-3, None)  # Prevent going to zero
@@ -1553,7 +1761,7 @@ def plot_pr_binary(
 def plot_cm(
     cm,
     labels_name=None,
-    thresh=0.8,
+    thresh=0.8, # for set color
     axis_labels=None,
     cmap="Reds",
     normalize=True,
@@ -2029,11 +2237,21 @@ def predict(
     if isinstance(y_train, str) and y_train in x_train.columns:
         y_train_col_name = y_train
         y_train = x_train[y_train]
-        y_train = ips.df_encoder(pd.DataFrame(y_train), method="dummy")
+        # y_train = ips.df_encoder(pd.DataFrame(y_train), method="dummy")
         x_train = x_train.drop(y_train_col_name, axis=1)
+    # else:
+    #     y_train = ips.df_encoder(pd.DataFrame(y_train), method="dummy").values.ravel()
+    y_train=pd.DataFrame(y_train)
+    if y_train.select_dtypes(include=np.number).empty:
+        y_train_=ips.df_encoder(y_train, method="dummy",drop=None)
+        is_binary = False if y_train_.shape[1] >2 else True
     else:
-        y_train = ips.df_encoder(pd.DataFrame(y_train), method="dummy").values.ravel()
+        y_train_=ips.flatten(y_train.values)
+        is_binary = False if len(y_train_)>2 else True
+    if is_binary:
+        y_train = ips.df_encoder(pd.DataFrame(y_train), method="label")
+    print('is_binary:',is_binary)
     if x_true is None:
         x_train, x_true, y_train, y_true = train_test_split(
             x_train,
@@ -2042,23 +2260,27 @@ def predict(
             random_state=random_state,
             stratify=y_train if purpose == "classification" else None,
         )
         if isinstance(y_train, str) and y_train in x_train.columns:
             y_train_col_name = y_train
             y_train = x_train[y_train]
-            y_train = ips.df_encoder(pd.DataFrame(y_train), method="dummy")
+            y_train = ips.df_encoder(pd.DataFrame(y_train), method="label") if is_binary else y_train
             x_train = x_train.drop(y_train_col_name, axis=1)
-        else:
+        if is_binary:
             y_train = ips.df_encoder(
-                pd.DataFrame(y_train), method="dummy"
-            ).values.ravel()
+                pd.DataFrame(y_train), method="label"
+            ).values.ravel()
     if y_true is not None:
         if isinstance(y_true, str) and y_true in x_true.columns:
             y_true_col_name = y_true
             y_true = x_true[y_true]
-            y_true = ips.df_encoder(pd.DataFrame(y_true), method="dummy")
+            y_true = ips.df_encoder(pd.DataFrame(y_true), method="label") if is_binary else y_true
+            y_true =  pd.DataFrame(y_true)
             x_true = x_true.drop(y_true_col_name, axis=1)
-        else:
-            y_true = ips.df_encoder(pd.DataFrame(y_true), method="dummy").values.ravel()
+        if is_binary:
+            y_true = ips.df_encoder(pd.DataFrame(y_true), method="label").values.ravel()
+            y_true =  pd.DataFrame(y_true)
     # to convert the 2D to 1D: 2D column-vector format (like [[1], [0], [1], ...]) instead of a 1D array ([1, 0, 1, ...]
@@ -2068,7 +2290,6 @@ def predict(
         y_train.ravel() if isinstance(y_train, np.ndarray) else y_train.values.ravel()
     )
     y_true = y_true.ravel() if isinstance(y_true, np.ndarray) else y_true.values.ravel()
     # Ensure common features are selected
     if common_features is not None:
         x_train, x_true = x_train[common_features], x_true[common_features]
@@ -2077,10 +2298,7 @@ def predict(
         x_train, x_true = x_train[share_col_names], x_true[share_col_names]
     x_train, x_true = ips.df_scaler(x_train), ips.df_scaler(x_true)
-    x_train, x_true = ips.df_encoder(x_train, method="dummy"), ips.df_encoder(
-        x_true, method="dummy"
-    )
+    x_train, x_true = ips.df_encoder(x_train, method="dummy"), ips.df_encoder(x_true, method="dummy")
     # Handle class imbalance using SMOTE (only for classification)
     if (
         smote
@@ -2091,7 +2309,13 @@ def predict(
         smote_sampler = SMOTE(random_state=random_state)
         x_train, y_train = smote_sampler.fit_resample(x_train, y_train)
+    if not is_binary:
+        if isinstance(y_train, np.ndarray):
+            y_train = ips.df_encoder(data=pd.DataFrame(y_train),method='label')
+            y_train=np.asarray(y_train)
+        if isinstance(y_train, np.ndarray):
+            y_true = ips.df_encoder(data=pd.DataFrame(y_true),method='label')
+            y_true=np.asarray(y_true)
     # Hyperparameter grids for tuning
     if cv_level in ["low", "simple", "s", "l"]:
         param_grids = {
@@ -2670,95 +2894,181 @@ def predict(
             print(f"\nTraining and validating {name}:")
         # Grid search with KFold or StratifiedKFold
-        gs = GridSearchCV(
-            clf,
-            param_grid=param_grids.get(name, {}),
-            scoring=(
-                "roc_auc" if purpose == "classification" else "neg_mean_squared_error"
-            ),
-            cv=cv,
-            n_jobs=n_jobs,
-            verbose=verbose,
-        )
-        gs.fit(x_train, y_train)
-        best_clf = gs.best_estimator_
-        # make sure x_train and x_test has the same name
-        x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
-        y_pred = best_clf.predict(x_true)
-        # y_pred_proba
-        if hasattr(best_clf, "predict_proba"):
-            y_pred_proba = best_clf.predict_proba(x_true)[:, 1]
-        elif hasattr(best_clf, "decision_function"):
-            # If predict_proba is not available, use decision_function (e.g., for SVM)
-            y_pred_proba = best_clf.decision_function(x_true)
-            # Ensure y_pred_proba is within 0 and 1 bounds
-            y_pred_proba = (y_pred_proba - y_pred_proba.min()) / (
-                y_pred_proba.max() - y_pred_proba.min()
+        if is_binary:
+            gs = GridSearchCV(
+                clf,
+                param_grid=param_grids.get(name, {}),
+                scoring=(
+                    "roc_auc" if purpose == "classification" else "neg_mean_squared_error"
+                ),
+                cv=cv,
+                n_jobs=n_jobs,
+                verbose=verbose,
             )
+            gs.fit(x_train, y_train)
+            best_clf = gs.best_estimator_
+            # make sure x_train and x_test has the same name
+            x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
+            y_pred = best_clf.predict(x_true)
+            if hasattr(best_clf, "predict_proba"):
+                y_pred_proba = best_clf.predict_proba(x_true)
+                print("Shape of predicted probabilities:", y_pred_proba.shape)
+                if y_pred_proba.shape[1] == 1:
+                    y_pred_proba = np.hstack([1 - y_pred_proba, y_pred_proba])  # Add missing class probabilities
+                y_pred_proba = y_pred_proba[:, 1]
+            elif hasattr(best_clf, "decision_function"):
+                # If predict_proba is not available, use decision_function (e.g., for SVM)
+                y_pred_proba = best_clf.decision_function(x_true)
+                # Ensure y_pred_proba is within 0 and 1 bounds
+                y_pred_proba = (y_pred_proba - y_pred_proba.min()) / (
+                    y_pred_proba.max() - y_pred_proba.min()
+                )
+            else:
+                y_pred_proba = None  # No probability output for certain models
         else:
-            y_pred_proba = None  # No probability output for certain models
+            gs = GridSearchCV(
+                clf,
+                param_grid=param_grids.get(name, {}),
+                scoring=(
+                    "roc_auc_ovr" if purpose == "classification" else "neg_mean_squared_error"
+                ),
+                cv=cv,
+                n_jobs=n_jobs,
+                verbose=verbose,
+            )
+            # Fit GridSearchCV
+            gs.fit(x_train, y_train)
+            best_clf = gs.best_estimator_
+            # Ensure x_true aligns with x_train columns
+            x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
+            y_pred = best_clf.predict(x_true)
+            # Handle prediction probabilities for multiclass
+            if hasattr(best_clf, "predict_proba"):
+                y_pred_proba = best_clf.predict_proba(x_true)
+            elif hasattr(best_clf, "decision_function"):
+                y_pred_proba = best_clf.decision_function(x_true)
+                # Normalize for multiclass if necessary
+                if y_pred_proba.ndim == 2:
+                    y_pred_proba = (y_pred_proba - y_pred_proba.min(axis=1, keepdims=True)) / \
+                                (y_pred_proba.max(axis=1, keepdims=True) - y_pred_proba.min(axis=1, keepdims=True))
+            else:
+                y_pred_proba = None  # No probability output for certain models
         validation_scores = {}
-        if y_true is not None:
+        if y_true is not None and y_pred_proba is not None:
             validation_scores = cal_metrics(
                 y_true,
                 y_pred,
                 y_pred_proba=y_pred_proba,
+                is_binary=is_binary,
                 purpose=purpose,
                 average="weighted",
             )
-            # Calculate ROC curve
-            # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
-            if y_pred_proba is not None:
-                # fpr, tpr, roc_auc = dict(), dict(), dict()
-                fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
-                lower_ci, upper_ci = cal_auc_ci(y_true, y_pred_proba, verbose=False)
-                roc_auc = auc(fpr, tpr)
-                roc_info = {
-                    "fpr": fpr.tolist(),
-                    "tpr": tpr.tolist(),
-                    "auc": roc_auc,
-                    "ci95": (lower_ci, upper_ci),
-                }
-                # precision-recall curve
-                precision_, recall_, _ = precision_recall_curve(y_true, y_pred_proba)
-                avg_precision_ = average_precision_score(y_true, y_pred_proba)
-                pr_info = {
-                    "precision": precision_,
-                    "recall": recall_,
-                    "avg_precision": avg_precision_,
-                }
-            else:
-                roc_info, pr_info = None, None
-            if purpose == "classification":
-                results[name] = {
-                    "best_clf": gs.best_estimator_,
-                    "best_params": gs.best_params_,
-                    "auc_indiv": [
-                        gs.cv_results_[f"split{i}_test_score"][gs.best_index_]
-                        for i in range(cv_folds)
-                    ],
-                    "scores": validation_scores,
-                    "roc_curve": roc_info,
-                    "pr_curve": pr_info,
-                    "confusion_matrix": confusion_matrix(y_true, y_pred),
-                    "predictions": y_pred.tolist(),
-                    "predictions_proba": (
-                        y_pred_proba.tolist() if y_pred_proba is not None else None
-                    ),
-                }
-            else:  # "regression"
-                results[name] = {
-                    "best_clf": gs.best_estimator_,
-                    "best_params": gs.best_params_,
-                    "scores": validation_scores,  # e.g., neg_MSE, R², etc.
-                    "predictions": y_pred.tolist(),
-                    "predictions_proba": (
-                        y_pred_proba.tolist() if y_pred_proba is not None else None
-                    ),
-                }
+            if is_binary:
+                # Calculate ROC curve
+                # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
+                if y_pred_proba is not None:
+                    # fpr, tpr, roc_auc = dict(), dict(), dict()
+                    fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
+                    lower_ci, upper_ci = cal_auc_ci(y_true, y_pred_proba, verbose=False,is_binary=is_binary)
+                    roc_auc = auc(fpr, tpr)
+                    roc_info = {
+                        "fpr": fpr.tolist(),
+                        "tpr": tpr.tolist(),
+                        "auc": roc_auc,
+                        "ci95": (lower_ci, upper_ci),
+                    }
+                    # precision-recall curve
+                    precision_, recall_, _ = cal_precision_recall(y_true, y_pred_proba)
+                    avg_precision_ = average_precision_score(y_true, y_pred_proba)
+                    pr_info = {
+                        "precision": precision_,
+                        "recall": recall_,
+                        "avg_precision": avg_precision_,
+                    }
+                else:
+                    roc_info, pr_info = None, None
+                if purpose == "classification":
+                    results[name] = {
+                        "best_clf": gs.best_estimator_,
+                        "best_params": gs.best_params_,
+                        "auc_indiv": [
+                            gs.cv_results_[f"split{i}_test_score"][gs.best_index_]
+                            for i in range(cv_folds)
+                        ],
+                        "scores": validation_scores,
+                        "roc_curve": roc_info,
+                        "pr_curve": pr_info,
+                        "confusion_matrix": confusion_matrix(y_true, y_pred),
+                        "predictions": y_pred.tolist(),
+                        "predictions_proba": (
+                            y_pred_proba.tolist() if y_pred_proba is not None else None
+                        ),
+                    }
+                else:  # "regression"
+                    results[name] = {
+                        "best_clf": gs.best_estimator_,
+                        "best_params": gs.best_params_,
+                        "scores": validation_scores,  # e.g., neg_MSE, R², etc.
+                        "predictions": y_pred.tolist(),
+                        "predictions_proba": (
+                            y_pred_proba.tolist() if y_pred_proba is not None else None
+                        ),
+                    }
+            else: # multi-classes
+                if y_pred_proba is not None:
+                    # fpr, tpr, roc_auc = dict(), dict(), dict()
+                    # fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
+                    confidence_intervals = cal_auc_ci(y_true, y_pred_proba, verbose=False,is_binary=is_binary)
+                    roc_info = {
+                        "fpr": validation_scores["fpr"],
+                        "tpr": validation_scores["tpr"],
+                        "auc": validation_scores["roc_auc_by_class"],
+                        "ci95": confidence_intervals,
+                    }
+                    # precision-recall curve
+                    precision_, recall_, avg_precision_ = cal_precision_recall(y_true, y_pred_proba,is_binary=is_binary)
+                    pr_info = {
+                        "precision": precision_,
+                        "recall": recall_,
+                        "avg_precision": avg_precision_,
+                    }
+                else:
+                    roc_info, pr_info = None, None
+                if purpose == "classification":
+                    results[name] = {
+                        "best_clf": gs.best_estimator_,
+                        "best_params": gs.best_params_,
+                        "auc_indiv": [
+                            gs.cv_results_[f"split{i}_test_score"][gs.best_index_]
+                            for i in range(cv_folds)
+                        ],
+                        "scores": validation_scores,
+                        "roc_curve": roc_info,
+                        "pr_curve": pr_info,
+                        "confusion_matrix": confusion_matrix(y_true, y_pred),
+                        "predictions": y_pred.tolist(),
+                        "predictions_proba": (
+                            y_pred_proba.tolist() if y_pred_proba is not None else None
+                        ),
+                    }
+                else:  # "regression"
+                    results[name] = {
+                        "best_clf": gs.best_estimator_,
+                        "best_params": gs.best_params_,
+                        "scores": validation_scores,  # e.g., neg_MSE, R², etc.
+                        "predictions": y_pred.tolist(),
+                        "predictions_proba": (
+                            y_pred_proba.tolist() if y_pred_proba is not None else None
+                        ),
+                    }
         else:
             results[name] = {
@@ -2773,7 +3083,6 @@ def predict(
     # Convert results to DataFrame
     df_results = pd.DataFrame.from_dict(results, orient="index")
     # sort
     if y_true is not None and purpose == "classification":
         df_scores = pd.DataFrame(
@@ -2790,26 +3099,29 @@ def predict(
             plot.figsets(xangle=30)
             if dir_save:
                 ips.figsave(dir_save + f"scores_sorted_heatmap{now_}.pdf")
+            df_scores=df_scores.select_dtypes(include=np.number)
             if df_scores.shape[0] > 1:  # draw cluster
                 plot.heatmap(df_scores, kind="direct", cluster=True)
                 plot.figsets(xangle=30)
                 if dir_save:
                     ips.figsave(dir_save + f"scores_clus{now_}.pdf")
     if all([plot_, y_true is not None, purpose == "classification"]):
-        try:
-            if len(models) > 3:
-                plot_validate_features(df_results)
-            else:
-                plot_validate_features_single(df_results, figsize=(12, 4 * len(models)))
-            if dir_save:
-                ips.figsave(dir_save + f"validate_features{now_}.pdf")
-        except Exception as e:
-            print(f"Error: 在画图的过程中出现了问题:{e}")
+        # try:
+        if len(models) > 3:
+            plot_validate_features(df_results,is_binary=is_binary)
+        else:
+            plot_validate_features_single(df_results, is_binary=is_binary)
+        if dir_save:
+            ips.figsave(dir_save + f"validate_features{now_}.pdf")
+        # except Exception as e:
+        #     print(f"Error: 在画图的过程中出现了问题:{e}")
     return df_results
 def cal_metrics(
-    y_true, y_pred, y_pred_proba=None, purpose="regression", average="weighted"
+    y_true, y_pred, y_pred_proba=None, is_binary=True,purpose="regression", average="weighted"
 ):
     """
     Calculate regression or classification metrics based on the purpose.
@@ -2879,19 +3191,362 @@ def cal_metrics(
         }
         # Confusion matrix to calculate specificity
-        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
-        validation_scores["specificity"] = (
-            tn / (tn + fp) if (tn + fp) > 0 else 0
-        )  # Specificity calculation
+        if is_binary:
+            cm = confusion_matrix(y_true, y_pred)
+            if cm.size == 4:
+                tn, fp, fn, tp = cm.ravel()
+            else:
+                # Handle single-class predictions
+                tn, fp, fn, tp = 0, 0, 0, 0
+                print("Warning: Only one class found in y_pred or y_true.")
+            # Specificity calculation
+            validation_scores["specificity"] = (
+                tn / (tn + fp) if (tn + fp) > 0 else 0
+            )
+            if y_pred_proba is not None:
+                # Calculate ROC-AUC
+                validation_scores["roc_auc"] = roc_auc_score(y_true, y_pred_proba)
+                # PR-AUC (Precision-Recall AUC) calculation
+                validation_scores["pr_auc"] = average_precision_score(y_true, y_pred_proba)
+        else: # multi-class
+            from sklearn.preprocessing import label_binarize
+            #* Multi-class ROC calculation
+            y_pred_proba = np.asarray(y_pred_proba)
+            classes = np.unique(y_true)
+            y_true_bin = label_binarize(y_true, classes=classes)
+            if isinstance(y_true, np.ndarray):
+                y_true = ips.df_encoder(data=pd.DataFrame(y_true), method='dum',prefix='Label')
+            # Initialize dictionaries to store FPR, TPR, and AUC for each class
+            fpr = dict()
+            tpr = dict()
+            roc_auc = dict()
+            for i, class_label in enumerate(classes):
+                fpr[class_label], tpr[class_label], _ = roc_curve(y_true_bin[:, i], y_pred_proba[:, i])
+                roc_auc[class_label] = auc(fpr[class_label], tpr[class_label])
+            # Store the mean ROC AUC
+            try:
+                validation_scores["roc_auc"] = roc_auc_score(
+                    y_true, y_pred_proba, multi_class="ovr", average=average
+                )
+            except Exception as e:
+                y_pred_proba = y_pred_proba / y_pred_proba.sum(axis=1, keepdims=True)
+                validation_scores["roc_auc"] = roc_auc_score(
+                    y_true, y_pred_proba, multi_class="ovr", average=average
+                )
+            validation_scores["roc_auc_by_class"] = roc_auc  # Individual class AUCs
+            validation_scores["fpr"] = fpr
+            validation_scores["tpr"] = tpr
-        if y_pred_proba is not None:
-            # Calculate ROC-AUC
-            validation_scores["roc_auc"] = roc_auc_score(y_true, y_pred_proba)
-            # PR-AUC (Precision-Recall AUC) calculation
-            validation_scores["pr_auc"] = average_precision_score(y_true, y_pred_proba)
     else:
         raise ValueError(
             "Invalid purpose specified. Choose 'regression' or 'classification'."
         )
     return validation_scores
+def plot_trees(
+    X, y, cls, max_trees=500, test_size=0.2, random_state=42, early_stopping_rounds=None
+):
+    """
+    Plot error rates (OOB, training, and testing) of a tree-based ensemble classifier
+    while the number of trees grows from 1 to `max_trees`.
+    The data is split once with `train_test_split`; `cls` is then re-configured through
+    `set_params` and refit for every tree count, so the estimator passed by the caller
+    is modified in place.
+    Parameters:
+    - X (array-like): Feature matrix.
+    - y (array-like): Target labels.
+    - cls (object): Tree-based ensemble classifier instance (e.g., RandomForestClassifier()).
+    - max_trees (int): Maximum number of trees to evaluate. Default is 500.
+    - test_size (float): Proportion of data to use as test set for testing error. Default is 0.2.
+    - random_state (int): Random state for reproducibility. Default is 42.
+    - early_stopping_rounds (int): For AdaBoost/GradientBoosting only. Training stops once the
+      error on the held-out split has not dropped below its previous best during this many
+      consecutive rounds. The held-out test split doubles as the validation set.
+    Returns:
+    - None (the figure is displayed with `plt.show()`).
+    Example usage:
+        X = np.random.rand(100, 10)  # Example data with 100 samples and 10 features
+        y = np.random.randint(0, 2, 100)  # Example binary target
+        # Random Forest example
+        plot_trees(X, y, RandomForestClassifier(), max_trees=100)
+        # Gradient Boosting with early stopping example
+        plot_trees(X, y, GradientBoostingClassifier(), max_trees=100, early_stopping_rounds=10)
+        # Extra Trees example
+        plot_trees(X, y, ExtraTreesClassifier(), max_trees=100)
+    """
+    from sklearn.model_selection import train_test_split
+    from sklearn.metrics import accuracy_score
+    from sklearn.ensemble import (
+        RandomForestClassifier,
+        BaggingClassifier,
+        ExtraTreesClassifier,
+    )
+    from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
+    # Split data for training and testing error calculation
+    x_train, x_test, y_train, y_test = train_test_split(
+        X, y, test_size=test_size, random_state=random_state
+    )
+    # Error rates collected per tree count
+    oob_error_rate = []
+    train_error_rate = []
+    test_error_rate = []
+    validation_error = []
+    is_boosting = isinstance(cls, (AdaBoostClassifier, GradientBoostingClassifier))
+    # Validation tracking (and early stopping) only applies to boosting models
+    track_validation = bool(early_stopping_rounds) and is_boosting
+    # Configure classifier based on type
+    oob_enabled = False  # Default to no OOB error unless explicitly set
+    if isinstance(cls, (RandomForestClassifier, ExtraTreesClassifier)):
+        # warm_start lets each fit add trees to the already-fitted ensemble
+        cls.set_params(warm_start=True, n_estimators=1)
+        if hasattr(cls, "oob_score"):
+            # OOB estimates require bootstrapped samples
+            cls.set_params(bootstrap=True, oob_score=True)
+            oob_enabled = True
+    elif isinstance(cls, BaggingClassifier):
+        cls.set_params(warm_start=True, bootstrap=True, oob_score=True, n_estimators=1)
+        oob_enabled = True
+    elif is_boosting:
+        cls.set_params(n_estimators=1)
+    # Train and evaluate with an increasing number of trees
+    for i in range(1, max_trees + 1):
+        cls.set_params(n_estimators=i)
+        cls.fit(x_train, y_train)
+        # Calculate OOB error (for models that support it)
+        if oob_enabled and hasattr(cls, "oob_score_") and cls.oob_score:
+            oob_error_rate.append(1 - cls.oob_score_)
+        # Calculate training error
+        train_error_rate.append(1 - accuracy_score(y_train, cls.predict(x_train)))
+        # Calculate testing error
+        test_error = 1 - accuracy_score(y_test, cls.predict(x_test))
+        test_error_rate.append(test_error)
+        if track_validation:
+            # The validation error is measured on the same held-out split as the
+            # test error, so the value is reused instead of predicting twice.
+            validation_error.append(test_error)
+            if len(validation_error) > early_stopping_rounds:
+                # Patience check: stop when none of the last `early_stopping_rounds`
+                # errors beats the best error seen before that window.
+                best_before = min(validation_error[:-early_stopping_rounds])
+                best_recent = min(validation_error[-early_stopping_rounds:])
+                if best_recent >= best_before:
+                    print(f"Early stopping at tree {i} due to lack of improvement in validation error.")
+                    break
+    # Plot results: one curve per non-empty error series
+    curves = [
+        (oob_error_rate, {"color": "black", "label": "OOB Error Rate", "linewidth": 2}),
+        (
+            train_error_rate,
+            {"linestyle": "dotted", "color": "green", "label": "Training Error Rate"},
+        ),
+        (
+            test_error_rate,
+            {"linestyle": "dashed", "color": "red", "label": "Testing Error Rate"},
+        ),
+        (
+            validation_error,
+            {"linestyle": "solid", "color": "blue", "label": "Validation Error (Boosting)"},
+        ),
+    ]
+    plt.figure(figsize=(10, 6))
+    for errors, style in curves:
+        if errors:
+            plt.plot(range(1, len(errors) + 1), errors, **style)
+    # Customize plot
+    plt.xlabel("Number of Trees")
+    plt.ylabel("Error Rate")
+    plt.title(f"Error Rate Analysis for {cls.__class__.__name__}")
+    plt.legend(loc="upper right")
+    plt.grid(True)
+    plt.show()
+def img_datasets_preprocessing(
+    data: pd.DataFrame,
+    x_col: str,
+    y_col: str=None,
+    target_size: tuple = (224, 224),
+    batch_size: int = 128,
+    class_mode: str = "raw",
+    shuffle: bool = False,
+    augment: bool = False,
+    scaler: str = 'normalize',  # 'normalize', 'standardize', 'clahe', 'raw'
+    grayscale: bool = False,
+    encoder: str = "label",  # Options: 'label', 'onehot', 'binary'
+    label_encoder=None,
+    kws_augmentation: dict = None,
+    verbose: bool = True,
+    drop_missing: bool = True,
+    output="df",  # "iterator":data_iterator,'df':return DataFrame
+):
+    """
+    Load the images listed in a DataFrame and return them either as a Keras
+    DataFrameIterator or as a DataFrame of flattened pixels.
+    Parameters:
+    - data (pd.DataFrame): Input DataFrame with image paths and (optionally) labels.
+      The caller's frame is not modified; label encoding is applied to a copy.
+    - x_col (str): Column in `data` containing image file paths.
+    - y_col (str): Column in `data` containing image labels. When None, `class_mode`
+      is forced to None and no 'Label' column is produced.
+    - target_size (tuple): Desired image size in (height, width).
+    - batch_size (int): Number of images per batch.
+    - class_mode (str): Mode of label ('raw', 'categorical', 'binary').
+    - shuffle (bool): Shuffle the images in the DataFrame.
+    - augment (bool): Apply data augmentation.
+    - scaler (str): 'normalize' (rescale by 1/255 in the generator), 'standardize'
+      (per-image zero mean / unit std), 'clahe' (see `apply_clahe`), or 'raw'.
+    - grayscale (bool): Convert images to grayscale.
+    - encoder (str): Label encoder method ('label', 'onehot', 'binary').
+    - label_encoder: Optional pre-defined label encoder.
+    - kws_augmentation (dict): Parameters overriding the default augmentation settings.
+    - verbose (bool): Print status messages.
+    - drop_missing (bool): Drop rows with missing or invalid image paths.
+    - output (str): 'iterator'-like values return the Keras iterator, 'df'-like values
+      return a DataFrame (matched through `ips.strcmp`).
+    Returns:
+    - The DataFrameIterator, or a pd.DataFrame with one `pixel_<i>` column per
+      flattened pixel plus a 'Label' column when labels are available.
+    """
+    from tensorflow.keras.preprocessing.image import ImageDataGenerator
+    from tensorflow.keras.utils import to_categorical
+    from sklearn.preprocessing import LabelEncoder
+    import os
+    # Validate input DataFrame for required columns
+    if y_col:
+        assert (
+            x_col in data.columns and y_col in data.columns
+        ), "Missing required columns in DataFrame."
+    if y_col is None:
+        class_mode=None
+    # Resolve the requested output format to one of the known aliases
+    output = ips.strcmp(output,[
+        "generator","tf","iterator","transform","transformer","dataframe",
+         "df","pd","pandas"])[0]
+    # Handle missing file paths
+    if drop_missing:
+        data = data[
+            data[x_col].apply(lambda path: os.path.exists(path) and os.path.isfile(path))
+        ]
+    # Encoding labels if necessary
+    if encoder and y_col is not None:
+        # Work on a copy so neither the caller's frame nor a filtered slice is written to
+        data = data.copy()
+        if encoder == "binary":
+            data[y_col] = (data[y_col] == data[y_col].unique()[0]).astype(int)
+        elif encoder == "onehot":
+            if not label_encoder:
+                label_encoder = LabelEncoder()
+                data[y_col] = label_encoder.fit_transform(data[y_col])
+            # NOTE(review): to_categorical returns a 2-D array; assigning it to a single
+            # column presumably fails for more than one class -- confirm intended usage.
+            data[y_col] = to_categorical(data[y_col])
+        elif encoder == "label":
+            # NOTE(review): a caller-supplied label_encoder is not applied here; labels
+            # are assumed to be already encoded in that case -- confirm.
+            if not label_encoder:
+                label_encoder = LabelEncoder()
+                data[y_col] = label_encoder.fit_transform(data[y_col])
+    # Only 'normalize' is handled by the generator itself; other scalers run per image below
+    generator_kwargs = {"rescale": 1.0 / 255 if scaler == 'normalize' else None}
+    # Set up data augmentation
+    if augment:
+        aug_params = {
+            "rotation_range": 20,
+            "width_shift_range": 0.2,
+            "height_shift_range": 0.2,
+            "shear_range": 0.2,
+            "zoom_range": 0.2,
+            "horizontal_flip": True,
+            "fill_mode": "nearest",
+        }
+        if kws_augmentation:
+            aug_params.update(kws_augmentation)
+        generator_kwargs.update(aug_params)
+    dat = ImageDataGenerator(**generator_kwargs)
+    # Create DataFrameIterator
+    data_iterator = dat.flow_from_dataframe(
+        dataframe=data,
+        x_col=x_col,
+        y_col=y_col,
+        target_size=target_size,
+        color_mode="grayscale" if grayscale else "rgb",
+        batch_size=batch_size,
+        class_mode=class_mode,
+        shuffle=shuffle,
+    )
+    print(f"target_size:{target_size}")
+    if output.lower() in ["generator", "tf", "iterator", "transform", "transformer"]:
+        return data_iterator
+    elif output.lower() in ["dataframe", "df", "pd", "pandas"]:
+        # With class_mode=None the iterator yields image batches without labels
+        has_labels = class_mode is not None
+        # Initialize list to collect processed data
+        data_list = []
+        # The Keras iterator cycles forever, so consume exactly one pass:
+        # ceil(n / batch_size) batches (the last one may be partial).
+        total_batches = -(-data_iterator.n // batch_size)
+        for _ in range(total_batches):
+            batch = next(data_iterator)
+            if has_labels:
+                batch_images, batch_labels = batch
+            else:
+                batch_images, batch_labels = batch, [None] * len(batch)
+            for img, label in zip(batch_images, batch_labels):
+                if scaler == 'standardize':
+                    # Standardize by subtracting mean and dividing by std;
+                    # a constant image has std 0 and is only centered.
+                    std = np.std(img)
+                    centered = img - np.mean(img)
+                    img = centered / std if std > 0 else centered
+                elif scaler == 'clahe':
+                    # Apply CLAHE to the image
+                    img = apply_clahe(img)
+                # 'normalize' was already rescaled by the generator; 'raw' is left as loaded
+                flat_img = img.flatten()
+                data_list.append(np.append(flat_img, label) if has_labels else flat_img)
+        # Define column names for flattened image data
+        pixel_count = target_size[0] * target_size[1] * (1 if grayscale else 3)
+        column_names = [f"pixel_{i}" for i in range(pixel_count)]
+        if has_labels:
+            column_names.append("Label")
+        # Create DataFrame from flattened data
+        df_img = pd.DataFrame(data_list, columns=column_names)
+        if verbose:
+            print("Processed images:", len(df_img))
+            print("Final DataFrame shape:", df_img.shape)
+            # NOTE(review): `display` is assumed to be provided at module level
+            # (e.g. by IPython) -- confirm.
+            display(df_img.head())
+        return df_img
+# Function to apply CLAHE (Contrast Limited Adaptive Histogram Equalization)
+def apply_clahe(img):
+    """
+    Apply CLAHE (Contrast Limited Adaptive Histogram Equalization) to an image.
+    RGB images are equalized on the L channel of the LAB color space; single-channel
+    images (shape (H, W) or (H, W, 1)) are equalized directly.
+    Floating-point images are converted to uint8 for processing, because CLAHE only
+    accepts 8-/16-bit integer input, and are converted back to the original dtype
+    and value range afterwards. Floats with a maximum <= 1 are treated as [0, 1]
+    data, all other floats as [0, 255] data.
+    Parameters:
+    - img (np.ndarray): RGB or single-channel image.
+    Returns:
+    - np.ndarray: The contrast-enhanced image with the same shape as `img`.
+    """
+    import cv2
+    import numpy as np
+    img = np.asarray(img)
+    src_dtype = img.dtype
+    scale = None  # None means the image is passed to OpenCV unchanged
+    if np.issubdtype(src_dtype, np.floating):
+        as_float = img.astype(np.float32)
+        scale = 255.0 if as_float.size and as_float.max() <= 1.0 else 1.0
+        img = np.clip(np.rint(as_float * scale), 0, 255).astype(np.uint8)
+    clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
+    if img.ndim == 2 or img.shape[-1] == 1:
+        # Single channel: equalize the 2-D plane and restore the original shape
+        plane = np.ascontiguousarray(img.reshape(img.shape[:2]))
+        img_clahe = clahe.apply(plane).reshape(img.shape)
+    else:
+        lab = cv2.cvtColor(img, cv2.COLOR_RGB2LAB)  # Convert to LAB color space
+        l_chan, a_chan, b_chan = cv2.split(lab)  # Split into channels
+        cl = clahe.apply(l_chan)  # Apply CLAHE to the L channel
+        limg = cv2.merge((cl, a_chan, b_chan))  # Merge back the channels
+        img_clahe = cv2.cvtColor(limg, cv2.COLOR_LAB2RGB)  # Convert back to RGB
+    if scale is None:
+        return img_clahe
+    # Restore the caller's float dtype and value range
+    return (img_clahe.astype(np.float32) / scale).astype(src_dtype)

py2ls 0.2.4.14__py3-none-any.whl → 0.2.4.16__py3-none-any.whl

py2ls 0.2.4.14py3-none-any.whl → 0.2.4.16py3-none-any.whl