py2ls 0.2.4.10.3__py3-none-any.whl → 0.2.4.10.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/ml2ls.py
CHANGED
@@ -4,12 +4,13 @@ from sklearn.ensemble import (
     AdaBoostClassifier,
     BaggingClassifier,
 )
-from sklearn.svm import SVC,SVR
+from sklearn.svm import SVC, SVR
 from sklearn.calibration import CalibratedClassifierCV
 from sklearn.model_selection import GridSearchCV, StratifiedKFold
 from sklearn.linear_model import (
     LassoCV,
-    LogisticRegression,
+    LogisticRegression,
+    LinearRegression,
     Lasso,
     Ridge,
     RidgeClassifierCV,
@@ -47,7 +48,7 @@ from . import plot
 import matplotlib.pyplot as plt
 import seaborn as sns
 
-plt.style.use("paper")
+plt.style.use(str(get_cwd()) + "/data/styles/stylelib/paper.mplstyle")
 import logging
 import warnings
 
@@ -334,13 +335,13 @@ def features_naive_bayes(x_train: pd.DataFrame, y_train: pd.Series) -> list:
     probabilities = nb.predict_proba(x_train)
     # Limit the number of features safely, choosing the lesser of half the features or all columns
     n_features = min(x_train.shape[1] // 2, len(x_train.columns))
-
+
     # Sort probabilities, then map to valid column indices
     sorted_indices = np.argsort(probabilities.max(axis=1))[:n_features]
-
+
     # Ensure indices are within the column bounds of x_train
     valid_indices = sorted_indices[sorted_indices < len(x_train.columns)]
-
+
     return x_train.columns[valid_indices]
 
 
@@ -575,15 +576,28 @@ def get_features(
     bagging_params: Optional[Dict] = None,
     knn_params: Optional[Dict] = None,
     cls: list = [
-        "lasso",
-        "
+        "lasso",
+        "ridge",
+        "Elastic Net(Enet)",
+        "gradient Boosting",
+        "Random forest (rf)",
+        "XGBoost (xgb)",
+        "Support Vector Machine(svm)",
+        "naive bayes",
+        "Linear Discriminant Analysis (lda)",
+        "adaboost",
+        "DecisionTree",
+        "KNeighbors",
+        "Bagging",
+    ],
     metrics: Optional[List[str]] = None,
     cv_folds: int = 5,
     strict: bool = False,
     n_shared: int = 2,  # a feature counts as a common gene as soon as two methods agree on it
     use_selected_features: bool = True,
     plot_: bool = True,
-    dir_save:str="./"
+    dir_save: str = "./",
+) -> dict:
     """
     Master function to perform feature selection and validate models.
     """
@@ -598,14 +612,14 @@ def get_features(
 
     # fill na
     if fill_missing:
-        ips.df_fillna(data=X,method=
-    if isinstance(y, str) and y in X.columns:
-        y_col_name=y
-        y=X[y]
-        y=ips.df_encoder(pd.DataFrame(y),method=
-        X = X.drop(y_col_name,axis=1)
+        ips.df_fillna(data=X, method="knn", inplace=True, axis=0)
+    if isinstance(y, str) and y in X.columns:
+        y_col_name = y
+        y = X[y]
+        y = ips.df_encoder(pd.DataFrame(y), method="dummy")
+        X = X.drop(y_col_name, axis=1)
     else:
-        y=ips.df_encoder(pd.DataFrame(y),method=
+        y = ips.df_encoder(pd.DataFrame(y), method="dummy").values.ravel()
     y = y.loc[X.index]  # Align y with X after dropping rows with missing values in X
     y = y.ravel() if isinstance(y, np.ndarray) else y.values.ravel()
 
@@ -817,7 +831,7 @@ def get_features(
         top_knn_features,
         strict=strict,
         n_shared=n_shared,
-        verbose=False
+        verbose=False,
     )
 
     # Use selected features or all features for model validation
@@ -899,13 +913,14 @@ def get_features(
         results = {
             "selected_features": features_df,
             "cv_train_scores": cv_train_results_df,
-            "cv_test_scores": rank_models(cv_test_results_df,plot_=plot_),
+            "cv_test_scores": rank_models(cv_test_results_df, plot_=plot_),
             "common_features": list(common_features),
         }
-        if all([plot_,dir_save]):
+        if all([plot_, dir_save]):
             from datetime import datetime
+
             now_ = datetime.now().strftime("%y%m%d_%H%M%S")
-            ips.figsave(dir_save+f"features{now_}.pdf")
+            ips.figsave(dir_save + f"features{now_}.pdf")
     else:
         results = {
             "selected_features": pd.DataFrame(),
@@ -931,7 +946,7 @@ def validate_features(
     metrics: Optional[list] = None,
     random_state: int = 1,
     smote: bool = False,
-    n_jobs:int = -1,
+    n_jobs: int = -1,
     plot_: bool = True,
     class_weight: str = "balanced",
 ) -> dict:
@@ -952,8 +967,11 @@ def validate_features(
 
     """
     from tqdm import tqdm
+
     # Ensure common features are selected
-    common_features = ips.shared(
+    common_features = ips.shared(
+        common_features, x_train.columns, x_true.columns, strict=True, verbose=False
+    )
 
     # Filter the training and validation datasets for the common features
     x_train_selected = x_train[common_features]
@@ -1007,8 +1025,7 @@ def validate_features(
             l1_ratio=0.5,
             random_state=random_state,
         ),
-        "XGBoost": xgb.XGBClassifier(eval_metric="logloss"
-        ),
+        "XGBoost": xgb.XGBClassifier(eval_metric="logloss"),
         "Naive Bayes": GaussianNB(),
         "LDA": LinearDiscriminantAnalysis(),
     }
@@ -1078,11 +1095,11 @@ def validate_features(
 
     # Validate each classifier with GridSearchCV
     for name, clf in tqdm(
+        models.items(),
+        desc="for metric in metrics",
+        colour="green",
+        bar_format="{l_bar}{bar} {n_fmt}/{total_fmt}",
+    ):
         print(f"\nValidating {name} on the validation dataset:")
 
         # Check if `predict_proba` method exists; if not, use CalibratedClassifierCV
@@ -1162,7 +1179,7 @@ def validate_features(
         if y_pred_proba is not None:
             # fpr, tpr, roc_auc = dict(), dict(), dict()
             fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
-            lower_ci, upper_ci = cal_auc_ci(y_true, y_pred_proba,verbose=False)
+            lower_ci, upper_ci = cal_auc_ci(y_true, y_pred_proba, verbose=False)
             roc_auc = auc(fpr, tpr)
             roc_info = {
                 "fpr": fpr.tolist(),
@@ -1197,6 +1214,7 @@ def validate_features(
 # Validate models using the validation dataset (X_val, y_val)
 # validation_results = validate_features(X, y, X_val, y_val, common_features)
 
+
 # # If you want to access validation scores
 # print(validation_results)
 def plot_validate_features(res_val):
@@ -1204,47 +1222,75 @@ def plot_validate_features(res_val):
     plot the results of 'validate_features()'
     """
     colors = plot.get_color(len(ips.flatten(res_val["pr_curve"].index)))
-    if res_val.shape[0]>5:
-        alpha=0
-        figsize=[8,10]
-        subplot_layout=[1,2]
-        ncols=2
-        bbox_to_anchor=[1.5,0.6]
+    if res_val.shape[0] > 5:
+        alpha = 0
+        figsize = [8, 10]
+        subplot_layout = [1, 2]
+        ncols = 2
+        bbox_to_anchor = [1.5, 0.6]
     else:
-        alpha=0.03
-        figsize=[10,6]
-        subplot_layout=[1,1]
-        ncols=1
-        bbox_to_anchor=[1,1]
+        alpha = 0.03
+        figsize = [10, 6]
+        subplot_layout = [1, 1]
+        ncols = 1
+        bbox_to_anchor = [1, 1]
     nexttile = plot.subplot(figsize=figsize)
-    ax = nexttile(subplot_layout[0],subplot_layout[1])
+    ax = nexttile(subplot_layout[0], subplot_layout[1])
     for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
         fpr = res_val["roc_curve"][model_name]["fpr"]
         tpr = res_val["roc_curve"][model_name]["tpr"]
         (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
         mean_auc = res_val["roc_curve"][model_name]["auc"]
         plot_roc_curve(
-            fpr,
+            fpr,
+            tpr,
+            mean_auc,
+            lower_ci,
+            upper_ci,
+            model_name=model_name,
+            lw=1.5,
+            color=colors[i],
+            alpha=alpha,
+            ax=ax,
+        )
+    plot.figsets(
+        sp=2,
+        legend=dict(
+            loc="upper right",
+            ncols=ncols,
+            fontsize=8,
+            bbox_to_anchor=[1.5, 0.6],
+            markerscale=0.8,
+        ),
+    )
     # plot.split_legend(ax,n=2, loc=["upper left", "lower left"],bbox=[[1,0.5],[1,0.5]],ncols=2,labelcolor="k",fontsize=8)
 
-    ax = nexttile(subplot_layout[0],subplot_layout[1])
+    ax = nexttile(subplot_layout[0], subplot_layout[1])
     for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
         plot_pr_curve(
             recall=res_val["pr_curve"][model_name]["recall"],
             precision=res_val["pr_curve"][model_name]["precision"],
             avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
             model_name=model_name,
-            color=colors[i],
+            color=colors[i],
+            lw=1.5,
+            alpha=alpha,
+            ax=ax,
+        )
+    plot.figsets(
+        sp=2,
+        legend=dict(loc="upper right", ncols=1, fontsize=8, bbox_to_anchor=[1.5, 0.5]),
+    )
     # plot.split_legend(ax,n=2, loc=["upper left", "lower left"],bbox=[[1,0.5],[1,0.5]],ncols=2,labelcolor="k",fontsize=8)
-
-
+
+
+def plot_validate_features_single(res_val, figsize=None):
     if figsize is None:
         nexttile = plot.subplot(len(ips.flatten(res_val["pr_curve"].index)), 3)
     else:
-        nexttile = plot.subplot(
+        nexttile = plot.subplot(
+            len(ips.flatten(res_val["pr_curve"].index)), 3, figsize=figsize
+        )
     for model_name in ips.flatten(res_val["pr_curve"].index):
         fpr = res_val["roc_curve"][model_name]["fpr"]
         tpr = res_val["roc_curve"][model_name]["tpr"]
@@ -1268,7 +1314,10 @@ def plot_validate_features_single(res_val,figsize=None):
         plot_cm(res_val["confusion_matrix"][model_name], ax=nexttile(), normalize=False)
         plot.figsets(title=model_name, sp=2)
 
-
+
+def cal_auc_ci(
+    y_true, y_pred, n_bootstraps=1000, ci=0.95, random_state=1, verbose=True
+):
     y_true = np.asarray(y_true)
     y_pred = np.asarray(y_pred)
     bootstrapped_scores = []
@@ -1298,10 +1347,10 @@ def cal_auc_ci(y_true, y_pred, n_bootstraps=1000, ci=0.95, random_state=1,verbos
     confidence_upper = sorted_scores[int(ci * len(sorted_scores))]
     if verbose:
         print(
-
-
+            "Confidence interval for the score: [{:0.3f} - {:0.3}]".format(
+                confidence_lower, confidence_upper
+            )
         )
-        )
     return confidence_lower, confidence_upper
 
 
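For orientation, the confidence bounds printed above come from a percentile bootstrap over resampled AUCs. The hunk only shows the tail of cal_auc_ci, so the following is a minimal, self-contained sketch of that idea rather than the package's exact implementation (the resampling loop and the handling of single-class resamples are assumptions based on the signature and the percentile lines shown):

import numpy as np
from sklearn.metrics import roc_auc_score

def bootstrap_auc_ci(y_true, y_pred, n_bootstraps=1000, ci=0.95, random_state=1):
    # Resample (label, score) pairs with replacement and collect one AUC per resample.
    rng = np.random.RandomState(random_state)
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    scores = []
    for _ in range(n_bootstraps):
        idx = rng.randint(0, len(y_pred), len(y_pred))
        if len(np.unique(y_true[idx])) < 2:
            continue  # AUC is undefined when a resample contains only one class
        scores.append(roc_auc_score(y_true[idx], y_pred[idx]))
    sorted_scores = np.sort(scores)
    # Lower/upper percentile of the sorted bootstrap AUCs, mirroring the indexing shown above.
    lower = sorted_scores[int((1 - ci) * len(sorted_scores))]
    upper = sorted_scores[int(ci * len(sorted_scores))]
    return lower, upper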
@@ -1339,7 +1388,7 @@ def plot_roc_curve(
     # Plot ROC curve and the diagonal reference line
     ax.fill_between(fpr, tpr, alpha=alpha, color=color)
     ax.plot([0, 1], [0, 1], color=diagonal_color, clip_on=False, linestyle="--")
-    ax.plot(fpr, tpr, color=color, lw=lw, label=label,clip_on=False, **kwargs)
+    ax.plot(fpr, tpr, color=color, lw=lw, label=label, clip_on=False, **kwargs)
     # Setting plot limits, labels, and title
     ax.set_xlim([-0.01, 1.0])
     ax.set_ylim([0.0, 1.0])
@@ -1536,12 +1585,11 @@ def plot_cm(
         color=color,
         fontsize=fontsize,
     )
-
-    plot.figsets(ax=ax,
-                 boxloc="none"
-                 )
+
+    plot.figsets(ax=ax, boxloc="none")
     return ax
 
+
 def rank_models(
     cv_test_scores,
     rm_outlier=False,
@@ -1644,7 +1692,7 @@ def rank_models(
     if rm_outlier:
        cv_test_scores_ = ips.df_outlier(cv_test_scores)
     else:
-        cv_test_scores_=cv_test_scores
+        cv_test_scores_ = cv_test_scores
 
     # Normalize the scores of metrics if normalize is True
     scaler = MinMaxScaler()
@@ -1673,7 +1721,7 @@ def rank_models(
         )
         plt.title("Classifier Performance")
         plt.tight_layout()
-        return plt
+        return plt
 
     nexttile = plot.subplot(2, 2, figsize=[10, 7])
     generate_bar_plot(nexttile(), top_models.dropna())
@@ -1703,10 +1751,11 @@ def rank_models(
 
 # figsave("classifier_performance.pdf")
 
+
 def predict(
     x_train: pd.DataFrame,
     y_train: pd.Series,
-    x_true: pd.DataFrame=None,
+    x_true: pd.DataFrame = None,
     y_true: Optional[pd.Series] = None,
     common_features: set = None,
     purpose: str = "classification",  # 'classification' or 'regression'
@@ -1714,117 +1763,156 @@ def predict(
     metrics: Optional[List[str]] = None,
     random_state: int = 1,
     smote: bool = False,
-    n_jobs:int
+    n_jobs: int = -1,
     plot_: bool = True,
-    dir_save:str="./",
-    test_size:float=0.2
-    cv_folds:int=5
-    cv_level:str="l"
+    dir_save: str = "./",
+    test_size: float = 0.2,  # used only when x_true is None
+    cv_folds: int = 5,  # more cv_folds gives more stable results, though AUC may come out lower
+    cv_level: str = "l",  # "s":'low',"m":'medium',"l":"high"
     class_weight: str = "balanced",
-    verbose:bool=False,
+    verbose: bool = False,
 ) -> pd.DataFrame:
-    """
+    """
+    Case (1) is an internal split, case (2) is direct prediction, and case (3) is external validation.
+    Usage:
+        (1). predict(x_train, y_train, ...): split x_train into train/test sets and validate on the test split.
+            predict() splits an internal test set out of x_train and y_train according to test_size; the models are then trained on the remaining training split and validated on that test split.
+        (2). predict(x_train, y_train, x_true, ...): train on x_train and y_train and predict on x_true.
+            Because x_true is passed in, the split of x_train is skipped and all of x_train and y_train are used for training. x_true is then predicted, but since y_true is not provided,
+            the predictions cannot be compared against ground truth.
+        (3). predict(x_train, y_train, x_true, y_true, ...): train on x_train and y_train and validate x_true against the true labels y_true.
+            predict() trains on x_train and y_train and uses x_true as the test set. Because y_true is provided, the predictions can be compared against y_true, so
+            validation metrics are computed and x_true is genuinely validated.
+    trains and validates a variety of machine learning models for both classification and regression tasks.
+    It supports hyperparameter tuning with grid search and includes additional features like cross-validation,
+    feature scaling, and handling of class imbalance through SMOTE.
+
+    Parameters:
+    - x_train (pd.DataFrame):Training feature data, structured with each row as an observation and each column as a feature.
+    - y_train (pd.Series):Target variable for the training dataset.
+    - x_true (pd.DataFrame, optional):Test feature data. If not provided, the function splits x_train based on test_size.
+    - y_true (pd.Series, optional):Test target values. If not provided, y_train is split into training and testing sets.
+    - common_features (set, optional):Specifies a subset of features common across training and test data.
+    - purpose (str, default = "classification"):Defines whether the task is "classification" or "regression". Determines which
+        metrics and models are applied.
+    - cls (dict, optional):Dictionary to specify custom classifiers/regressors. Defaults to a set of common models if not provided.
+    - metrics (list, optional):List of evaluation metrics (like accuracy, F1 score) used for model evaluation.
+    - random_state (int, default = 1):Random seed to ensure reproducibility.
+    - smote (bool, default = False):Applies Synthetic Minority Oversampling Technique (SMOTE) to address class imbalance if enabled.
+    - n_jobs (int, default = -1):Number of parallel jobs for computation. Set to -1 to use all available cores.
+    - plot_ (bool, default = True):If True, generates plots of the model evaluation metrics.
+    - test_size (float, default = 0.2):Test data proportion if x_true is not provided.
+    - cv_folds (int, default = 5):Number of cross-validation folds.
+    - cv_level (str, default = "l"):Sets the detail level of cross-validation. "s" for low, "m" for medium, and "l" for high.
+    - class_weight (str, default = "balanced"):Balances class weights in classification tasks.
+    - verbose (bool, default = False):If True, prints detailed output during model training.
+    - dir_save (str, default = "./"):Directory path to save plot outputs and results.
+
+    Key Steps in the Function:
+        Model Initialization: Depending on purpose, initializes either classification or regression models.
+        Feature Selection: Ensures training and test sets have matching feature columns.
+        SMOTE Application: Balances classes if smote is enabled and the task is classification.
+        Cross-Validation and Hyperparameter Tuning: Utilizes GridSearchCV for model tuning based on cv_level.
+        Evaluation and Plotting: Outputs evaluation metrics like AUC, confusion matrices, and optional plotting of performance metrics.
     """
     from tqdm import tqdm
-    from sklearn.ensemble import
+    from sklearn.ensemble import (
+        RandomForestClassifier,
+        RandomForestRegressor,
+        ExtraTreesClassifier,
+        ExtraTreesRegressor,
+        BaggingClassifier,
+        BaggingRegressor,
+        AdaBoostClassifier,
+        AdaBoostRegressor,
+    )
     from sklearn.svm import SVC, SVR
     from sklearn.tree import DecisionTreeRegressor
-    from sklearn.linear_model import
+    from sklearn.linear_model import (
+        LogisticRegression,
+        ElasticNet,
+        ElasticNetCV,
+        LinearRegression,
+        Lasso,
+        RidgeClassifierCV,
+        Perceptron,
+        SGDClassifier,
+    )
+    from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
+    from sklearn.naive_bayes import GaussianNB, BernoulliNB
     from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
     import xgboost as xgb
     import lightgbm as lgb
     import catboost as cb
     from sklearn.neural_network import MLPClassifier, MLPRegressor
     from sklearn.model_selection import GridSearchCV, StratifiedKFold, KFold
-    from sklearn.discriminant_analysis import
+    from sklearn.discriminant_analysis import (
+        LinearDiscriminantAnalysis,
+        QuadraticDiscriminantAnalysis,
+    )
     from sklearn.preprocessing import PolynomialFeatures
 
-
     # spell check
-    purpose=ips.strcmp(purpose,[
+    purpose = ips.strcmp(purpose, ["classification", "regression"])[0]
     print(f"{purpose} processing...")
     # Default models or regressors if not provided
     if purpose == "classification":
         model_ = {
-            "Random Forest": RandomForestClassifier(
+            "Random Forest": RandomForestClassifier(
+                random_state=random_state, class_weight=class_weight
+            ),
+            # SVC (Support Vector Classification)
+            "SVM": SVC(
+                kernel="rbf",
+                probability=True,
+                class_weight=class_weight,
+                random_state=random_state,
+            ),
             # fit the best model without enforcing sparsity, which means it does not directly perform feature selection.
-            "Logistic Regression": LogisticRegression(
+            "Logistic Regression": LogisticRegression(
+                class_weight=class_weight, random_state=random_state
+            ),
             # Logistic Regression with L1 Regularization (Lasso)
-            "Lasso Logistic Regression": LogisticRegression(
+            "Lasso Logistic Regression": LogisticRegression(
+                penalty="l1", solver="saga", random_state=random_state
+            ),
             "Gradient Boosting": GradientBoostingClassifier(random_state=random_state),
-            "XGBoost": xgb.XGBClassifier(
+            "XGBoost": xgb.XGBClassifier(
+                eval_metric="logloss",
+                random_state=random_state,
+            ),
             "KNN": KNeighborsClassifier(n_neighbors=5),
             "Naive Bayes": GaussianNB(),
             "Linear Discriminant Analysis": LinearDiscriminantAnalysis(),
-            "AdaBoost":
+            "AdaBoost": AdaBoostClassifier(
+                algorithm="SAMME", random_state=random_state
+            ),
             # "LightGBM": lgb.LGBMClassifier(random_state=random_state, class_weight=class_weight),
             "CatBoost": cb.CatBoostClassifier(verbose=0, random_state=random_state),
-            "Extra Trees": ExtraTreesClassifier(
+            "Extra Trees": ExtraTreesClassifier(
+                random_state=random_state, class_weight=class_weight
+            ),
             "Bagging": BaggingClassifier(random_state=random_state),
             "Neural Network": MLPClassifier(max_iter=500, random_state=random_state),
             "DecisionTree": DecisionTreeClassifier(),
             "Quadratic Discriminant Analysis": QuadraticDiscriminantAnalysis(),
-            "Ridge": RidgeClassifierCV(
+            "Ridge": RidgeClassifierCV(
+                class_weight=class_weight, store_cv_results=True
+            ),
             "Perceptron": Perceptron(random_state=random_state),
             "Bernoulli Naive Bayes": BernoulliNB(),
-            "SGDClassifier": SGDClassifier(random_state=random_state),
+            "SGDClassifier": SGDClassifier(random_state=random_state),
         }
     elif purpose == "regression":
         model_ = {
             "Random Forest": RandomForestRegressor(random_state=random_state),
-            "SVM": SVR()
+            "SVM": SVR(),  # SVR (Support Vector Regression)
             # "Lasso": Lasso(random_state=random_state),  # same as LassoCV (but alpha must be provided),
-            "LassoCV": LassoCV(
+            "LassoCV": LassoCV(
+                cv=cv_folds, random_state=random_state
+            ),  # LassoCV finds the best alpha automatically, so it is preferable to Lasso
             "Gradient Boosting": GradientBoostingRegressor(random_state=random_state),
-            "XGBoost": xgb.XGBRegressor(eval_metric="rmse",random_state=random_state),
+            "XGBoost": xgb.XGBRegressor(eval_metric="rmse", random_state=random_state),
             "Linear Regression": LinearRegression(),
             "Lasso": Lasso(random_state=random_state),
             "AdaBoost": AdaBoostRegressor(random_state=random_state),
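The translated docstring above describes three calling patterns for predict(). A hedged usage sketch follows; the import path py2ls.ml2ls is inferred from the changed file's name, the synthetic data and column names are illustrative, and only keyword arguments visible in the signature above are used:

import pandas as pd
from sklearn.datasets import make_classification
from py2ls import ml2ls  # assumed module path, based on the file py2ls/ml2ls.py

X, y = make_classification(n_samples=200, n_features=10, random_state=1)
x_train = pd.DataFrame(X, columns=[f"f{i}" for i in range(10)])
y_train = pd.Series(y, name="label")
x_new = x_train.sample(40, random_state=1)   # stand-in for unseen samples
y_new = y_train.loc[x_new.index]

# (1) internal split: predict() holds out `test_size` of x_train and validates on it
res_split = ml2ls.predict(x_train, y_train, purpose="classification", cv_level="s", plot_=False)

# (2) direct prediction: no y_true, so predictions are not compared against ground truth
res_pred = ml2ls.predict(x_train, y_train, x_true=x_new, purpose="classification", cv_level="s", plot_=False)

# (3) external validation: providing y_true enables the validation metrics
res_val = ml2ls.predict(x_train, y_train, x_true=x_new, y_true=y_new, purpose="classification", cv_level="s", plot_=False)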
@@ -1834,71 +1922,76 @@ def predict(
             "Bagging": BaggingRegressor(random_state=random_state),
             "Neural Network": MLPRegressor(max_iter=500, random_state=random_state),
             "ElasticNet": ElasticNet(random_state=random_state),
-            "Ridge": Ridge(),
-            "KNN":KNeighborsRegressor()
+            "Ridge": Ridge(),
+            "KNN": KNeighborsRegressor(),
         }
-    # indicate cls:
-    if ips.run_once_within(30)
+    # indicate cls:
+    if ips.run_once_within(30):  # 10 min
         print(f"supported models: {list(model_.keys())}")
     if cls is None:
-        models=model_
+        models = model_
     else:
         if not isinstance(cls, list):
-            cls=[cls]
-        models={}
-        for cls_ in cls:
+            cls = [cls]
+        models = {}
+        for cls_ in cls:
             cls_ = ips.strcmp(cls_, list(model_.keys()))[0]
             models[cls_] = model_[cls_]
-    if
-    x_train=ips.df_special_characters_cleaner(x_train)
-    x_true=
+    if "LightGBM" in models:
+        x_train = ips.df_special_characters_cleaner(x_train)
+        x_true = (
+            ips.df_special_characters_cleaner(x_true) if x_true is not None else None
+        )
+
+    if isinstance(y_train, str) and y_train in x_train.columns:
+        y_train_col_name = y_train
+        y_train = x_train[y_train]
+        y_train = ips.df_encoder(pd.DataFrame(y_train), method="dummy")
+        x_train = x_train.drop(y_train_col_name, axis=1)
     else:
-        y_train=ips.df_encoder(pd.DataFrame(y_train),method=
+        y_train = ips.df_encoder(pd.DataFrame(y_train), method="dummy").values.ravel()
 
     if x_true is None:
         x_train, x_true, y_train, y_true = train_test_split(
-            x_train,
-            y_train,
-            test_size=test_size,
-            random_state=random_state,
-            stratify=y_train if purpose == "classification" else None
+            x_train,
+            y_train,
+            test_size=test_size,
+            random_state=random_state,
+            stratify=y_train if purpose == "classification" else None,
         )
-        if isinstance(y_train, str) and y_train in x_train.columns:
-            y_train_col_name=y_train
-            y_train=x_train[y_train]
-            y_train=ips.df_encoder(pd.DataFrame(y_train),method=
-            x_train = x_train.drop(y_train_col_name,axis=1)
+        if isinstance(y_train, str) and y_train in x_train.columns:
+            y_train_col_name = y_train
+            y_train = x_train[y_train]
+            y_train = ips.df_encoder(pd.DataFrame(y_train), method="dummy")
+            x_train = x_train.drop(y_train_col_name, axis=1)
         else:
-            y_train=ips.df_encoder(
+            y_train = ips.df_encoder(
+                pd.DataFrame(y_train), method="dummy"
+            ).values.ravel()
     if y_true is not None:
-        if isinstance(y_true, str) and y_true in x_true.columns:
-            y_true_col_name=y_true
-            y_true=x_true[y_true]
-            y_true=ips.df_encoder(pd.DataFrame(y_true),method=
-            x_true = x_true.drop(y_true_col_name,axis=1)
+        if isinstance(y_true, str) and y_true in x_true.columns:
+            y_true_col_name = y_true
+            y_true = x_true[y_true]
+            y_true = ips.df_encoder(pd.DataFrame(y_true), method="dummy")
+            x_true = x_true.drop(y_true_col_name, axis=1)
         else:
-            y_true=ips.df_encoder(pd.DataFrame(y_true),method=
+            y_true = ips.df_encoder(pd.DataFrame(y_true), method="dummy").values.ravel()
 
     # to convert the 2D to 1D: 2D column-vector format (like [[1], [0], [1], ...]) instead of a 1D array ([1, 0, 1, ...]
 
     # y_train=y_train.values.ravel() if y_train is not None else None
     # y_true=y_true.values.ravel() if y_true is not None else None
-    y_train =
+    y_train = (
+        y_train.ravel() if isinstance(y_train, np.ndarray) else y_train.values.ravel()
+    )
     y_true = y_true.ravel() if isinstance(y_true, np.ndarray) else y_true.values.ravel()
 
     # Ensure common features are selected
     if common_features is not None:
         x_train, x_true = x_train[common_features], x_true[common_features]
     else:
-        share_col_names = ips.shared(x_train.columns, x_true.columns,verbose=verbose)
-        x_train, x_true =x_train[share_col_names], x_true[share_col_names]
+        share_col_names = ips.shared(x_train.columns, x_true.columns, verbose=verbose)
+        x_train, x_true = x_train[share_col_names], x_true[share_col_names]
 
     x_train, x_true = ips.df_scaler(x_train), ips.df_scaler(x_true)
     x_train, x_true = ips.df_encoder(x_train, method="dummy"), ips.df_encoder(
@@ -1917,26 +2010,30 @@ def predict(
         x_train, y_train = smote_sampler.fit_resample(x_train, y_train)
 
     # Hyperparameter grids for tuning
-    if cv_level in ["low",
+    if cv_level in ["low", "simple", "s", "l"]:
         param_grids = {
-            "Random Forest":
-            "
+            "Random Forest": (
+                {
+                    "n_estimators": [100],  # One basic option
+                    "max_depth": [None, 10],
+                    "min_samples_split": [2],
+                    "min_samples_leaf": [1],
+                    "class_weight": [None],
+                }
+                if purpose == "classification"
+                else {
+                    "n_estimators": [100],  # One basic option
+                    "max_depth": [None, 10],
+                    "min_samples_split": [2],
+                    "min_samples_leaf": [1],
+                    "max_features": [None],
+                    "bootstrap": [True],  # Only one option for simplicity
+                }
+            ),
             "SVM": {
                 "C": [1],
-                "gamma": [
-                "kernel": [
+                "gamma": ["scale"],
+                "kernel": ["rbf"],
             },
             "Lasso": {
                 "alpha": [0.1],
@@ -1946,8 +2043,8 @@ def predict(
             },
             "Logistic Regression": {
                 "C": [1],
-                "solver": [
-                "penalty": [
+                "solver": ["lbfgs"],
+                "penalty": ["l2"],
                 "max_iter": [500],
             },
             "Gradient Boosting": {
@@ -1964,25 +2061,29 @@ def predict(
                 "subsample": [0.8],
                 "colsample_bytree": [0.8],
             },
-            "KNN":
+            "KNN": (
+                {
+                    "n_neighbors": [3],
+                    "weights": ["uniform"],
+                    "algorithm": ["auto"],
+                    "p": [2],
+                }
+                if purpose == "classification"
+                else {
+                    "n_neighbors": [3],
+                    "weights": ["uniform"],
+                    "metric": ["euclidean"],
+                    "leaf_size": [30],
+                    "p": [2],
+                }
+            ),
             "Naive Bayes": {
                 "var_smoothing": [1e-9],
             },
             "SVR": {
                 "C": [1],
-                "gamma": [
-                "kernel": [
+                "gamma": ["scale"],
+                "kernel": ["rbf"],
             },
             "Linear Regression": {
                 "fit_intercept": [True],
@@ -2003,9 +2104,9 @@ def predict(
                 "n_estimators": [100],
                 "num_leaves": [31],
                 "max_depth": [10],
+                "min_data_in_leaf": [20],
+                "min_gain_to_split": [0.01],
+                "scale_pos_weight": [10],
             },
             "Bagging": {
                 "n_estimators": [50],
@@ -2033,132 +2134,168 @@ def predict(
                 "shrinkage": [None],
             },
             "Quadratic Discriminant Analysis": {
-            },
-            "Ridge": {'class_weight': [None, 'balanced']} if purpose == "classification" else {
-                'alpha': [0.1, 1, 10],
+                "reg_param": [0.0],
+                "priors": [None],
+                "tol": [1e-4],
             },
+            "Ridge": (
+                {"class_weight": [None, "balanced"]}
+                if purpose == "classification"
+                else {
+                    "alpha": [0.1, 1, 10],
+                }
+            ),
             "Perceptron": {
+                "alpha": [1e-3],
+                "penalty": ["l2"],
+                "max_iter": [1000],
+                "eta0": [1.0],
             },
             "Bernoulli Naive Bayes": {
+                "alpha": [0.1, 1, 10],
+                "binarize": [0.0],
+                "fit_prior": [True],
             },
             "SGDClassifier": {
+                "eta0": [0.01],
+                "loss": ["hinge"],
+                "penalty": ["l2"],
+                "alpha": [1e-3],
+                "max_iter": [1000],
+                "tol": [1e-3],
+                "random_state": [random_state],
+                "learning_rate": ["constant"],
             },
         }
-    elif cv_level in [
-        param_grids = {
-            "Random Forest":
+    elif cv_level in ["high", "advanced", "h"]:
+        param_grids = {
+            "Random Forest": (
+                {
+                    "n_estimators": [100, 200, 500, 700, 1000],
+                    "max_depth": [None, 3, 5, 10, 15, 20, 30],
+                    "min_samples_split": [2, 5, 10, 20],
+                    "min_samples_leaf": [1, 2, 4],
+                    "class_weight": (
+                        [None, "balanced"] if purpose == "classification" else {}
+                    ),
+                }
+                if purpose == "classification"
+                else {
+                    "n_estimators": [100, 200, 500, 700, 1000],
+                    "max_depth": [None, 3, 5, 10, 15, 20, 30],
+                    "min_samples_split": [2, 5, 10, 20],
+                    "min_samples_leaf": [1, 2, 4],
+                    "max_features": [
+                        "auto",
+                        "sqrt",
+                        "log2",
+                    ],  # Number of features to consider when looking for the best split
+                    "bootstrap": [
+                        True,
+                        False,
+                    ],  # Whether bootstrap samples are used when building trees
+                }
+            ),
             "SVM": {
+                "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
+                "gamma": ["scale", "auto", 0.001, 0.01, 0.1],
+                "kernel": ["linear", "rbf", "poly"],
+            },
             "Logistic Regression": {
-            "Lasso":{
-            "LassoCV":{
+                "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
+                "solver": ["liblinear", "saga", "newton-cg", "lbfgs"],
+                "penalty": ["l1", "l2", "elasticnet"],
+                "max_iter": [100, 200, 300, 500],
+            },
+            "Lasso": {
+                "alpha": [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
+                "max_iter": [500, 1000, 2000, 5000],
+                "tol": [1e-4, 1e-5, 1e-6],
+                "selection": ["cyclic", "random"],
+            },
+            "LassoCV": {
+                "alphas": [[0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0]],
+                "max_iter": [500, 1000, 2000, 5000],
+                "cv": [3, 5, 10],
+                "tol": [1e-4, 1e-5, 1e-6],
+            },
             "Gradient Boosting": {
+                "n_estimators": [100, 200, 300, 400, 500, 700, 1000],
+                "learning_rate": [0.001, 0.01, 0.1, 0.2, 0.3, 0.5],
+                "max_depth": [3, 5, 7, 9, 15],
+                "min_samples_split": [2, 5, 10, 20],
+                "subsample": [0.8, 1.0],
+            },
             "XGBoost": {
-            "KNN":
+                "n_estimators": [100, 200, 500, 700],
+                "max_depth": [3, 5, 7, 10],
+                "learning_rate": [0.01, 0.1, 0.2, 0.3],
+                "subsample": [0.8, 1.0],
+                "colsample_bytree": [0.8, 0.9, 1.0],
+            },
+            "KNN": (
+                {
+                    "n_neighbors": [1, 3, 5, 10, 15, 20],
+                    "weights": ["uniform", "distance"],
+                    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
+                    "p": [1, 2],  # 1 for Manhattan, 2 for Euclidean distance
+                }
+                if purpose == "classification"
+                else {
+                    "n_neighbors": [3, 5, 7, 9, 11],  # Number of neighbors
+                    "weights": [
+                        "uniform",
+                        "distance",
+                    ],  # Weight function used in prediction
+                    "metric": [
+                        "euclidean",
+                        "manhattan",
+                        "minkowski",
+                    ],  # Distance metric
+                    "leaf_size": [
+                        20,
+                        30,
+                        40,
+                        50,
+                    ],  # Leaf size for KDTree or BallTree algorithms
+                    "p": [
+                        1,
+                        2,
+                    ],  # Power parameter for the Minkowski metric (1 = Manhattan, 2 = Euclidean)
+                }
+            ),
             "Naive Bayes": {
                 "var_smoothing": [1e-10, 1e-9, 1e-8, 1e-7],
+            },
             "AdaBoost": {
+                "n_estimators": [50, 100, 200, 300, 500],
+                "learning_rate": [0.001, 0.01, 0.1, 0.5, 1.0],
+            },
             "SVR": {
+                "C": [0.01, 0.1, 1, 10, 100, 1000],
+                "gamma": [0.001, 0.01, 0.1, "scale", "auto"],
+                "kernel": ["linear", "rbf", "poly"],
+            },
             "Linear Regression": {
                 "fit_intercept": [True, False],
-            "Lasso":{
+            },
+            "Lasso": {
                 "alpha": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
-                "max_iter": [1000, 2000]  # Higher iteration limit for fine-tuning
+                "max_iter": [1000, 2000],  # Higher iteration limit for fine-tuning
+            },
             "Extra Trees": {
                 "n_estimators": [100, 200, 500, 700, 1000],
                 "max_depth": [None, 5, 10, 15, 20, 30],
                 "min_samples_split": [2, 5, 10, 20],
-                "min_samples_leaf": [1, 2, 4]
+                "min_samples_leaf": [1, 2, 4],
+            },
             "CatBoost": {
                 "iterations": [100, 200, 500],
                 "learning_rate": [0.001, 0.01, 0.1, 0.2],
                 "depth": [3, 5, 7, 10],
                 "l2_leaf_reg": [1, 3, 5, 7, 10],
                 "border_count": [32, 64, 128],
+            },
             "LightGBM": {
                 "n_estimators": [100, 200, 500, 700, 1000],
                 "learning_rate": [0.001, 0.01, 0.1, 0.2],
@@ -2167,66 +2304,97 @@ def predict(
                 "min_child_samples": [5, 10, 20],
                 "subsample": [0.8, 1.0],
                 "colsample_bytree": [0.8, 0.9, 1.0],
+            },
             "Neural Network": {
                 "hidden_layer_sizes": [(50,), (100,), (100, 50), (200, 100)],
                 "activation": ["relu", "tanh", "logistic"],
                 "solver": ["adam", "sgd", "lbfgs"],
                 "alpha": [0.0001, 0.001, 0.01],
                 "learning_rate": ["constant", "adaptive"],
+            },
             "Decision Tree": {
                 "max_depth": [None, 5, 10, 20, 30],
                 "min_samples_split": [2, 5, 10, 20],
                 "min_samples_leaf": [1, 2, 5, 10],
                 "criterion": ["gini", "entropy"],
                 "splitter": ["best", "random"],
+            },
             "Linear Discriminant Analysis": {
                 "solver": ["svd", "lsqr", "eigen"],
-                "shrinkage": [
+                "shrinkage": [
+                    None,
+                    "auto",
+                    0.1,
+                    0.5,
+                    1.0,
+                ],  # shrinkage levels for 'lsqr' and 'eigen'
+            },
+            "Ridge": (
+                {"class_weight": [None, "balanced"]}
+                if purpose == "classification"
+                else {
+                    "alpha": [0.1, 1, 10, 100, 1000],
+                    "solver": ["auto", "svd", "cholesky", "lsqr", "lbfgs"],
+                    "fit_intercept": [
+                        True,
+                        False,
+                    ],  # Whether to calculate the intercept
+                    "normalize": [
+                        True,
+                        False,
+                    ],  # If True, the regressors X will be normalized
                 }
+            ),
+        }
+    else:  # median level
+        param_grids = {
+            "Random Forest": (
+                {
+                    "n_estimators": [100, 200, 500],
+                    "max_depth": [None, 10, 20, 30],
+                    "min_samples_split": [2, 5, 10],
+                    "min_samples_leaf": [1, 2, 4],
+                    "class_weight": [None, "balanced"],
+                }
+                if purpose == "classification"
+                else {
+                    "n_estimators": [100, 200, 500],
+                    "max_depth": [None, 10, 20, 30],
+                    "min_samples_split": [2, 5, 10],
+                    "min_samples_leaf": [1, 2, 4],
+                    "max_features": [
+                        "auto",
+                        "sqrt",
+                        "log2",
+                    ],  # Number of features to consider when looking for the best split
+                    "bootstrap": [
+                        True,
+                        False,
+                    ],  # Whether bootstrap samples are used when building trees
+                }
+            ),
             "SVM": {
                 "C": [0.1, 1, 10, 100],  # Regularization strength
-                "gamma": [
-                "kernel": [
+                "gamma": ["scale", "auto"],  # Common gamma values
+                "kernel": ["rbf", "linear", "poly"],
             },
             "Logistic Regression": {
                 "C": [0.1, 1, 10, 100],  # Regularization strength
-                "solver": [
-                "penalty": [
-                "max_iter": [
+                "solver": ["lbfgs", "liblinear", "saga"],  # Common solvers
+                "penalty": ["l2"],  # L2 penalty is most common
+                "max_iter": [
+                    500,
+                    1000,
+                    2000,
+                ],  # Increased max_iter for better convergence
             },
-            "Lasso":{
+            "Lasso": {
                 "alpha": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
-                "max_iter": [500, 1000, 2000]
+                "max_iter": [500, 1000, 2000],
             },
-            "LassoCV":{
+            "LassoCV": {
                 "alphas": [[0.001, 0.01, 0.1, 1.0, 10.0, 100.0]],
-                "max_iter": [500, 1000, 2000]
+                "max_iter": [500, 1000, 2000],
             },
             "Gradient Boosting": {
                 "n_estimators": [100, 200, 500],
@@ -2242,25 +2410,44 @@ def predict(
                 "subsample": [0.8, 1.0],
                 "colsample_bytree": [0.8, 1.0],
             },
-            "KNN":
+            "KNN": (
+                {
+                    "n_neighbors": [3, 5, 7, 10],
+                    "weights": ["uniform", "distance"],
+                    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
+                    "p": [1, 2],
+                }
+                if purpose == "classification"
+                else {
+                    "n_neighbors": [3, 5, 7, 9, 11],  # Number of neighbors
+                    "weights": [
+                        "uniform",
+                        "distance",
+                    ],  # Weight function used in prediction
+                    "metric": [
+                        "euclidean",
+                        "manhattan",
+                        "minkowski",
+                    ],  # Distance metric
+                    "leaf_size": [
+                        20,
+                        30,
+                        40,
+                        50,
+                    ],  # Leaf size for KDTree or BallTree algorithms
+                    "p": [
+                        1,
+                        2,
+                    ],  # Power parameter for the Minkowski metric (1 = Manhattan, 2 = Euclidean)
+                }
+            ),
             "Naive Bayes": {
                 "var_smoothing": [1e-9, 1e-8, 1e-7],
             },
             "SVR": {
                 "C": [0.1, 1, 10, 100],
-                "gamma": [
-                "kernel": [
+                "gamma": ["scale", "auto"],
+                "kernel": ["rbf", "linear"],
             },
             "Linear Regression": {
                 "fit_intercept": [True, False],
@@ -2286,10 +2473,10 @@ def predict(
                 "learning_rate": [0.01, 0.1],
                 "num_leaves": [31, 50, 100],
                 "max_depth": [-1, 10, 20],
-            },
+                "min_data_in_leaf": [20],  # Minimum samples in each leaf
+                "min_gain_to_split": [0.01],  # Minimum gain to allow a split
+                "scale_pos_weight": [10],  # Address class imbalance
+            },
             "Bagging": {
                 "n_estimators": [10, 50, 100],
                 "max_samples": [0.5, 0.7, 1.0],
@@ -2314,41 +2501,73 @@ def predict(
             "Linear Discriminant Analysis": {
                 "solver": ["svd", "lsqr", "eigen"],
                 "shrinkage": [None, "auto"],
-            },
-            "
+            },
+            "Quadratic Discriminant Analysis": {
+                "reg_param": [0.0, 0.1, 0.5, 1.0],  # Regularization parameter
+                "priors": [None, [0.5, 0.5], [0.3, 0.7]],  # Class priors
+                "tol": [
+                    1e-4,
+                    1e-3,
+                    1e-2,
+                ],  # Tolerance value for the convergence of the algorithm
+            },
+            "Perceptron": {
+                "alpha": [1e-4, 1e-3, 1e-2],  # Regularization parameter
+                "penalty": ["l2", "l1", "elasticnet"],  # Regularization penalty
+                "max_iter": [1000, 2000],  # Maximum number of iterations
+                "eta0": [1.0, 0.1],  # Learning rate for gradient descent
+                "tol": [1e-3, 1e-4, 1e-5],  # Tolerance for stopping criteria
+                "random_state": [random_state],  # Random state for reproducibility
+            },
+            "Bernoulli Naive Bayes": {
+                "alpha": [0.1, 1.0, 10.0],  # Additive (Laplace) smoothing parameter
+                "binarize": [
+                    0.0,
+                    0.5,
+                    1.0,
+                ],  # Threshold for binarizing the input features
+                "fit_prior": [
+                    True,
+                    False,
+                ],  # Whether to learn class prior probabilities
+            },
+            "SGDClassifier": {
+                "eta0": [0.01, 0.1, 1.0],
+                "loss": [
+                    "hinge",
+                    "log",
+                    "modified_huber",
+                    "squared_hinge",
+                    "perceptron",
+                ],  # Loss function
+                "penalty": ["l2", "l1", "elasticnet"],  # Regularization penalty
+                "alpha": [1e-4, 1e-3, 1e-2],  # Regularization strength
+                "l1_ratio": [0.15, 0.5, 0.85],  # L1 ratio for elasticnet penalty
+                "max_iter": [1000, 2000],  # Maximum number of iterations
+                "tol": [1e-3, 1e-4],  # Tolerance for stopping criteria
+                "random_state": [random_state],  # Random state for reproducibility
+                "learning_rate": [
+                    "constant",
+                    "optimal",
+                    "invscaling",
+                    "adaptive",
+                ],  # Learning rate schedule
+            },
+            "Ridge": (
+                {"class_weight": [None, "balanced"]}
+                if purpose == "classification"
+                else {
+                    "alpha": [0.1, 1, 10, 100],
+                    "solver": [
+                        "auto",
+                        "svd",
+                        "cholesky",
+                        "lsqr",
+                    ],  # Solver for optimization
+                }
+            ),
         }
-
+
     results = {}
     # Use StratifiedKFold for classification and KFold for regression
     cv = (
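For context, each entry in the param_grids dictionaries above is consumed by GridSearchCV, which is imported earlier in predict(). The following is a minimal sketch of that pattern with an illustrative grid and synthetic data, not the package's exact loop:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

X, y = make_classification(n_samples=300, n_features=12, random_state=1)
grid = {"n_estimators": [100, 200], "max_depth": [None, 10]}  # illustrative subset of a grid
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
gs = GridSearchCV(RandomForestClassifier(random_state=1), grid, cv=cv, scoring="roc_auc", n_jobs=-1)
gs.fit(X, y)
# gs.best_estimator_ is what the diff stores per model as "best_clf"; gs.best_params_ as "best_params".
print(gs.best_params_, gs.best_score_)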
@@ -2359,11 +2578,11 @@ def predict(
 
     # Train and validate each model
     for name, clf in tqdm(
+        models.items(),
+        desc="models",
+        colour="green",
+        bar_format="{l_bar}{bar} {n_fmt}/{total_fmt}",
+    ):
         if verbose:
             print(f"\nTraining and validating {name}:")
 
@@ -2381,7 +2600,7 @@ def predict(
         gs.fit(x_train, y_train)
         best_clf = gs.best_estimator_
         # make sure x_train and x_test has the same name
-        x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
+        x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
         y_pred = best_clf.predict(x_true)
 
         # y_pred_proba
@@ -2396,18 +2615,23 @@ def predict(
             )
         else:
             y_pred_proba = None  # No probability output for certain models
-
-
+
         validation_scores = {}
         if y_true is not None:
-            validation_scores = cal_metrics(
+            validation_scores = cal_metrics(
+                y_true,
+                y_pred,
+                y_pred_proba=y_pred_proba,
+                purpose=purpose,
+                average="weighted",
+            )
 
             # Calculate ROC curve
             # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
             if y_pred_proba is not None:
                 # fpr, tpr, roc_auc = dict(), dict(), dict()
                 fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
-                lower_ci, upper_ci = cal_auc_ci(y_true, y_pred_proba,verbose=False)
+                lower_ci, upper_ci = cal_auc_ci(y_true, y_pred_proba, verbose=False)
                 roc_auc = auc(fpr, tpr)
                 roc_info = {
                     "fpr": fpr.tolist(),
@@ -2425,11 +2649,14 @@ def predict(
                 }
             else:
                 roc_info, pr_info = None, None
-        if purpose=="classification":
+        if purpose == "classification":
             results[name] = {
-                "best_clf": gs.best_estimator_,
+                "best_clf": gs.best_estimator_,
                 "best_params": gs.best_params_,
-                "auc_indiv":[
+                "auc_indiv": [
+                    gs.cv_results_[f"split{i}_test_score"][gs.best_index_]
+                    for i in range(cv_folds)
+                ],
                 "scores": validation_scores,
                 "roc_curve": roc_info,
                 "pr_curve": pr_info,
@@ -2439,11 +2666,11 @@ def predict(
                     y_pred_proba.tolist() if y_pred_proba is not None else None
                 ),
             }
-        else:
+        else:  # "regression"
             results[name] = {
-                "best_clf": gs.best_estimator_,
+                "best_clf": gs.best_estimator_,
                 "best_params": gs.best_params_,
-                "scores": validation_scores,
+                "scores": validation_scores,  # e.g., neg_MSE, R², etc.
                 "predictions": y_pred.tolist(),
                 "predictions_proba": (
                     y_pred_proba.tolist() if y_pred_proba is not None else None
@@ -2452,9 +2679,9 @@ def predict(
 
         else:
             results[name] = {
-                "best_clf": gs.best_estimator_,
+                "best_clf": gs.best_estimator_,
                 "best_params": gs.best_params_,
-                "scores": validation_scores,
+                "scores": validation_scores,
                 "predictions": y_pred.tolist(),
                 "predictions_proba": (
                     y_pred_proba.tolist() if y_pred_proba is not None else None
@@ -2465,76 +2692,80 @@ def predict(
     df_results = pd.DataFrame.from_dict(results, orient="index")
 
     # sort
-    if y_true is not None and purpose=="classification":
+    if y_true is not None and purpose == "classification":
         df_scores = pd.DataFrame(
-        df_results=df_results.loc[df_scores.index]
+            df_results["scores"].tolist(), index=df_results["scores"].index
+        ).sort_values(by="roc_auc", ascending=False)
+        df_results = df_results.loc[df_scores.index]
 
         if plot_:
             from datetime import datetime
+
             now_ = datetime.now().strftime("%y%m%d_%H%M%S")
-            nexttile=plot.subplot(figsize=[12, 10])
-            plot.heatmap(df_scores, kind="direct",ax=nexttile())
+            nexttile = plot.subplot(figsize=[12, 10])
+            plot.heatmap(df_scores, kind="direct", ax=nexttile())
             plot.figsets(xangle=30)
             if dir_save:
-                ips.figsave(dir_save+f"scores_sorted_heatmap{now_}.pdf")
-            if df_scores.shape[0]>1
-                plot.heatmap(df_scores, kind="direct",cluster=True)
+                ips.figsave(dir_save + f"scores_sorted_heatmap{now_}.pdf")
+            if df_scores.shape[0] > 1:  # draw cluster
+                plot.heatmap(df_scores, kind="direct", cluster=True)
                 plot.figsets(xangle=30)
                 if dir_save:
-                    ips.figsave(dir_save+f"scores_clus{now_}.pdf")
-    if all([plot_, y_true is not None, purpose==
+                    ips.figsave(dir_save + f"scores_clus{now_}.pdf")
+    if all([plot_, y_true is not None, purpose == "classification"]):
         try:
-            if len(models)>3:
+            if len(models) > 3:
                 plot_validate_features(df_results)
             else:
-                plot_validate_features_single(df_results,figsize=(12,4*len(models)))
+                plot_validate_features_single(df_results, figsize=(12, 4 * len(models)))
             if dir_save:
-                ips.figsave(dir_save+f"validate_features{now_}.pdf")
+                ips.figsave(dir_save + f"validate_features{now_}.pdf")
         except Exception as e:
             print(f"Error: 在画图的过程中出现了问题:{e}")
     return df_results
 
 
-def cal_metrics(
+def cal_metrics(
+    y_true, y_pred, y_pred_proba=None, purpose="regression", average="weighted"
+):
     """
     Calculate regression or classification metrics based on the purpose.
-
+
     Parameters:
     - y_true: Array of true values.
     - y_pred: Array of predicted labels for classification or predicted values for regression.
     - y_pred_proba: Array of predicted probabilities for classification (optional).
     - purpose: str, "regression" or "classification".
     - average: str, averaging method for multi-class classification ("binary", "micro", "macro", "weighted", etc.).
-
+
     Returns:
     - validation_scores: dict of computed metrics.
     """
     from sklearn.metrics import (
+        mean_squared_error,
+        mean_absolute_error,
+        mean_absolute_percentage_error,
+        explained_variance_score,
+        r2_score,
+        mean_squared_log_error,
+        accuracy_score,
+        precision_score,
+        recall_score,
+        f1_score,
+        roc_auc_score,
+        matthews_corrcoef,
+        confusion_matrix,
+        balanced_accuracy_score,
+        average_precision_score,
+        precision_recall_curve,
+    )
+
     validation_scores = {}
 
     if purpose == "regression":
         y_true = np.asarray(y_true)
         y_true = y_true.ravel()
-        y_pred = np.asarray(y_pred)
+        y_pred = np.asarray(y_pred)
         y_pred = y_pred.ravel()
         # Regression metrics
         validation_scores = {
@@ -2544,7 +2775,7 @@ def cal_metrics(y_true, y_pred, y_pred_proba=None, purpose="regression", average
             "r2": r2_score(y_true, y_pred),
             "mape": mean_absolute_percentage_error(y_true, y_pred),
             "explained_variance": explained_variance_score(y_true, y_pred),
-            "mbd": np.mean(y_pred - y_true)  # Mean Bias Deviation
+            "mbd": np.mean(y_pred - y_true),  # Mean Bias Deviation
         }
         # Check if MSLE can be calculated
         if np.all(y_true >= 0) and np.all(y_pred >= 0):  # Ensure no negative values
@@ -2560,21 +2791,24 @@ def cal_metrics(y_true, y_pred, y_pred_proba=None, purpose="regression", average
             "recall": recall_score(y_true, y_pred, average=average),
             "f1": f1_score(y_true, y_pred, average=average),
             "mcc": matthews_corrcoef(y_true, y_pred),
-            "specificity": None,
-            "balanced_accuracy": balanced_accuracy_score(y_true, y_pred)
+            "specificity": None,
+            "balanced_accuracy": balanced_accuracy_score(y_true, y_pred),
         }
 
         # Confusion matrix to calculate specificity
         tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
-        validation_scores["specificity"] =
+        validation_scores["specificity"] = (
+            tn / (tn + fp) if (tn + fp) > 0 else 0
+        )  # Specificity calculation
 
-        if y_pred_proba is not None:
+        if y_pred_proba is not None:
             # Calculate ROC-AUC
-            validation_scores["roc_auc"] = roc_auc_score(y_true, y_pred_proba)
+            validation_scores["roc_auc"] = roc_auc_score(y_true, y_pred_proba)
             # PR-AUC (Precision-Recall AUC) calculation
             validation_scores["pr_auc"] = average_precision_score(y_true, y_pred_proba)
     else:
-        raise ValueError(
+        raise ValueError(
+            "Invalid purpose specified. Choose 'regression' or 'classification'."
+        )
 
     return validation_scores
-