py2ls 0.2.4.24__py3-none-any.whl → 0.2.4.26__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- py2ls/.DS_Store +0 -0
- py2ls/.git/index +0 -0
- py2ls/corr.py +475 -0
- py2ls/data/.DS_Store +0 -0
- py2ls/data/hyper_param_autogluon_zeroshot2024.json +2383 -0
- py2ls/data/styles/.DS_Store +0 -0
- py2ls/data/styles/example/.DS_Store +0 -0
- py2ls/data/usages_sns.json +6 -1
- py2ls/ec2ls.py +61 -0
- py2ls/ips.py +496 -138
- py2ls/ml2ls.py +994 -288
- py2ls/netfinder.py +16 -20
- py2ls/nl2ls.py +283 -0
- py2ls/plot.py +1244 -158
- {py2ls-0.2.4.24.dist-info → py2ls-0.2.4.26.dist-info}/METADATA +5 -1
- {py2ls-0.2.4.24.dist-info → py2ls-0.2.4.26.dist-info}/RECORD +17 -14
- py2ls/data/usages_pd copy.json +0 -1105
- py2ls/ml2ls copy.py +0 -2906
- {py2ls-0.2.4.24.dist-info → py2ls-0.2.4.26.dist-info}/WHEEL +0 -0
py2ls/ml2ls.py
CHANGED
@@ -31,6 +31,7 @@ from sklearn.metrics import (
     average_precision_score,
 )
 from typing import Dict, Any, Optional, List, Union
+import os, json
 import numpy as np
 import pandas as pd
 from . import ips
@@ -49,7 +50,13 @@ logger = logging.getLogger()
 warnings.filterwarnings("ignore", category=UserWarning)
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.neighbors import KNeighborsClassifier
-
+#* set random_state global
+import torch
+import random
+random_state=1
+random.seed(random_state)
+np.random.seed(random_state)
+torch.manual_seed(random_state)
 
 def features_knn(
     x_train: pd.DataFrame, y_train: pd.Series, knn_params: dict
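The hunk above pins `random`, NumPy, and torch to one module-level seed at import time. A minimal sketch of the same idea, assuming torch is installed; the `set_seed` helper name is illustrative and not part of py2ls:

import random

import numpy as np
import torch


def set_seed(seed: int = 1) -> None:
    # Seed all three RNG sources the module draws from, so feature
    # selection and model fitting are repeatable across runs.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)


set_seed(1)
print(np.random.rand(3))  # identical triple on every fresh interpreter run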
@@ -594,7 +601,7 @@ def get_features(
     """
     from sklearn.compose import ColumnTransformer
     from sklearn.preprocessing import StandardScaler, OneHotEncoder
-
+    from sklearn.model_selection import train_test_split
     # Ensure X and y are DataFrames/Series for consistency
     if isinstance(X, np.ndarray):
         X = pd.DataFrame(X)
@@ -922,10 +929,26 @@ def get_features(
         "feature_importances": feature_importances,
     }
     if all([plot_, dir_save]):
+
         from datetime import datetime
-
         now_ = datetime.now().strftime("%y%m%d_%H%M%S")
         ips.figsave(dir_save + f"features{now_}.pdf")
+
+        lists = []
+        for tp in ips.flatten(features_df["type"]):
+            lists.append(
+                features_df
+                .loc[features_df["type"] == tp, "feature"]
+                .tolist()
+            )
+        labels = ips.flatten(features_df["type"])
+        # current_fig = plt.gcf()
+        # # ax = current_fig.add_subplot(3, 2, 6)
+        # gs = current_fig.add_gridspec(3, 2)
+        # ax = current_fig.add_subplot(gs[:, :])
+        plt.figure(figsize=[6,6])
+        plot.venn(lists, labels, cmap="coolwarm")
+        ips.figsave(dir_save + f"features{now_}shared_features.pdf")
    else:
         results = {
             "selected_features": pd.DataFrame(),
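The new block groups the selected features by selector type and saves a Venn diagram of their overlap through py2ls's own `plot.venn`. A library-agnostic sketch of the underlying set logic; the `features_df` here is a stand-in with the same two columns the function builds:

import pandas as pd

# Stand-in for the features_df built by get_features: one row per
# (selector type, feature) pair.
features_df = pd.DataFrame({
    "type": ["knn", "knn", "lasso", "lasso", "tree"],
    "feature": ["age", "bmi", "bmi", "glucose", "bmi"],
})

# One feature set per selector type, mirroring the lists/labels that
# the diff feeds into plot.venn.
sets = {tp: set(g["feature"]) for tp, g in features_df.groupby("type")}
shared = set.intersection(*sets.values())
print(sets)    # {'knn': {'age', 'bmi'}, 'lasso': {'bmi', 'glucose'}, 'tree': {'bmi'}}
print(shared)  # {'bmi'} — the features every selector agrees on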
@@ -1247,22 +1270,25 @@ def plot_validate_features(res_val, is_binary=True, figsize=None):
     nexttile = plot.subplot(figsize=figsize)
     ax = nexttile(subplot_layout[0], subplot_layout[1])
     for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-        fpr = res_val["roc_curve"][model_name]["fpr"]
-        tpr = res_val["roc_curve"][model_name]["tpr"]
-        (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
-        mean_auc = res_val["roc_curve"][model_name]["auc"]
-        plot_roc_curve(
-            fpr,
-            tpr,
-            mean_auc,
-            lower_ci,
-            upper_ci,
-            model_name=model_name,
-            lw=1.5,
-            color=colors[i],
-            alpha=alpha,
-            ax=ax,
-        )
+        try:
+            fpr = res_val["roc_curve"][model_name]["fpr"]
+            tpr = res_val["roc_curve"][model_name]["tpr"]
+            (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
+            mean_auc = res_val["roc_curve"][model_name]["auc"]
+            plot_roc_curve(
+                fpr,
+                tpr,
+                mean_auc,
+                lower_ci,
+                upper_ci,
+                model_name=model_name,
+                lw=1.5,
+                color=colors[i],
+                alpha=alpha,
+                ax=ax,
+            )
+        except Exception as e:
+            print(e)
     plot.figsets(
         sp=2,
         legend=dict(
@@ -1277,16 +1303,19 @@ def plot_validate_features(res_val, is_binary=True, figsize=None):
 
     ax = nexttile(subplot_layout[0], subplot_layout[1])
     for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-        plot_pr_curve(
-            recall=res_val["pr_curve"][model_name]["recall"],
-            precision=res_val["pr_curve"][model_name]["precision"],
-            avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
-            model_name=model_name,
-            color=colors[i],
-            lw=1.5,
-            alpha=alpha,
-            ax=ax,
-        )
+        try:
+            plot_pr_curve(
+                recall=res_val["pr_curve"][model_name]["recall"],
+                precision=res_val["pr_curve"][model_name]["precision"],
+                avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
+                model_name=model_name,
+                color=colors[i],
+                lw=1.5,
+                alpha=alpha,
+                ax=ax,
+            )
+        except Exception as e:
+            print(e)
     plot.figsets(
         sp=2,
         legend=dict(
@@ -1314,22 +1343,25 @@ def plot_validate_features(res_val, is_binary=True, figsize=None):
     for iclass, class_ in enumerate(classes):
         ax = nexttile(subplot_layout[0], subplot_layout[1])
         for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-            fpr = res_val["roc_curve"][model_name]["fpr"][class_]
-            tpr = res_val["roc_curve"][model_name]["tpr"][class_]
-            (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"][iclass]
-            mean_auc = res_val["roc_curve"][model_name]["auc"][class_]
-            plot_roc_curve(
-                fpr,
-                tpr,
-                mean_auc,
-                lower_ci,
-                upper_ci,
-                model_name=model_name,
-                lw=1.5,
-                color=colors[i],
-                alpha=alpha,
-                ax=ax,
-            )
+            try:
+                fpr = res_val["roc_curve"][model_name]["fpr"][class_]
+                tpr = res_val["roc_curve"][model_name]["tpr"][class_]
+                (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"][iclass]
+                mean_auc = res_val["roc_curve"][model_name]["auc"][class_]
+                plot_roc_curve(
+                    fpr,
+                    tpr,
+                    mean_auc,
+                    lower_ci,
+                    upper_ci,
+                    model_name=model_name,
+                    lw=1.5,
+                    color=colors[i],
+                    alpha=alpha,
+                    ax=ax,
+                )
+            except Exception as e:
+                print(e)
     plot.figsets(
         sp=2,
         title=class_,
@@ -1345,18 +1377,21 @@ def plot_validate_features(res_val, is_binary=True, figsize=None):
 
         ax = nexttile(subplot_layout[0], subplot_layout[1])
         for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-            plot_pr_curve(
-                recall=res_val["pr_curve"][model_name]["recall"][iclass],
-                precision=res_val["pr_curve"][model_name]["precision"][iclass],
-                avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
-                    iclass
-                ],
-                model_name=model_name,
-                color=colors[i],
-                lw=1.5,
-                alpha=alpha,
-                ax=ax,
-            )
+            try:
+                plot_pr_curve(
+                    recall=res_val["pr_curve"][model_name]["recall"][iclass],
+                    precision=res_val["pr_curve"][model_name]["precision"][iclass],
+                    avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
+                        iclass
+                    ],
+                    model_name=model_name,
+                    color=colors[i],
+                    lw=1.5,
+                    alpha=alpha,
+                    ax=ax,
+                )
+            except Exception as e:
+                print(e)
     plot.figsets(
         sp=2,
         title=class_,
@@ -1379,37 +1414,41 @@ def plot_validate_features_single(res_val, figsize=None, is_binary=True):
         len(ips.flatten(res_val["pr_curve"].index)), 3, figsize=figsize
     )
     for model_name in ips.flatten(res_val["pr_curve"].index):
-        fpr = res_val["roc_curve"][model_name]["fpr"]
-        tpr = res_val["roc_curve"][model_name]["tpr"]
-        (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
-        mean_auc = res_val["roc_curve"][model_name]["auc"]
-
-        # Plotting
-        plot_roc_curve(
-            fpr,
-            tpr,
-            mean_auc,
-            lower_ci,
-            upper_ci,
-            model_name=model_name,
-            ax=nexttile(),
-        )
-        plot.figsets(title=model_name, sp=2)
+        try:
+            fpr = res_val["roc_curve"][model_name]["fpr"]
+            tpr = res_val["roc_curve"][model_name]["tpr"]
+            (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
+            mean_auc = res_val["roc_curve"][model_name]["auc"]
 
-        plot_pr_binary(
-            recall=res_val["pr_curve"][model_name]["recall"],
-            precision=res_val["pr_curve"][model_name]["precision"],
-            avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
-            model_name=model_name,
-            ax=nexttile(),
-        )
-        plot.figsets(title=model_name, sp=2)
+            # Plotting
+            plot_roc_curve(
+                fpr,
+                tpr,
+                mean_auc,
+                lower_ci,
+                upper_ci,
+                model_name=model_name,
+                ax=nexttile(),
+            )
+            plot.figsets(title=model_name, sp=2)
 
-        # plot cm
-        plot_cm(
-            res_val["confusion_matrix"][model_name], ax=nexttile(), normalize=False
-        )
-        plot.figsets(title=model_name, sp=2)
+            plot_pr_binary(
+                recall=res_val["pr_curve"][model_name]["recall"],
+                precision=res_val["pr_curve"][model_name]["precision"],
+                avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
+                model_name=model_name,
+                ax=nexttile(),
+            )
+            plot.figsets(title=model_name, sp=2)
+
+            # plot cm
+            plot_cm(
+                res_val["confusion_matrix"][model_name], ax=nexttile(), normalize=False
+            )
+            plot.figsets(title=model_name, sp=2)
+
+        except Exception as e:
+            print(e)
     else:
 
         modname_tmp = ips.flatten(res_val["roc_curve"].index)[0]
@@ -1424,22 +1463,25 @@ def plot_validate_features_single(res_val, figsize=None, is_binary=True):
     for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
         ax = nexttile()
         for iclass, class_ in enumerate(classes):
-            fpr = res_val["roc_curve"][model_name]["fpr"][class_]
-            tpr = res_val["roc_curve"][model_name]["tpr"][class_]
-            (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"][iclass]
-            mean_auc = res_val["roc_curve"][model_name]["auc"][class_]
-            plot_roc_curve(
-                fpr,
-                tpr,
-                mean_auc,
-                lower_ci,
-                upper_ci,
-                model_name=class_,
-                lw=1.5,
-                color=colors[iclass],
-                alpha=0.03,
-                ax=ax,
-            )
+            try:
+                fpr = res_val["roc_curve"][model_name]["fpr"][class_]
+                tpr = res_val["roc_curve"][model_name]["tpr"][class_]
+                (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"][iclass]
+                mean_auc = res_val["roc_curve"][model_name]["auc"][class_]
+                plot_roc_curve(
+                    fpr,
+                    tpr,
+                    mean_auc,
+                    lower_ci,
+                    upper_ci,
+                    model_name=class_,
+                    lw=1.5,
+                    color=colors[iclass],
+                    alpha=0.03,
+                    ax=ax,
+                )
+            except Exception as e:
+                print(e)
         plot.figsets(
             sp=2,
             title=model_name,
@@ -1451,18 +1493,21 @@ def plot_validate_features_single(res_val, figsize=None, is_binary=True):
 
         ax = nexttile()
         for iclass, class_ in enumerate(classes):
-            plot_pr_curve(
-                recall=res_val["pr_curve"][model_name]["recall"][iclass],
-                precision=res_val["pr_curve"][model_name]["precision"][iclass],
-                avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
-                    iclass
-                ],
-                model_name=class_,
-                color=colors[iclass],
-                lw=1.5,
-                alpha=0.03,
-                ax=ax,
-            )
+            try:
+                plot_pr_curve(
+                    recall=res_val["pr_curve"][model_name]["recall"][iclass],
+                    precision=res_val["pr_curve"][model_name]["precision"][iclass],
+                    avg_precision=res_val["pr_curve"][model_name]["avg_precision"][
+                        iclass
+                    ],
+                    model_name=class_,
+                    color=colors[iclass],
+                    lw=1.5,
+                    alpha=0.03,
+                    ax=ax,
+                )
+            except Exception as e:
+                print(e)
         plot.figsets(
             sp=2,
             title=class_,
@@ -1543,15 +1588,12 @@ def cal_auc_ci(
         # print("Bootstrap #{} ROC area: {:0.3f}".format(i + 1, score))
     sorted_scores = np.array(bootstrapped_scores)
     sorted_scores.sort()
-
-    # Computing the lower and upper bound of the 90% confidence interval
-    # You can change the bounds percentiles to 0.025 and 0.975 to get
-    # a 95% confidence interval instead.
+
     confidence_lower = sorted_scores[int((1 - ci) * len(sorted_scores))]
     confidence_upper = sorted_scores[int(ci * len(sorted_scores))]
     if verbose:
         print(
-            "Confidence interval for the score: [{:0.
+            "Confidence interval for the score: [{:0.3f} - {:0.3f}]".format(
                 confidence_lower, confidence_upper
             )
         )
@@ -1568,11 +1610,8 @@ def cal_auc_ci(
             y_true, classes=np.unique(y_true)
         )  # One-vs-Rest transformation
         n_classes = y_true_bin.shape[1]  # Number of classes
-
-        bootstrapped_scores = np.
-            (n_classes, n_bootstraps)
-        )  # Store scores for each class
-
+
+        bootstrapped_scores = np.full((n_classes, n_bootstraps), np.nan)
         if verbose:
             print("AUROC scores for each class:")
         for i in range(n_classes):
@@ -1592,15 +1631,24 @@ def cal_auc_ci(
         # Calculating the confidence intervals for each class
         confidence_intervals = []
         for class_idx in range(n_classes):
-            sorted_scores = np.sort(bootstrapped_scores[class_idx])
-            confidence_lower = sorted_scores[int((1 - ci) * len(sorted_scores))]
-            confidence_upper = sorted_scores[int(ci * len(sorted_scores))]
-            confidence_intervals.append((confidence_lower, confidence_upper))
-            if verbose:
-                print(
-                    f"Class {class_idx} - Confidence interval: "
-                    f"[{confidence_lower:.3f} - {confidence_upper:.3f}]"
-                )
+            # rm nan
+            valid_scores = bootstrapped_scores[class_idx][
+                ~np.isnan(bootstrapped_scores[class_idx])
+            ]
+            if len(valid_scores) > 0:
+                sorted_scores = np.sort(valid_scores)
+                confidence_lower = sorted_scores[int((1 - ci) * len(sorted_scores))]
+                confidence_upper = sorted_scores[int(ci * len(sorted_scores))]
+                confidence_intervals[class_idx] = (confidence_lower, confidence_upper)
+
+                if verbose:
+                    print(
+                        f"Class {class_idx} - Confidence interval: [{confidence_lower:.3f} - {confidence_upper:.3f}]"
+                    )
+            else:
+                confidence_intervals[class_idx] = (np.nan, np.nan)
+                if verbose:
+                    print(f"Class {class_idx} - Confidence interval: [NaN - NaN]")
 
         return confidence_intervals
 
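The `cal_auc_ci` changes pre-allocate the per-class score matrix with `np.full(..., np.nan)` and drop NaNs before taking the percentile bounds, so bootstrap iterations that fail (for example, a resample containing only one class) no longer drag the interval toward zero. A self-contained sketch of the same bootstrap-percentile idea for binary AUC, using scikit-learn and NumPy directly rather than py2ls's exact function:

import numpy as np
from sklearn.metrics import roc_auc_score

rng = np.random.RandomState(1)
y_true = rng.randint(0, 2, 200)
y_score = y_true * 0.6 + rng.rand(200) * 0.7  # informative but noisy scores

n_bootstraps, ci = 1000, 0.95
scores = np.full(n_bootstraps, np.nan)  # NaN marks skipped iterations
for i in range(n_bootstraps):
    idx = rng.randint(0, len(y_true), len(y_true))
    if len(np.unique(y_true[idx])) < 2:
        continue  # AUC is undefined on a single-class resample; leave NaN
    scores[i] = roc_auc_score(y_true[idx], y_score[idx])

valid = np.sort(scores[~np.isnan(scores)])
lower = valid[int((1 - ci) * len(valid))]
upper = valid[int(ci * len(valid))]
print(f"AUC 95% bootstrap CI: [{lower:.3f} - {upper:.3f}]")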
@@ -2057,20 +2105,20 @@ def rank_models(
 
     def generate_bar_plot(ax, cv_test_scores):
         ax = plot.plotxy(
-            y="Classifier", x="combined_score", data=cv_test_scores,
+            y="Classifier", x="combined_score", data=cv_test_scores, kind_="bar"
         )
         plt.title("Classifier Performance")
         plt.tight_layout()
         return plt
 
-    nexttile = plot.subplot(2, 2, figsize=[10,
+    nexttile = plot.subplot(2, 2, figsize=[10, 10])
     generate_bar_plot(nexttile(), top_models.dropna())
     plot.radar(
         ax=nexttile(projection="polar"),
         data=cv_test_scores.set_index("Classifier"),
-        ylim=[0
-        color=plot.get_color(
-        alpha=0.
+        ylim=[0, 1],
+        color=plot.get_color(cv_test_scores.set_index("Classifier").shape[1]),
+        alpha=0.02,
         circular=1,
     )
     return cv_test_scores
@@ -2206,6 +2254,8 @@ def predict(
     y_train: pd.Series,
     x_true: pd.DataFrame = None,
     y_true: Optional[pd.Series] = None,
+    fill_missing:bool = True,
+    scaler:str='standard',# ["standard", "minmax", "robust","maxabs"]
     backward: bool = False,  # backward_regression
     backward_thr:float = 0.05,# pval thr,only works when backward is True
     common_features: set = None,
@@ -2214,7 +2264,7 @@ def predict(
     metrics: Optional[List[str]] = None,
     stack:bool=True,# run stacking
     stacking_cv:bool=False,# stacking cross_validate, default(False),keep it simple
-    vote:bool=
+    vote:bool=False,# run voting
     voting:str="hard", # only for classification purporse of voting
     n_top_models:int=5, #for stacking models
     n_models_per_category:int=1, #for stacking models,可以允许同一个类别2种模型
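`vote` now defaults to False, and `voting="hard"` only applies to classification. A minimal sketch of what hard voting does, using scikit-learn's VotingClassifier as an illustration rather than the predict() internals:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

X, y = make_classification(n_samples=300, n_features=10, random_state=1)

# Hard voting: each fitted model casts one vote per sample and the
# majority label wins; voting="soft" would average predicted probabilities.
clf = VotingClassifier(
    estimators=[
        ("lr", LogisticRegression(max_iter=1000)),
        ("rf", RandomForestClassifier(random_state=1)),
        ("nb", GaussianNB()),
    ],
    voting="hard",
)
clf.fit(X, y)
print(clf.predict(X[:5]))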
@@ -2227,7 +2277,12 @@ def predict(
     cv_level: str = "l",  # "s":'low',"m":'medium',"l":"high"
     class_weight: str = "balanced",
     random_state: int = 1,
+    presets = "best_quality",# specific for autogluon
+    time_limit=600, # specific for autogluon
+    num_bag_folds=5, # specific for autogluon
+    num_stack_levels=2, # specific for autogluon
     verbose: bool = False,
+    **kwargs
 ) -> pd.DataFrame:
     """
     第一种情况是内部拆分,第二种是直接预测,第三种是外部验证。
@@ -2278,28 +2333,20 @@ def predict(
         RandomForestRegressor,
         ExtraTreesClassifier,
         ExtraTreesRegressor,
+        HistGradientBoostingRegressor,
         BaggingClassifier,
         BaggingRegressor,
         AdaBoostClassifier,
         AdaBoostRegressor,
     )
-    from sklearn.svm import SVC, SVR
-    from sklearn.tree import DecisionTreeRegressor
+    from sklearn.svm import SVC, SVR, LinearSVR, NuSVR
+    from sklearn.tree import DecisionTreeRegressor,ExtraTreeRegressor
     from sklearn.linear_model import (
-        LogisticRegression,
-        ElasticNet,
-        ElasticNetCV,
-        LinearRegression,
-        Lasso,
-        RidgeClassifierCV,
-        Perceptron,
-        SGDClassifier,
-        RidgeCV,
-        Ridge,
-        TheilSenRegressor,
-        HuberRegressor,
-        PoissonRegressor,
-
+        LogisticRegression,ElasticNet,ElasticNetCV,
+        LinearRegression,Lasso,RidgeClassifierCV,Perceptron,SGDClassifier,
+        RidgeCV,Ridge,TheilSenRegressor,HuberRegressor,PoissonRegressor,Lars, LassoLars, BayesianRidge,
+        GammaRegressor, TweedieRegressor, LassoCV, LassoLarsCV, LarsCV,
+        OrthogonalMatchingPursuit, OrthogonalMatchingPursuitCV, PassiveAggressiveRegressor
     )
     from sklearn.compose import TransformedTargetRegressor
     from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
@@ -2316,15 +2363,21 @@ def predict(
     )
     from sklearn.preprocessing import PolynomialFeatures
     from sklearn.model_selection import train_test_split
-
+
+    from sklearn.gaussian_process import GaussianProcessRegressor
+    from sklearn.kernel_ridge import KernelRidge
+    from sklearn.dummy import DummyRegressor
+    from autogluon.tabular import TabularPredictor
     # 拼写检查
     purpose = ips.strcmp(purpose, ["classification", "regression"])[0]
     print(f"{purpose} processing...")
+
+
     # Default models or regressors if not provided
     if purpose == "classification":
         model_ = {
             "Random Forest": RandomForestClassifier(
-                random_state=random_state, class_weight=class_weight
+                random_state=random_state, class_weight=class_weight,n_jobs=n_jobs
             ),
             # SVC (Support Vector Classification)
             "SVM": SVC(
@@ -2335,7 +2388,7 @@ def predict(
             ),
             # fit the best model without enforcing sparsity, which means it does not directly perform feature selection.
             "Logistic Regression": LogisticRegression(
-                class_weight=class_weight, random_state=random_state
+                class_weight=class_weight, random_state=random_state,n_jobs=n_jobs
             ),
             # Logistic Regression with L1 Regularization (Lasso)
             "Lasso Logistic Regression": LogisticRegression(
@@ -2346,51 +2399,70 @@ def predict(
                 eval_metric="logloss",
                 random_state=random_state,
             ),
-            "KNN": KNeighborsClassifier(n_neighbors=5),
+            "KNN": KNeighborsClassifier(n_neighbors=5,n_jobs=n_jobs),
             "Naive Bayes": GaussianNB(),
             "Linear Discriminant Analysis": LinearDiscriminantAnalysis(),
             "AdaBoost": AdaBoostClassifier(
                 algorithm="SAMME", random_state=random_state
             ),
-            "LightGBM": lgb.LGBMClassifier(random_state=random_state, class_weight=class_weight),
+            "LightGBM": lgb.LGBMClassifier(random_state=random_state, class_weight=class_weight,n_jobs=n_jobs),
             "CatBoost": cb.CatBoostClassifier(verbose=0, random_state=random_state),
             "Extra Trees": ExtraTreesClassifier(
-                random_state=random_state, class_weight=class_weight
+                random_state=random_state, class_weight=class_weight,n_jobs=n_jobs
             ),
-            "Bagging": BaggingClassifier(random_state=random_state),
+            "Bagging": BaggingClassifier(random_state=random_state,n_jobs=n_jobs),
             "Neural Network": MLPClassifier(max_iter=500, random_state=random_state),
             "DecisionTree": DecisionTreeClassifier(),
             "Quadratic Discriminant Analysis": QuadraticDiscriminantAnalysis(),
             "Ridge": RidgeClassifierCV(
                 class_weight=class_weight, store_cv_results=True
             ),
-            "Perceptron": Perceptron(random_state=random_state),
+            "Perceptron": Perceptron(random_state=random_state,n_jobs=n_jobs),
             "Bernoulli Naive Bayes": BernoulliNB(),
-            "SGDClassifier": SGDClassifier(random_state=random_state),
+            "SGDClassifier": SGDClassifier(random_state=random_state,n_jobs=n_jobs),
         }
     elif purpose == "regression":
         model_ = {
-            "Random Forest": RandomForestRegressor(random_state=random_state),
+            "Random Forest": RandomForestRegressor(random_state=random_state,n_jobs=n_jobs),
             "SVM": SVR(),  # SVR (Support Vector Regression)
-
-            "LassoCV": LassoCV(
-                cv=cv_folds, random_state=random_state
-            ),  # LassoCV自动找出最适alpha,优于Lasso
+            "LassoCV": LassoCV(cv=cv_folds, random_state=random_state,n_jobs=n_jobs),  # LassoCV自动找出最适alpha,优于Lasso
             "Gradient Boosting": GradientBoostingRegressor(random_state=random_state),
-            "XGBoost": xgb.XGBRegressor(eval_metric="rmse", random_state=random_state),
-            "Linear Regression": LinearRegression(),
+            "XGBoost": xgb.XGBRegressor(eval_metric="rmse", random_state=random_state,n_jobs=n_jobs),
+            "Linear Regression": LinearRegression(n_jobs=n_jobs),
             "AdaBoost": AdaBoostRegressor(random_state=random_state),
-            "LightGBM": lgb.LGBMRegressor(random_state=random_state),
+            "LightGBM": lgb.LGBMRegressor(random_state=random_state,n_jobs=n_jobs,force_row_wise=True),  # Or use force_col_wise=True if memory is a concern
             "CatBoost": cb.CatBoostRegressor(verbose=0, random_state=random_state),
-            "Extra Trees": ExtraTreesRegressor(random_state=random_state),
-            "Bagging": BaggingRegressor(random_state=random_state),
+            "Extra Trees": ExtraTreesRegressor(random_state=random_state,n_jobs=n_jobs),
+            "Bagging": BaggingRegressor(random_state=random_state,n_jobs=n_jobs),
             "Neural Network": MLPRegressor(max_iter=500, random_state=random_state),
             "ElasticNet": ElasticNet(random_state=random_state),
-            "Ridge": Ridge(),
-            "KNN": KNeighborsRegressor(),
-            "TheilSen":TheilSenRegressor(),
+            "Ridge": Ridge(random_state=random_state),
+            "KNN": KNeighborsRegressor(n_jobs=n_jobs),
+            "TheilSen":TheilSenRegressor(n_jobs=n_jobs),
             "Huber":HuberRegressor(),
-            "Poisson":PoissonRegressor()
+            "Poisson":PoissonRegressor(),"LinearRegression": LinearRegression(),
+            "Lasso": Lasso(random_state=random_state),
+            "Lars": Lars(),
+            "LassoLars": LassoLars(),
+            "BayesianRidge": BayesianRidge(),
+            "GammaRegressor": GammaRegressor(),
+            "TweedieRegressor": TweedieRegressor(),
+            "LassoCV": LassoCV(random_state=random_state, n_jobs=n_jobs),
+            "ElasticNetCV": ElasticNetCV(random_state=random_state, n_jobs=n_jobs),
+            "LassoLarsCV": LassoLarsCV(n_jobs=n_jobs),
+            "LarsCV": LarsCV(),
+            "OrthogonalMatchingPursuit": OrthogonalMatchingPursuit(),
+            "OrthogonalMatchingPursuitCV": OrthogonalMatchingPursuitCV(n_jobs=n_jobs),
+            "PassiveAggressiveRegressor": PassiveAggressiveRegressor(random_state=random_state),
+            "LinearSVR": LinearSVR(random_state=random_state),
+            "NuSVR": NuSVR(),
+            "DecisionTreeRegressor": DecisionTreeRegressor(random_state=random_state),
+            "ExtraTreeRegressor": ExtraTreeRegressor(random_state=random_state),
+            "HistGradientBoostingRegressor": HistGradientBoostingRegressor(random_state=random_state),
+            "GaussianProcessRegressor": GaussianProcessRegressor(),
+            "KernelRidge": KernelRidge(),
+            "DummyRegressor": DummyRegressor(),
+            "TransformedTargetRegressor": TransformedTargetRegressor(regressor=LinearRegression())
         }
     if cls is None:
         models = model_
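The regressor dictionary now threads `n_jobs` through every estimator that supports it and adds a long tail of linear models. A small sketch of the dict-of-estimators pattern the function uses, with a few scikit-learn regressors only:

from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV, LinearRegression

X, y = make_regression(n_samples=200, n_features=8, noise=10, random_state=1)

# n_jobs=-1 lets estimators that parallelize (forests, *CV linear models)
# use all cores; estimators without the argument simply omit it.
models = {
    "Random Forest": RandomForestRegressor(random_state=1, n_jobs=-1),
    "LassoCV": LassoCV(cv=5, random_state=1, n_jobs=-1),
    "Linear Regression": LinearRegression(n_jobs=-1),
}
for name, est in models.items():
    print(name, round(est.fit(X, y).score(X, y), 3))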
@@ -2407,10 +2479,17 @@ def predict(
         ips.df_special_characters_cleaner(x_true) if x_true is not None else None
     )
 
+    # only keep "autogluon_tab" in models
+    cls = [cls] if isinstance(cls, str) else cls
+
+    if cls is not None:
+        models={"autogluon_tab":None} if "auto" in cls else models
+
     # indicate cls:
     if ips.run_once_within(30):  # 10 min
         print(f"processing: {list(models.keys())}")
-
+    y_train_col_name=None
+    # print(isinstance(y_train, str) and y_train in x_train.columns)
     if isinstance(y_train, str) and y_train in x_train.columns:
         y_train_col_name = y_train
         y_train = x_train[y_train]
|
|
2418
2497
|
x_train = x_train.drop(y_train_col_name, axis=1)
|
2419
2498
|
# else:
|
2420
2499
|
# y_train = ips.df_encoder(pd.DataFrame(y_train), method="dummy").values.ravel()
|
2500
|
+
|
2421
2501
|
y_train = pd.DataFrame(y_train)
|
2422
2502
|
if y_train.select_dtypes(include=np.number).empty:
|
2423
2503
|
y_train_ = ips.df_encoder(y_train, method="dummy", drop=None)
|
@@ -2430,6 +2510,9 @@ def predict(
|
|
2430
2510
|
y_train = ips.df_encoder(pd.DataFrame(y_train), method="label")
|
2431
2511
|
print("is_binary:", is_binary)
|
2432
2512
|
|
2513
|
+
if fill_missing:
|
2514
|
+
ips.df_fillna(data=x_train, method="knn", inplace=True, axis=0)
|
2515
|
+
ips.df_fillna(data=y_train, method="knn", inplace=True, axis=0)
|
2433
2516
|
# Perform backward feature selection
|
2434
2517
|
if backward:
|
2435
2518
|
selected_features = backward_regression(x_train, y_train, thr=backward_thr)
|
@@ -2458,6 +2541,8 @@ def predict(
|
|
2458
2541
|
pd.DataFrame(y_train), method="label"
|
2459
2542
|
).values.ravel()
|
2460
2543
|
|
2544
|
+
if fill_missing:
|
2545
|
+
ips.df_fillna(data=x_true, method="knn", inplace=True, axis=0)
|
2461
2546
|
if y_true is not None:
|
2462
2547
|
if isinstance(y_true, str) and y_true in x_true.columns:
|
2463
2548
|
y_true_col_name = y_true
|
@@ -2490,11 +2575,16 @@ def predict(
|
|
2490
2575
|
# Ensure common features are selected
|
2491
2576
|
if common_features is not None:
|
2492
2577
|
x_train, x_true = x_train[common_features], x_true[common_features]
|
2578
|
+
share_col_names=common_features
|
2493
2579
|
else:
|
2494
2580
|
share_col_names = ips.shared(x_train.columns, x_true.columns, verbose=verbose)
|
2495
2581
|
x_train, x_true = x_train[share_col_names], x_true[share_col_names]
|
2496
2582
|
|
2497
|
-
|
2583
|
+
#! scaler
|
2584
|
+
# scaler and fit x_train and export scaler to fit the x_true
|
2585
|
+
x_train,scaler_=ips.df_scaler(x_train,method=scaler,return_scaler=True)
|
2586
|
+
#
|
2587
|
+
x_true=ips.df_scaler(x_true,scaler=scaler_)# make sure 用于同一个scaler
|
2498
2588
|
x_train, x_true = ips.df_encoder(x_train, method="dummy"), ips.df_encoder(
|
2499
2589
|
x_true, method="dummy"
|
2500
2590
|
)
|
@@ -2516,18 +2606,261 @@ def predict(
|
|
2516
2606
|
if isinstance(y_train, np.ndarray):
|
2517
2607
|
y_true = ips.df_encoder(data=pd.DataFrame(y_true), method="label")
|
2518
2608
|
y_true = np.asarray(y_true)
|
2519
|
-
|
2520
|
-
|
2521
|
-
|
2522
|
-
|
2523
|
-
|
2524
|
-
|
2525
|
-
|
2526
|
-
|
2527
|
-
|
2528
|
-
|
2529
|
-
|
2609
|
+
#! so far, got the: x_train,x_true,y_train,y_true
|
2610
|
+
# Grid search with KFold or StratifiedKFold
|
2611
|
+
if "autogluon_tab" in models:
|
2612
|
+
# load hypoer_param
|
2613
|
+
f_param = os.path.dirname(os.path.abspath(__file__))
|
2614
|
+
f_param = f_param + "/data/hyper_param_autogluon_zeroshot2024.json"
|
2615
|
+
with open(f_param, "r") as file:
|
2616
|
+
hyper_param_autogluon = json.load(file)
|
2617
|
+
# Train the model with AutoGluon
|
2618
|
+
features=x_train.columns.tolist()
|
2619
|
+
label= y_train_col_name if y_train_col_name is not None else 'target'
|
2620
|
+
df_autogluon = x_train.copy()
|
2621
|
+
df_autogluon[label]=y_train
|
2622
|
+
autogluon_presets=["best_quality","good_quality","fast_train"]
|
2623
|
+
best_clf = TabularPredictor(label=label, path=os.path.join(dir_save,"model_autogluon")).fit(
|
2624
|
+
train_data=df_autogluon,
|
2625
|
+
presets=ips.strcmp(presets, autogluon_presets)[0], # 'best_quality' or 'good_quality' or 'fast_train'
|
2626
|
+
time_limit=time_limit,#3600, # in sec: Limit training time,
|
2627
|
+
num_bag_folds=num_bag_folds,
|
2628
|
+
num_stack_levels=num_stack_levels,
|
2629
|
+
hyperparameters=hyper_param_autogluon,
|
2630
|
+
verbosity=1 if verbose else 0,
|
2631
|
+
**kwargs
|
2632
|
+
)
|
2633
|
+
#! Get the leaderboard
|
2634
|
+
gs={}
|
2635
|
+
# Display the leaderboard for reference
|
2636
|
+
leaderboard = best_clf.leaderboard()
|
2637
|
+
gs['info']=best_clf.info()
|
2638
|
+
# gs["res"]=best_clf
|
2639
|
+
gs["features"]=features
|
2640
|
+
gs["leaderboard"] = leaderboard
|
2641
|
+
best_model_name = leaderboard.iloc[0, 0] # First row, first column contains the model name
|
2642
|
+
# Store the best model and its details in the gs dictionary
|
2643
|
+
gs["best_estimator_"] = best_model_name # Store the trained model, not just the name
|
2644
|
+
gs["best_params_"] = best_model_name # Hyperparameters
|
2645
|
+
# Make predictions if x_true is provided
|
2646
|
+
if x_true is not None:
|
2647
|
+
x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
|
2648
|
+
gs["predictions"] = best_clf.predict(x_true[features],model=None)# model=None select the best
|
2649
|
+
gs["predict_proba"] = best_clf.predict_proba(x_true[features]) if purpose=='classification' else None
|
2650
|
+
x_true[label]=gs["predictions"]
|
2651
|
+
if gs["predictions"].value_counts().shape[0]>1:
|
2652
|
+
gs['evaluate'] = best_clf.evaluate(x_true[features+[label]])
|
2653
|
+
gs["models"]=leaderboard["model"].tolist()#best_clf.model_names()
|
2654
|
+
all_models = gs["models"]
|
2655
|
+
model_evaluations = {}
|
2656
|
+
for model in all_models:
|
2657
|
+
predictions = best_clf.predict(x_true[features], model=model)
|
2658
|
+
evaluation = best_clf.evaluate_predictions(
|
2659
|
+
y_true=x_true[label], # True labels
|
2660
|
+
y_pred=predictions, # Predictions from the specific model
|
2661
|
+
auxiliary_metrics=True, # Include additional metrics if needed
|
2662
|
+
)
|
2663
|
+
model_evaluations[model] = evaluation
|
2664
|
+
gs["scores"]=pd.DataFrame.from_dict(model_evaluations, orient='index')
|
2665
|
+
#! 试着保持一样的格式
|
2666
|
+
results = {}
|
2667
|
+
for model in all_models:
|
2668
|
+
y_pred = best_clf.predict(x_true[features], model=model).tolist()
|
2669
|
+
y_pred_proba=best_clf.predict_proba(x_true[features], model=model) if purpose=='classification' else None
|
2670
|
+
|
2671
|
+
if isinstance(y_pred_proba, pd.DataFrame):
|
2672
|
+
y_pred_proba=y_pred_proba.iloc[:,1]
|
2673
|
+
|
2674
|
+
# try to make predict format consistant
|
2675
|
+
try:
|
2676
|
+
y_pred= [i[0] for i in y_pred]
|
2677
|
+
except:
|
2678
|
+
pass
|
2679
|
+
try:
|
2680
|
+
y_true= [i[0] for i in y_true]
|
2681
|
+
except:
|
2682
|
+
pass
|
2683
|
+
try:
|
2684
|
+
y_train= [i[0] for i in y_train]
|
2685
|
+
except:
|
2686
|
+
pass
|
2687
|
+
validation_scores = {}
|
2688
|
+
if y_true is not None and y_pred_proba is not None:
|
2689
|
+
validation_scores = cal_metrics(
|
2690
|
+
y_true,
|
2691
|
+
y_pred,
|
2692
|
+
y_pred_proba=y_pred_proba,
|
2693
|
+
is_binary=is_binary,
|
2694
|
+
purpose=purpose,
|
2695
|
+
average="weighted",
|
2696
|
+
)
|
2697
|
+
if is_binary:
|
2698
|
+
# Calculate ROC curve
|
2699
|
+
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
|
2700
|
+
if y_pred_proba is not None:
|
2701
|
+
# fpr, tpr, roc_auc = dict(), dict(), dict()
|
2702
|
+
fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
|
2703
|
+
lower_ci, upper_ci = cal_auc_ci(
|
2704
|
+
y_true, y_pred_proba, verbose=False, is_binary=is_binary
|
2705
|
+
)
|
2706
|
+
roc_auc = auc(fpr, tpr)
|
2707
|
+
roc_info = {
|
2708
|
+
"fpr": fpr.tolist(),
|
2709
|
+
"tpr": tpr.tolist(),
|
2710
|
+
"auc": roc_auc,
|
2711
|
+
"ci95": (lower_ci, upper_ci),
|
2712
|
+
}
|
2713
|
+
# precision-recall curve
|
2714
|
+
precision_, recall_, _ = cal_precision_recall(y_true, y_pred_proba)
|
2715
|
+
avg_precision_ = average_precision_score(y_true, y_pred_proba)
|
2716
|
+
pr_info = {
|
2717
|
+
"precision": precision_,
|
2718
|
+
"recall": recall_,
|
2719
|
+
"avg_precision": avg_precision_,
|
2720
|
+
}
|
2721
|
+
else:
|
2722
|
+
roc_info, pr_info = None, None
|
2723
|
+
if purpose == "classification":
|
2724
|
+
results[model] = {
|
2725
|
+
# "best_clf": gs.best_estimator_,
|
2726
|
+
# "best_params": gs.best_params_,
|
2727
|
+
# "auc_indiv": [
|
2728
|
+
# gs.cv_results_[f"split{i}_test_score"][gs.best_index_]
|
2729
|
+
# for i in range(cv_folds)
|
2730
|
+
# ],
|
2731
|
+
"scores": validation_scores,
|
2732
|
+
"roc_curve": roc_info,
|
2733
|
+
"pr_curve": pr_info,
|
2734
|
+
"confusion_matrix": confusion_matrix(y_true, y_pred),
|
2735
|
+
"predictions": y_pred,#.tolist(),
|
2736
|
+
"predictions_proba": (
|
2737
|
+
y_pred_proba.tolist() if y_pred_proba is not None else None
|
2738
|
+
),
|
2739
|
+
"features":features,
|
2740
|
+
# "coef":coef_,
|
2741
|
+
# "alphas":alphas_
|
2742
|
+
}
|
2743
|
+
else: # "regression"
|
2744
|
+
results[model] = {
|
2745
|
+
# "best_clf": gs.best_estimator_,
|
2746
|
+
# "best_params": gs.best_params_,
|
2747
|
+
"scores": validation_scores, # e.g., neg_MSE, R², etc.
|
2748
|
+
"predictions": y_pred,#.tolist(),
|
2749
|
+
"predictions_proba": (
|
2750
|
+
y_pred_proba.tolist() if y_pred_proba is not None else None
|
2751
|
+
),
|
2752
|
+
"features":features,
|
2753
|
+
# "coef":coef_,
|
2754
|
+
# "alphas":alphas_
|
2755
|
+
}
|
2756
|
+
else: # multi-classes
|
2757
|
+
if y_pred_proba is not None:
|
2758
|
+
# fpr, tpr, roc_auc = dict(), dict(), dict()
|
2759
|
+
# fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
|
2760
|
+
confidence_intervals = cal_auc_ci(
|
2761
|
+
y_true, y_pred_proba, verbose=False, is_binary=is_binary
|
2762
|
+
)
|
2763
|
+
roc_info = {
|
2764
|
+
"fpr": validation_scores["fpr"],
|
2765
|
+
"tpr": validation_scores["tpr"],
|
2766
|
+
"auc": validation_scores["roc_auc_by_class"],
|
2767
|
+
"ci95": confidence_intervals,
|
2768
|
+
}
|
2769
|
+
# precision-recall curve
|
2770
|
+
precision_, recall_, avg_precision_ = cal_precision_recall(
|
2771
|
+
y_true, y_pred_proba, is_binary=is_binary
|
2772
|
+
)
|
2773
|
+
pr_info = {
|
2774
|
+
"precision": precision_,
|
2775
|
+
"recall": recall_,
|
2776
|
+
"avg_precision": avg_precision_,
|
2777
|
+
}
|
2778
|
+
else:
|
2779
|
+
roc_info, pr_info = None, None
|
2780
|
+
|
2781
|
+
if purpose == "classification":
|
2782
|
+
results[model] = {
|
2783
|
+
# "best_clf": gs.best_estimator_,
|
2784
|
+
# "best_params": gs.best_params_,
|
2785
|
+
# "auc_indiv": [
|
2786
|
+
# gs.cv_results_[f"split{i}_test_score"][gs.best_index_]
|
2787
|
+
# for i in range(cv_folds)
|
2788
|
+
# ],
|
2789
|
+
"scores": validation_scores,
|
2790
|
+
"roc_curve": roc_info,
|
2791
|
+
"pr_curve": pr_info,
|
2792
|
+
"confusion_matrix": confusion_matrix(y_true, y_pred),
|
2793
|
+
"predictions": y_pred,#.tolist(),
|
2794
|
+
"predictions_proba": (
|
2795
|
+
y_pred_proba.tolist() if y_pred_proba is not None else None
|
2796
|
+
),
|
2797
|
+
"features":features,
|
2798
|
+
# "coef":coef_,
|
2799
|
+
# "alphas":alphas_
|
2800
|
+
}
|
2801
|
+
else: # "regression"
|
2802
|
+
results[model] = {
|
2803
|
+
# "best_clf": gs.best_estimator_,
|
2804
|
+
# "best_params": gs.best_params_,
|
2805
|
+
"scores": validation_scores, # e.g., neg_MSE, R², etc.
|
2806
|
+
"predictions": y_pred,#.tolist(),
|
2807
|
+
"predictions_proba": (
|
2808
|
+
y_pred_proba.tolist() if y_pred_proba is not None else None
|
2809
|
+
),
|
2810
|
+
"features":features,
|
2811
|
+
# "coef":coef_,
|
2812
|
+
# "alphas":alphas_
|
2813
|
+
}
|
2814
|
+
|
2815
|
+
else:
|
2816
|
+
if y_true is None:
|
2817
|
+
validation_scores = []
|
2818
|
+
else:
|
2819
|
+
validation_scores = cal_metrics(
|
2820
|
+
y_true,
|
2821
|
+
y_pred,
|
2822
|
+
y_pred_proba=y_pred_proba,
|
2823
|
+
is_binary=is_binary,
|
2824
|
+
purpose=purpose,
|
2825
|
+
average="weighted",
|
2826
|
+
)
|
2827
|
+
results[model] = {
|
2828
|
+
# "best_clf": gs.best_estimator_,
|
2829
|
+
# "best_params": gs.best_params_,
|
2830
|
+
"scores": validation_scores,
|
2831
|
+
"predictions": y_pred,#.tolist(),
|
2832
|
+
"predictions_proba": (
|
2833
|
+
y_pred_proba.tolist() if y_pred_proba is not None else None
|
2834
|
+
),
|
2835
|
+
"features":features,
|
2836
|
+
"y_train": y_train if y_train is not None else [],
|
2837
|
+
"y_true": y_true if y_true is not None else [],
|
2838
|
+
# "coef":coef_,
|
2839
|
+
# "alphas":alphas_
|
2530
2840
|
}
|
2841
|
+
df_results = pd.DataFrame.from_dict(results, orient="index")
|
2842
|
+
gs['res']=df_results
|
2843
|
+
|
2844
|
+
if all([plot_, y_true is not None, purpose == "classification"]):
|
2845
|
+
from datetime import datetime
|
2846
|
+
|
2847
|
+
now_ = datetime.now().strftime("%y%m%d_%H%M%S")
|
2848
|
+
# try:
|
2849
|
+
if df_results.shape[0] > 3:
|
2850
|
+
try:
|
2851
|
+
plot_validate_features(df_results, is_binary=is_binary)
|
2852
|
+
except Exception as e:
|
2853
|
+
print(e)
|
2854
|
+
else:
|
2855
|
+
try:
|
2856
|
+
plot_validate_features_single(df_results, is_binary=is_binary)
|
2857
|
+
except Exception as e:
|
2858
|
+
print(e)
|
2859
|
+
if dir_save:
|
2860
|
+
ips.figsave(dir_save + f"validate_features{now_}.pdf")
|
2861
|
+
return gs
|
2862
|
+
|
2863
|
+
#! cross_valid
|
2531
2864
|
if cv_level in ["low", "simple", "s", "l"]:
|
2532
2865
|
param_grids = {
|
2533
2866
|
"Random Forest": (
|
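The `autogluon_tab` branch above delegates the whole model search to AutoGluon's TabularPredictor and then rebuilds a per-model results table from the leaderboard. A minimal standalone sketch of that API, assuming AutoGluon is installed; the CSV paths and "target" label here are placeholders:

import pandas as pd
from autogluon.tabular import TabularPredictor

# train_df must contain the label column; every other column is a feature.
train_df = pd.read_csv("train.csv")   # placeholder path
test_df = pd.read_csv("test.csv")     # placeholder path

predictor = TabularPredictor(label="target", path="model_autogluon").fit(
    train_data=train_df,
    presets="best_quality",  # the diff maps user input onto its own preset list
    time_limit=600,          # seconds of training budget
)

print(predictor.leaderboard())            # ranked models, stored as gs["leaderboard"] above
pred = predictor.predict(test_df)         # model=None (default) uses the best model
proba = predictor.predict_proba(test_df)  # classification only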
@@ -2696,7 +3029,73 @@ def predict(
                 'alpha': [0.1],
                 'max_iter': [100],},
             "Poisson":{'alpha': [0.1],
-                'max_iter': [100],}
+                'max_iter': [100],},
+            "Lars": {"n_nonzero_coefs": [10, 50, None]},
+            "LassoLars": {
+                "alpha": [0.01, 0.1, 1]
+            },
+            "BayesianRidge": {
+                "alpha_1": [1e-6, 1e-4, 1e-2],
+                "lambda_1": [1e-6, 1e-4, 1e-2]
+            },
+            "GammaRegressor": {
+                "alpha": [0.1, 1, 10]
+            },
+            "TweedieRegressor": {
+                "alpha": [0.1, 1, 10],
+                "power": [1, 1.5, 2]
+            },
+            "LassoCV": {
+                "cv": [5]
+            },
+            "ElasticNetCV": {
+                "l1_ratio": [0.2, 0.5, 0.8],
+                "cv": [5]
+            },
+            "LassoLarsCV": {
+                "cv": [5]
+            },
+            "LarsCV": {
+                "cv": [5]
+            },
+            "OrthogonalMatchingPursuit": {
+                "n_nonzero_coefs": [10, 50, None]
+            },
+            "OrthogonalMatchingPursuitCV": {
+                "cv": [5]
+            },
+            "PassiveAggressiveRegressor": {
+                "C": [0.1, 1, 10]
+            },
+            "LinearSVR": {
+                "C": [0.1, 1, 10]
+            },
+            "NuSVR": {
+                "C": [0.1, 1, 10]
+            },
+            "DecisionTreeRegressor": {
+                "max_depth": [5, 10, None]
+            },
+            "ExtraTreeRegressor": {
+                "max_depth": [5, 10, None]
+            },
+            "HistGradientBoostingRegressor": {
+                "learning_rate": [0.05, 0.1, 0.2],
+                "max_depth": [5, 10, None]
+            },
+            "GaussianProcessRegressor": {
+                "alpha": [1e-5, 1e-2, 0.1]
+            },
+            "KernelRidge": {
+                "alpha": [0.1, 1, 10],
+                "kernel": ["linear", "rbf"]
+            },
+            "DummyRegressor": {
+                "strategy": ["mean", "median"]
+            },
+            "TransformedTargetRegressor": {
+                "regressor__fit_intercept": [True, False]
+            }
         }
     elif cv_level in ["high", "advanced", "h"]:
         param_grids = {
@@ -2901,7 +3300,96 @@ def predict(
                 'alpha': [0.1, 1.0, 10.0],
                 'max_iter': [100, 200, 300],},
             "Poisson":{'alpha': [0.1, 1.0, 10.0],
-                'max_iter': [100, 200, 300],}
+                'max_iter': [100, 200, 300],},
+            "Lars": {
+                "n_nonzero_coefs": [10, 50, 100, 200, None]
+            },
+            "LassoLars": {
+                "alpha": [0.001, 0.01, 0.1, 1, 10]
+            },
+            "BayesianRidge": {
+                "alpha_1": [1e-6, 1e-5, 1e-4],
+                "alpha_2": [1e-6, 1e-5, 1e-4],
+                "lambda_1": [1e-6, 1e-5, 1e-4],
+                "lambda_2": [1e-6, 1e-5, 1e-4]
+            },
+            "GammaRegressor": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "max_iter": [1000, 5000, 10000]
+            },
+            "TweedieRegressor": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "power": [0, 1, 1.5, 2, 3]
+            },
+            "LassoCV": {
+                "alphas": [[0.001, 0.01, 0.1, 1, 10]],
+                "cv": [3, 5, 10]
+            },
+            "ElasticNetCV": {
+                "l1_ratio": [0.1, 0.5, 0.7, 0.9, 1],
+                "alphas": [[0.001, 0.01, 0.1, 1, 10]],
+                "cv": [3, 5, 10]
+            },
+            "LassoLarsCV": {
+                "cv": [3, 5, 10]
+            },
+            "LarsCV": {
+                "cv": [3, 5, 10]
+            },
+            "OrthogonalMatchingPursuit": {
+                "n_nonzero_coefs": [10, 50, 100, 200, None]
+            },
+            "OrthogonalMatchingPursuitCV": {
+                "cv": [3, 5, 10]
+            },
+            "PassiveAggressiveRegressor": {
+                "C": [0.01, 0.1, 1, 10],
+                "max_iter": [1000, 5000, 10000],
+                "early_stopping": [True, False]
+            },
+            "LinearSVR": {
+                "C": [0.01, 0.1, 1, 10],
+                "epsilon": [0.01, 0.1, 1],
+                "max_iter": [1000, 5000, 10000]
+            },
+            "NuSVR": {
+                "C": [0.01, 0.1, 1, 10],
+                "nu": [0.25, 0.5, 0.75],
+                "kernel": ["linear", "poly", "rbf", "sigmoid"]
+            },
+            "DecisionTreeRegressor": {
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_split": [2, 5, 10],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "ExtraTreeRegressor": {
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_split": [2, 5, 10],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "HistGradientBoostingRegressor": {
+                "learning_rate": [0.01, 0.1, 0.2],
+                "max_iter": [100, 500, 1000],
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "GaussianProcessRegressor": {
+                "alpha": [1e-10, 1e-5, 1e-2, 0.1],
+                "n_restarts_optimizer": [0, 1, 5, 10]
+            },
+            "KernelRidge": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "kernel": ["linear", "poly", "rbf", "sigmoid"],
+                "degree": [2, 3, 4]
+            },
+            "DummyRegressor": {
+                "strategy": ["mean", "median", "constant"],
+                "constant": [0]  # Only if strategy is 'constant'
+            },
+            "TransformedTargetRegressor": {
+                # Grid for the underlying regressor, example shown for LinearRegression
+                "regressor__fit_intercept": [True, False]
+            }
         }
     else:  # median level
         param_grids = {
@@ -3148,7 +3636,96 @@ def predict(
                 'alpha': [0.1, 1.0],
                 'max_iter': [100, 200],},
             "Poisson":{'alpha': [0.1, 1.0],
-                'max_iter': [100, 200],}
+                'max_iter': [100, 200],},
+            "Lars": {
+                "n_nonzero_coefs": [10, 50, 100, 200, None]
+            },
+            "LassoLars": {
+                "alpha": [0.001, 0.01, 0.1, 1, 10]
+            },
+            "BayesianRidge": {
+                "alpha_1": [1e-6, 1e-5, 1e-4],
+                "alpha_2": [1e-6, 1e-5, 1e-4],
+                "lambda_1": [1e-6, 1e-5, 1e-4],
+                "lambda_2": [1e-6, 1e-5, 1e-4]
+            },
+            "GammaRegressor": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "max_iter": [1000, 5000, 10000]
+            },
+            "TweedieRegressor": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "power": [0, 1, 1.5, 2, 3]
+            },
+            "LassoCV": {
+                "alphas": [[0.001, 0.01, 0.1, 1, 10]],
+                "cv": [3, 5, 10]
+            },
+            "ElasticNetCV": {
+                "l1_ratio": [0.1, 0.5, 0.7, 0.9, 1],
+                "alphas": [[0.001, 0.01, 0.1, 1, 10]],
+                "cv": [3, 5, 10]
+            },
+            "LassoLarsCV": {
+                "cv": [3, 5, 10]
+            },
+            "LarsCV": {
+                "cv": [3, 5, 10]
+            },
+            "OrthogonalMatchingPursuit": {
+                "n_nonzero_coefs": [10, 50, 100, 200, None]
+            },
+            "OrthogonalMatchingPursuitCV": {
+                "cv": [3, 5, 10]
+            },
+            "PassiveAggressiveRegressor": {
+                "C": [0.01, 0.1, 1, 10],
+                "max_iter": [1000, 5000, 10000],
+                "early_stopping": [True, False]
+            },
+            "LinearSVR": {
+                "C": [0.01, 0.1, 1, 10],
+                "epsilon": [0.01, 0.1, 1],
+                "max_iter": [1000, 5000, 10000]
+            },
+            "NuSVR": {
+                "C": [0.01, 0.1, 1, 10],
+                "nu": [0.25, 0.5, 0.75],
+                "kernel": ["linear", "poly", "rbf", "sigmoid"]
+            },
+            "DecisionTreeRegressor": {
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_split": [2, 5, 10],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "ExtraTreeRegressor": {
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_split": [2, 5, 10],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "HistGradientBoostingRegressor": {
+                "learning_rate": [0.01, 0.1, 0.2],
+                "max_iter": [100, 500, 1000],
+                "max_depth": [None, 5, 10, 20],
+                "min_samples_leaf": [1, 2, 4]
+            },
+            "GaussianProcessRegressor": {
+                "alpha": [1e-10, 1e-5, 1e-2, 0.1],
+                "n_restarts_optimizer": [0, 1, 5, 10]
+            },
+            "KernelRidge": {
+                "alpha": [0.01, 0.1, 1, 10],
+                "kernel": ["linear", "poly", "rbf", "sigmoid"],
+                "degree": [2, 3, 4]
+            },
+            "DummyRegressor": {
+                "strategy": ["mean", "median", "constant"],
+                "constant": [0]  # Only if strategy is 'constant'
+            },
+            "TransformedTargetRegressor": {
+                # Grid for the underlying regressor, example shown for LinearRegression
+                "regressor__fit_intercept": [True, False]
+            }
         }
 
     results = {}
@@ -3158,7 +3735,7 @@ def predict(
         if purpose == "classification"
         else KFold(n_splits=cv_folds, shuffle=True, random_state=random_state)
     )
-
+
     # Train and validate each model
     for name, clf in tqdm(
         models.items(),
@@ -3168,83 +3745,132 @@ def predict(
     ):
         if verbose:
             print(f"\nTraining and validating {name}:")
-
-
-
-
-
-
-
-
-
-
-
-
-
-            verbose=verbose,
-        )
+        try:
+            if is_binary:
+                gs = GridSearchCV(
+                    clf,
+                    param_grid=param_grids.get(name, {}),
+                    scoring=(
+                        "roc_auc"
+                        if purpose == "classification"
+                        else "neg_mean_squared_error"
+                    ),
+                    cv=cv,
+                    n_jobs=n_jobs,
+                    verbose=verbose,
                 )
-        else:
-            y_pred_proba = None  # No probability output for certain models
-        else:
-            gs = GridSearchCV(
-                clf,
-                param_grid=param_grids.get(name, {}),
-                scoring=(
-                    "roc_auc_ovr"
-                    if purpose == "classification"
-                    else "neg_mean_squared_error"
-                ),
-                cv=cv,
-                n_jobs=n_jobs,
-                verbose=verbose,
-            )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            y_pred_proba
-
+                gs.fit(x_train, y_train)
+                best_clf = gs.best_estimator_
+
+                # make sure x_train and x_test has the same name
+                x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
+                y_pred = best_clf.predict(x_true)
+                if hasattr(best_clf, "predict_proba"):
+                    y_pred_proba = best_clf.predict_proba(x_true)
+                    print("Shape of predicted probabilities:", y_pred_proba.shape)
+                    if y_pred_proba.shape[1] == 1:
+                        y_pred_proba = np.hstack(
+                            [1 - y_pred_proba, y_pred_proba]
+                        )  # Add missing class probabilities
+                    if y_pred_proba.shape[1] == 2:
+                        if isinstance(y_pred_proba, pd.DataFrame):
+                            y_pred_proba = y_pred_proba.iloc[:, 1]
+                        elif isinstance(y_pred_proba, pd.Series):
+                            y_pred_proba = y_pred_proba.values[:, 1]
+                        else:
+                            y_pred_proba = y_pred_proba[:, 1]
+
+                elif hasattr(best_clf, "decision_function"):
+                    # If predict_proba is not available, use decision_function (e.g., for SVM)
+                    y_pred_proba = best_clf.decision_function(x_true)
+                    # Ensure y_pred_proba is within 0 and 1 bounds
+                    y_pred_proba = (y_pred_proba - y_pred_proba.min()) / (
+                        y_pred_proba.max() - y_pred_proba.min()
                     )
-
-
+                else:
+                    y_pred_proba = None  # No probability output for certain models
+                # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+                if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                    if hasattr(best_clf, "alphas_"):
+                        alphas_ = best_clf.alphas_
+                    elif hasattr(best_clf, "alpha_"):
+                        alphas_ = best_clf.alpha_
+                    elif hasattr(best_clf, "Cs_"):
+                        alphas_ = best_clf.Cs_
+                else:
+                    alphas_= None
+                coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
+            else:
+                gs = GridSearchCV(
+                    clf,
+                    param_grid=param_grids.get(name, {}),
+                    scoring=(
+                        "roc_auc_ovr"
+                        if purpose == "classification"
+                        else "neg_mean_squared_error"
+                    ),
+                    cv=cv,
+                    n_jobs=n_jobs,
+                    verbose=verbose,
+                )
 
+                # Fit GridSearchCV
+                gs.fit(x_train, y_train)
+                best_clf = gs.best_estimator_
+
+                # Ensure x_true aligns with x_train columns
+                x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
+
+                # do i need to fit the x_train, y_train again?
+                best_clf=best_clf.fit(x_train, y_train)
+                y_pred = best_clf.predict(x_true)
+
+                # Handle prediction probabilities for multiclass
+                if hasattr(best_clf, "predict_proba"):
+                    y_pred_proba = best_clf.predict_proba(x_true)
+                elif hasattr(best_clf, "decision_function"):
+                    y_pred_proba = best_clf.decision_function(x_true)
+
+                    # Normalize for multiclass if necessary
+                    if y_pred_proba.ndim == 2:
+                        y_pred_proba = (
+                            y_pred_proba - y_pred_proba.min(axis=1, keepdims=True)
+                        ) / (
+                            y_pred_proba.max(axis=1, keepdims=True)
+                            - y_pred_proba.min(axis=1, keepdims=True)
+                        )
+                else:
+                    y_pred_proba = None  # No probability output for certain models
+                # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+                if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                    if hasattr(best_clf, "alphas_"):
+                        alphas_ = best_clf.alphas_
+                    elif hasattr(best_clf, "alpha_"):
+                        alphas_ = best_clf.alpha_
+                    elif hasattr(best_clf, "Cs_"):
+                        alphas_ = best_clf.Cs_
+                else:
+                    alphas_= None
+                coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
+        except Exception as e:
+            alphas_,coef_ = None,None
+            print(f"skiped {clf}: {e}")
+            continue
+
+        # try to make predict format consistant
+        try:
+            y_pred= [i[0] for i in y_pred]
+        except:
+            pass
+        try:
+            y_true= [i[0] for i in y_true]
+        except:
+            pass
+        try:
+            y_train= [i[0] for i in y_train]
+        except:
+            pass
         validation_scores = {}
 
         if y_true is not None and y_pred_proba is not None:
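Each model is now tuned inside a try/except so a failing estimator is skipped instead of aborting the whole loop, and probability scores fall back to a min-max-scaled `decision_function` when `predict_proba` is missing. A condensed sketch of that pattern, using scikit-learn only:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.svm import SVC

X, y = make_classification(n_samples=200, n_features=10, random_state=1)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

models = {"SVM": (SVC(), {"C": [0.1, 1, 10]})}
for name, (clf, grid) in models.items():
    try:
        gs = GridSearchCV(clf, param_grid=grid, scoring="roc_auc", cv=cv, n_jobs=-1)
        gs.fit(X, y)
        best = gs.best_estimator_
        if hasattr(best, "predict_proba"):
            scores = best.predict_proba(X)[:, 1]
        else:
            # SVC without probability=True: rescale the margin to [0, 1]
            d = best.decision_function(X)
            scores = (d - d.min()) / (d.max() - d.min())
        print(name, gs.best_params_, scores[:3].round(3))
    except Exception as e:
        print(f"skipped {name}: {e}")
        continue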
@@ -3294,20 +3920,26 @@ def predict(
                     "roc_curve": roc_info,
                     "pr_curve": pr_info,
                     "confusion_matrix": confusion_matrix(y_true, y_pred),
-                    "predictions": y_pred
+                    "predictions": y_pred,#.tolist(),
                     "predictions_proba": (
                         y_pred_proba.tolist() if y_pred_proba is not None else None
                     ),
+                    "features":share_col_names,
+                    "coef":coef_,
+                    "alphas":alphas_
                 }
             else:  # "regression"
                 results[name] = {
                     "best_clf": gs.best_estimator_,
                     "best_params": gs.best_params_,
                     "scores": validation_scores,  # e.g., neg_MSE, R², etc.
-                    "predictions": y_pred
+                    "predictions": y_pred,#.tolist(),
                     "predictions_proba": (
                         y_pred_proba.tolist() if y_pred_proba is not None else None
                     ),
+                    "features":share_col_names,
+                    "coef":coef_,
+                    "alphas":alphas_
                 }
         else:  # multi-classes
             if y_pred_proba is not None:
@@ -3346,20 +3978,26 @@ def predict(
                     "roc_curve": roc_info,
                     "pr_curve": pr_info,
                     "confusion_matrix": confusion_matrix(y_true, y_pred),
-                    "predictions": y_pred
+                    "predictions": y_pred,#.tolist(),
                     "predictions_proba": (
                         y_pred_proba.tolist() if y_pred_proba is not None else None
                     ),
+                    "features":share_col_names,
+                    "coef":coef_,
+                    "alphas":alphas_
                 }
             else:  # "regression"
                 results[name] = {
                     "best_clf": gs.best_estimator_,
                     "best_params": gs.best_params_,
                     "scores": validation_scores,  # e.g., neg_MSE, R², etc.
-                    "predictions": y_pred
+                    "predictions": y_pred,#.tolist(),
                     "predictions_proba": (
                         y_pred_proba.tolist() if y_pred_proba is not None else None
                     ),
+                    "features":share_col_names,
+                    "coef":coef_,
+                    "alphas":alphas_
                 }
 
     else:
@@ -3378,17 +4016,21 @@ def predict(
                 "best_clf": gs.best_estimator_,
                 "best_params": gs.best_params_,
                 "scores": validation_scores,
-                "predictions": y_pred
+                "predictions": y_pred,  #.tolist(),
                 "predictions_proba": (
                     y_pred_proba.tolist() if y_pred_proba is not None else None
                 ),
+                "features": share_col_names,
                 "y_train": y_train if y_train is not None else [],
                 "y_true": y_true if y_true is not None else [],
+                "coef": coef_,
+                "alphas": alphas_
             }
 
     # Convert results to DataFrame
     df_results = pd.DataFrame.from_dict(results, orient="index")
-
+    display(df_results)
+    # sort
     if y_true is not None:
         if purpose == "classification":
             df_scores = pd.DataFrame(
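One caveat with the newly added `display(df_results)`: `display` is a builtin only inside IPython/Jupyter sessions, so a plain script run would raise `NameError`. A guarded import, as a sketch (the plain-print fallback is an assumption, not py2ls behavior):

try:
    from IPython.display import display  # rich table rendering in notebooks
except ImportError:
    display = print  # fallback outside IPython: degrade to plain printing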
@@ -3446,7 +4088,7 @@ def predict(
         for i, j in top_models.to_dict().items():
             base_estimators.append((i, j))
         if stacking_cv:
-            print(f"
+            print(f"⤵ stacking_cv is processing...")
             #* define a few symbolic candidates for the final_estimator
             # several alternatives to choose from
             if purpose == "classification":
@@ -3520,7 +4162,7 @@ def predict(
             best_final_estimator = cv_results_df.iloc[0]['final_estimator']
             print(f"Best final estimator based on cross-validation: {best_final_estimator}")
         else:
-            print(f"
+            print(f"⤵ trying to find the best_final_estimator for stacking...")
             if purpose == "classification":
                 best_final_estimator = LogisticRegression(class_weight=class_weight,
                                                           random_state=random_state,
@@ -3530,26 +4172,25 @@ def predict(
         print(f"⤵ the best best_final_estimator: {best_final_estimator}")
         #! apply stacking
         if purpose == "classification":
-            print(f"
+            print(f"⤵ StackingClassifier...")
             stacking_model = StackingClassifier(estimators=base_estimators,
                                                 final_estimator=best_final_estimator,
                                                 cv=cv)
         else:
-            print(f"
+            print(f"⤵ StackingRegressor...")
             stacking_model = StackingRegressor(estimators=base_estimators,
                                                final_estimator=best_final_estimator,
                                                cv=cv)
 
         # Train the Stacking Classifier
-        print(f"
+        print(f"⤵ fit & predict...")
         stacking_model.fit(x_train, y_train)
         y_pred_final = stacking_model.predict(x_true)
-        print(f"
+        print(f"⤵ collecting results...")
         # pred_proba
         if is_binary:
             if hasattr(stacking_model, "predict_proba"):
                 y_pred_proba_final = stacking_model.predict_proba(x_true)
-                print("Shape of predicted probabilities:", y_pred_proba_final.shape)
                 if y_pred_proba_final.shape[1] == 1:
                     y_pred_proba_final = np.hstack(
                         [1 - y_pred_proba_final, y_pred_proba_final]
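The stacking path is the standard scikit-learn pattern: the top grid-search winners become base estimators and a simple meta-model combines their out-of-fold predictions. A self-contained sketch with placeholder data and estimators (not the py2ls defaults), including the same single-column `predict_proba` fix used above:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=200, random_state=1)
base_estimators = [("dt", DecisionTreeClassifier(random_state=1)),
                   ("knn", KNeighborsClassifier())]
stacking_model = StackingClassifier(estimators=base_estimators,
                                    final_estimator=LogisticRegression(),
                                    cv=5)
stacking_model.fit(X, y)
proba = stacking_model.predict_proba(X)
if proba.shape[1] == 1:  # defensive: pad to two columns, mirroring the hunk above
    proba = np.hstack([1 - proba, proba])
print(proba.shape)  # (200, 2)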
@@ -3564,6 +4205,17 @@ def predict(
                     )
             else:
                 y_pred_proba_final = None  # No probability output for certain models
+            # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+            if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                if hasattr(best_clf, "alphas_"):
+                    alphas_ = best_clf.alphas_
+                elif hasattr(best_clf, "alpha_"):
+                    alphas_ = best_clf.alpha_
+                elif hasattr(best_clf, "Cs_"):
+                    alphas_ = best_clf.Cs_
+            else:
+                alphas_ = None
+            coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
         if not is_binary:
             # Handle prediction probabilities for multiclass
             if hasattr(stacking_model, "predict_proba"):
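This attribute probing for `alphas_`, `alpha_`, and `Cs_` now recurs verbatim in the stacking and voting branches, and it still inspects `best_clf` (the last grid-search winner) rather than the ensemble model itself. Factored into a helper, the same pattern might look like this sketch (the function name is illustrative):

def get_regularization_info(est):
    """Return (alphas, coef) if the estimator exposes them, else (None, None).

    `alphas_` covers LassoCV/ElasticNetCV paths, `alpha_` the selected value,
    and `Cs_` the LogisticRegressionCV grid; unlike the guarded version in the
    diff, this flat chain also catches estimators that expose only `alpha_`.
    """
    if hasattr(est, "alphas_"):
        alphas = est.alphas_
    elif hasattr(est, "alpha_"):
        alphas = est.alpha_
    elif hasattr(est, "Cs_"):
        alphas = est.Cs_
    else:
        alphas = None
    coef = est.coef_ if hasattr(est, "coef_") else None
    return alphas, coef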
@@ -3581,6 +4233,17 @@ def predict(
                     )
             else:
                 y_pred_proba_final = None  # No probability output for certain models
+            # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+            if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                if hasattr(best_clf, "alphas_"):
+                    alphas_ = best_clf.alphas_
+                elif hasattr(best_clf, "alpha_"):
+                    alphas_ = best_clf.alpha_
+                elif hasattr(best_clf, "Cs_"):
+                    alphas_ = best_clf.Cs_
+            else:
+                alphas_ = None
+            coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
         #! dict_pred_stack
         dict_pred_stack = {}
         validation_scores_final = {}
@@ -3631,6 +4294,9 @@ def predict(
                         "predictions_proba": (
                             y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
                         ),
+                        "features": share_col_names,
+                        "coef": coef_,
+                        "alphas": alphas_
                     }
                 else:  # "regression"
                     dict_pred_stack = {
@@ -3641,6 +4307,9 @@ def predict(
                         "predictions_proba": (
                             y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
                         ),
+                        "features": share_col_names,
+                        "coef": coef_,
+                        "alphas": alphas_
                     }
             else:  # multi-classes
                 if y_pred_proba_final is not None:
@@ -3680,6 +4349,9 @@ def predict(
                         "predictions_proba": (
                             y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
                         ),
+                        "features": share_col_names,
+                        "coef": coef_,
+                        "alphas": alphas_
                     }
                 else:  # "regression"
                     dict_pred_stack = {
@@ -3690,6 +4362,9 @@ def predict(
                         "predictions_proba": (
                             y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
                         ),
+                        "features": share_col_names,
+                        "coef": coef_,
+                        "alphas": alphas_
                     }
 
         else:
@@ -3712,8 +4387,11 @@ def predict(
                 "predictions_proba": (
                     y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
                 ),
+                "features": share_col_names,
                 "y_train": y_train if y_train is not None else [],
                 "y_true": y_true if y_true is not None else [],
+                "coef": coef_,
+                "alphas": alphas_
             }
         # merge together
         df_pred = pd.DataFrame(
@@ -3728,16 +4406,16 @@ def predict(
         # if dir_save:
         #     ips.figsave(dir_save + f"validate_features_stacking_{now_}.pdf")
     if vote:
-        print(f"
+        print(f"⤵ voting...")
         from sklearn.ensemble import VotingClassifier, VotingRegressor
-        #!
+        #! voting
         n_top_models = min(n_top_models, df_results.shape[0])
         base_estimators = []
         for name, cls in zip(list(df_results.iloc[:n_top_models, :].index), df_results.iloc[:n_top_models, :]["best_clf"].tolist()):
             base_estimators.append((name, cls))
         # Apply Voting Classifier/Regressor
         if purpose == "classification":
-            print(f"
+            print(f"⤵ VotingClassifier...via {voting}")
             if voting == 'hard':
                 # Hard voting does not support `predict_proba`
                 voting_model = VotingClassifier(estimators=base_estimators)
@@ -3745,7 +4423,7 @@ def predict(
                 # Soft voting supports `predict_proba`
                 voting_model = VotingClassifier(estimators=base_estimators, voting="soft")
         else:
-            print(f"
+            print(f"⤵ VotingRegressor...")
             voting_model = VotingRegressor(estimators=base_estimators)
 
         # Train the Voting Classifier/Regressor
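Hard voting tallies predicted labels while soft voting averages `predict_proba`, which is why the hard branch above builds the classifier without `voting="soft"` and yields no probabilities later. A minimal sketch of the difference (placeholder data and estimators, not the py2ls defaults):

from sklearn.datasets import make_classification
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=200, random_state=1)
estimators = [("lr", LogisticRegression(max_iter=1000)),
              ("dt", DecisionTreeClassifier(random_state=1))]

hard = VotingClassifier(estimators=estimators).fit(X, y)  # voting="hard" is the default
soft = VotingClassifier(estimators=estimators, voting="soft").fit(X, y)

print(hard.predict(X[:3]))        # majority vote over predicted labels
print(soft.predict_proba(X[:3]))  # averaged class probabilities
# hard.predict_proba(...) would raise AttributeError: unavailable for hard voting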
@@ -3770,10 +4448,23 @@ def predict(
                     y_pred_proba_vote = y_pred_proba_vote[:, 1]
             else:
                 y_pred_proba_vote = None
+
+            # Access alphas if applicable (e.g., ElasticNetCV, LassoCV)
+            if hasattr(best_clf, "alphas_") or hasattr(best_clf, "Cs_"):
+                if hasattr(best_clf, "alphas_"):
+                    alphas_ = best_clf.alphas_
+                elif hasattr(best_clf, "alpha_"):
+                    alphas_ = best_clf.alpha_
+                elif hasattr(best_clf, "Cs_"):
+                    alphas_ = best_clf.Cs_
+            else:
+                alphas_ = None
+            coef_ = best_clf.coef_ if hasattr(best_clf, "coef_") else None
         else:  # Regression
             y_pred_proba_vote = None
+            coef_, alphas_ = None, None
 
-        print(f"
+        print(f"⤵ collecting voting results...")
         #! dict_pred_vote
         dict_pred_vote = {}
         validation_scores_vote = {}
@@ -3822,6 +4513,9 @@ def predict(
                         "predictions_proba": (
                             y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
                         ),
+                        "features": share_col_names,
+                        "coef": coef_,
+                        "alphas": alphas_
                     }
             else:  # Multi-class
                 if y_pred_proba_vote is not None:
@@ -3856,6 +4550,9 @@ def predict(
                         "predictions_proba": (
                             y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
                         ),
+                        "features": share_col_names,
+                        "coef": coef_,
+                        "alphas": alphas_
                     }
         else:
            if y_true is None:
@@ -3877,6 +4574,7 @@ def predict(
                 "predictions_proba": (
                     y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
                 ),
+                "features": share_col_names,
                 "y_train": y_train if y_train is not None else [],
                 "y_true": y_true if y_true is not None else [],
             }
@@ -3900,6 +4598,8 @@ def predict(
         df_res = pd.concat([df_vote, df_results], ignore_index=False, axis=0)
     elif stack:
         df_res = pd.concat([df_pred, df_results], ignore_index=False, axis=0)
+    else:
+        df_res = df_results
 
     if all([plot_, y_true is not None, purpose == "classification"]):
         from datetime import datetime
@@ -3907,9 +4607,15 @@ def predict(
         now_ = datetime.now().strftime("%y%m%d_%H%M%S")
         # try:
         if df_res.shape[0] > 3:
-
+            try:
+                plot_validate_features(df_res, is_binary=is_binary)
+            except Exception as e:
+                print(e)
         else:
-
+            try:
+                plot_validate_features_single(df_res, is_binary=is_binary)
+            except Exception as e:
+                print(e)
         if dir_save:
             ips.figsave(dir_save + f"validate_features{now_}.pdf")
         # except Exception as e: