py2ls 0.2.4.22__py3-none-any.whl → 0.2.4.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/ml2ls.py CHANGED
@@ -5,7 +5,6 @@ from sklearn.ensemble import (
5
5
  BaggingClassifier,
6
6
  )
7
7
  from sklearn.svm import SVC, SVR
8
- from sklearn.calibration import CalibratedClassifierCV
9
8
  from sklearn.model_selection import GridSearchCV, StratifiedKFold
10
9
  from sklearn.linear_model import (
11
10
  LassoCV,
@@ -16,12 +15,7 @@ from sklearn.linear_model import (
16
15
  RidgeClassifierCV,
17
16
  ElasticNet,
18
17
  )
19
- from sklearn.feature_selection import RFE
20
- from sklearn.naive_bayes import GaussianNB
21
- from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
22
- import xgboost as xgb # Make sure you have xgboost installed
23
-
24
- from sklearn.model_selection import train_test_split, cross_val_score
18
+
25
19
  from sklearn.metrics import (
26
20
  accuracy_score,
27
21
  precision_score,
@@ -36,18 +30,12 @@ from sklearn.metrics import (
36
30
  precision_recall_curve,
37
31
  average_precision_score,
38
32
  )
39
- from imblearn.over_sampling import SMOTE
40
- from sklearn.pipeline import Pipeline
41
- from collections import defaultdict
42
- from sklearn.preprocessing import StandardScaler, OneHotEncoder
43
33
  from typing import Dict, Any, Optional, List, Union
44
34
  import numpy as np
45
35
  import pandas as pd
46
36
  from . import ips
47
37
  from . import plot
48
38
  import matplotlib.pyplot as plt
49
- import seaborn as sns
50
-
51
39
  plt.style.use(str(ips.get_cwd()) + "/data/styles/stylelib/paper.mplstyle")
52
40
  import logging
53
41
  import warnings
@@ -314,6 +302,8 @@ def features_svm(
314
302
  - Use case: It’s not as widely used as the RBF or linear kernel but can be explored when there is some evidence of non-linear
315
303
  S-shaped relationships.
316
304
  """
305
+ from sklearn.feature_selection import RFE
306
+ from sklearn.svm import SVC
317
307
  # SVM (Support Vector Machines)
318
308
  svc = SVC(kernel=rfe_params["kernel"]) # ["linear", "rbf", "poly", "sigmoid"]
319
309
  # RFE(Recursive Feature Elimination)
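For reference, a minimal self-contained sketch of the RFE-with-SVC pattern used above (toy data; the linear kernel is used here because RFE needs an estimator exposing coef_ or feature_importances_, and n_features_to_select is illustrative):

    from sklearn.datasets import make_classification
    from sklearn.feature_selection import RFE
    from sklearn.svm import SVC

    X, y = make_classification(n_samples=200, n_features=10, random_state=0)
    svc = SVC(kernel="linear")                     # linear kernel exposes coef_ for RFE
    rfe = RFE(estimator=svc, n_features_to_select=5)
    rfe.fit(X, y)
    print(rfe.support_)                            # boolean mask of retained features
    print(rfe.ranking_)                            # rank 1 marks the selected features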
@@ -450,6 +440,7 @@ def validate_classifier(
450
440
  Returns:
451
441
  - results: Dictionary containing average cv_train_scores and cv_test_scores.
452
442
  """
443
+ from sklearn.model_selection import cross_val_score
453
444
  cv_train_scores = {metric: [] for metric in metrics}
454
445
  skf = StratifiedKFold(n_splits=cv_folds)
455
446
  # Perform cross-validation
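For reference, the cross_val_score / StratifiedKFold combination used here reduces to the following minimal sketch (toy data; the estimator and scoring string are illustrative):

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import StratifiedKFold, cross_val_score

    X, y = make_classification(n_samples=200, n_features=8, random_state=0)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    clf = LogisticRegression(max_iter=1000)
    scores = cross_val_score(clf, X, y, cv=skf, scoring="roc_auc")  # one score per fold
    print(scores.mean(), scores.std())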
@@ -982,6 +973,8 @@ def validate_features(
982
973
 
983
974
  """
984
975
  from tqdm import tqdm
976
+ from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
977
+ from sklearn.calibration import CalibratedClassifierCV
985
978
 
986
979
  # Ensure common features are selected
987
980
  common_features = ips.shared(
@@ -1001,6 +994,7 @@ def validate_features(
1001
994
 
1002
995
  # Handle class imbalance using SMOTE
1003
996
  if smote:
997
+ from imblearn.over_sampling import SMOTE
1004
998
  if (
1005
999
  y_train.value_counts(normalize=True).max() < 0.8
1006
1000
  ): # Threshold to decide if data is imbalanced
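For reference, a standalone sketch of the SMOTE step (the 0.8 majority-class frequency check above decides whether it runs; the data here are synthetic):

    from collections import Counter
    import pandas as pd
    from imblearn.over_sampling import SMOTE
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=300, weights=[0.9, 0.1], random_state=0)
    y = pd.Series(y)
    print(y.value_counts(normalize=True).max())    # majority-class frequency, ~0.9 here
    X_res, y_res = SMOTE(random_state=1).fit_resample(X, y)
    print(Counter(y_res))                          # minority class oversampled to parity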
@@ -2096,7 +2090,116 @@ def rank_models(
2096
2090
  # )
2097
2091
 
2098
2092
  # figsave("classifier_performance.pdf")
2093
+ def rank_models_reg(df, ascending=False):
2094
+ """
2095
+ Sorts models based on MSE, RMSE, MAE, and R² with custom priority logic.
2099
2096
 
2097
+ Parameters:
2098
+ df (pd.DataFrame): DataFrame containing the regression metrics.
2099
+ ascending (bool): Whether to sort in ascending order of ranking score.
2100
+
2101
+ Returns:
2102
+ pd.DataFrame: Sorted DataFrame with an added "Ranking_Score" column.
2103
+ """
2104
+ # Define weights for the 4 metrics
2105
+ weights = {
2106
+ "mse": -1, # Lower is better
2107
+ "rmse": -1, # Lower is better
2108
+ "mae": -1, # Lower is better
2109
+ "r2": 1, # Higher is better
2110
+ }
2111
+
2112
+ # Normalize the selected metrics
2113
+ df = df.copy() # Work on a copy of the DataFrame
2114
+ for metric, weight in weights.items():
2115
+ if metric in df.columns:
2116
+ if weight > 0: # Higher is better; normalize 0-1
2117
+ df[metric + "_normalized"] = (df[metric] - df[metric].min()) / (
2118
+ df[metric].max() - df[metric].min()
2119
+ )
2120
+ else: # Lower is better; reverse normalize 0-1
2121
+ df[metric + "_normalized"] = (df[metric].max() - df[metric]) / (
2122
+ df[metric].max() - df[metric].min()
2123
+ )
2124
+
2125
+ # Calculate ranking score as a weighted sum
2126
+ df["Ranking_Score"] = sum(
2127
+ df[metric + "_normalized"] * abs(weights[metric])
2128
+ for metric in weights.keys()
2129
+ if metric + "_normalized" in df.columns
2130
+ )
2131
+
2132
+ # Sort models based on the ranking score
2133
+ sorted_df = df.sort_values(by="Ranking_Score", ascending=ascending)
2134
+ return sorted_df
2135
+
2136
+ models_support = {
2137
+ "classification": {
2138
+ "Random Forest": "Tree-Based",
2139
+ "SVM": "Kernel-Based",
2140
+ "Logistic Regression": "Linear",
2141
+ "Lasso Logistic Regression": "Linear",
2142
+ "Gradient Boosting": "Tree-Based",
2143
+ "XGBoost": "Tree-Based",
2144
+ "KNN": "Instance-Based",
2145
+ "Naive Bayes": "Probabilistic",
2146
+ "Linear Discriminant Analysis": "Linear",
2147
+ "AdaBoost": "Tree-Based",
2148
+ "CatBoost": "Tree-Based",
2149
+ "Extra Trees": "Tree-Based",
2150
+ "Bagging": "Tree-Based",
2151
+ "Neural Network": "Neural Network",
2152
+ "DecisionTree": "Tree-Based",
2153
+ "Quadratic Discriminant Analysis": "Probabilistic",
2154
+ "Ridge": "Linear",
2155
+ "Perceptron": "Linear",
2156
+ "Bernoulli Naive Bayes": "Probabilistic",
2157
+ "SGDClassifier": "Linear",
2158
+ },
2159
+ "regression": {
2160
+ "Linear Regression": "Linear",
2161
+ "Ridge": "Linear",
2162
+ "RidgeCV": "Linear",
2163
+ "TheilSenRegressor": "Linear",
2164
+ "HuberRegressor": "Linear",
2165
+ "PoissonRegressor": "Linear",
2166
+ "LassoCV": "Linear",
2167
+ "Bagging": "Tree-Based",
2168
+ "ElasticNet": "Linear",
2169
+ "Random Forest": "Tree-Based",
2170
+ "Gradient Boosting": "Tree-Based",
2171
+ "XGBoost": "Tree-Based",
2172
+ "CatBoost": "Tree-Based",
2173
+ "Extra Trees": "Tree-Based",
2174
+ "SVM": "Kernel-Based",
2175
+ "KNN": "Instance-Based",
2176
+ "Neural Network": "Neural Network",
2177
+ "AdaBoost": "Linear",
2178
+ },
2179
+ }
2180
+ def select_top_models(models, categories, n_top_models, n_models_per_category=1):
2181
+ """
2182
+ models = list_sort
2183
+ purpose = "regression"
2184
+ categories = models_support[purpose]
2185
+ n_top_models = 3
2186
+ select_top_models(models, categories, n_top_models)
2187
+ """
2188
+ selected = {}
2189
+ result = []
2190
+ for model in models:
2191
+ category = categories.get(model, "Unknown")
2192
+ if category not in selected:
2193
+ selected[category] = 0 # Initialize counter for the category
2194
+
2195
+ if selected[category] < n_models_per_category: # Allow additional models up to the limit
2196
+ selected[category] += 1
2197
+ result.append(model)
2198
+
2199
+ if len(result) == n_top_models: # Stop when the desired number of models is reached
2200
+ break
2201
+
2202
+ return result
2100
2203
 
2101
2204
  def predict(
2102
2205
  x_train: pd.DataFrame,
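For reference, a small sketch of how the two helpers added above fit together. rank_models_reg and select_top_models are defined in ml2ls.py in this version and are shown here as if in scope; the metric values are made up:

    import pandas as pd

    df_metrics = pd.DataFrame(
        {"mse": [4.0, 2.5, 3.0],
         "rmse": [2.0, 1.58, 1.73],
         "mae": [1.6, 1.2, 1.4],
         "r2": [0.70, 0.82, 0.78]},
        index=["Linear Regression", "Random Forest", "XGBoost"],
    )
    ranked = rank_models_reg(df_metrics, ascending=False)
    print(ranked["Ranking_Score"])     # best model (highest weighted score) first

    top = select_top_models(models=list(ranked.index),
                            categories=models_support["regression"],
                            n_top_models=2,
                            n_models_per_category=1)
    print(top)                         # e.g. ['Random Forest', 'Linear Regression']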
@@ -2104,11 +2207,17 @@ def predict(
2104
2207
  x_true: pd.DataFrame = None,
2105
2208
  y_true: Optional[pd.Series] = None,
2106
2209
  backward: bool = False, # backward_regression
2210
+ backward_thr:float = 0.05,# p-value threshold; only used when backward is True
2107
2211
  common_features: set = None,
2108
2212
  purpose: str = "classification", # 'classification' or 'regression'
2109
2213
  cls: Optional[Dict[str, Any]] = None,
2110
2214
  metrics: Optional[List[str]] = None,
2111
- random_state: int = 1,
2215
+ stack:bool=True,# run stacking
2216
+ stacking_cv:bool=False,# cross-validate the stacking final estimator; default False keeps it simple
2217
+ vote:bool=True,# run voting
2218
+ voting:str="hard", # voting scheme ("hard"/"soft"); only used for classification
2219
+ n_top_models:int=5, #for stacking models
2220
+ n_models_per_category:int=1, #for stacking models; e.g. 2 allows two models from the same category
2112
2221
  smote: bool = False,
2113
2222
  n_jobs: int = -1,
2114
2223
  plot_: bool = True,
@@ -2117,6 +2226,7 @@ def predict(
2117
2226
  cv_folds: int = 5, # more cv_folds gives more stable results, but AUC may be lower
2118
2227
  cv_level: str = "l", # "s":'low',"m":'medium',"l":"high"
2119
2228
  class_weight: str = "balanced",
2229
+ random_state: int = 1,
2120
2230
  verbose: bool = False,
2121
2231
  ) -> pd.DataFrame:
2122
2232
  """
@@ -2184,10 +2294,17 @@ def predict(
2184
2294
  RidgeClassifierCV,
2185
2295
  Perceptron,
2186
2296
  SGDClassifier,
2297
+ RidgeCV,
2298
+ Ridge,
2299
+ TheilSenRegressor,
2300
+ HuberRegressor,
2301
+ PoissonRegressor,
2302
+
2187
2303
  )
2304
+ from sklearn.compose import TransformedTargetRegressor
2188
2305
  from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
2189
2306
  from sklearn.naive_bayes import GaussianNB, BernoulliNB
2190
- from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
2307
+ from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor,StackingClassifier,StackingRegressor
2191
2308
  import xgboost as xgb
2192
2309
  import lightgbm as lgb
2193
2310
  import catboost as cb
@@ -2198,6 +2315,7 @@ def predict(
2198
2315
  QuadraticDiscriminantAnalysis,
2199
2316
  )
2200
2317
  from sklearn.preprocessing import PolynomialFeatures
2318
+ from sklearn.model_selection import train_test_split
2201
2319
 
2202
2320
  # spelling check (fuzzy-match the purpose string)
2203
2321
  purpose = ips.strcmp(purpose, ["classification", "regression"])[0]
@@ -2261,7 +2379,6 @@ def predict(
2261
2379
  "Gradient Boosting": GradientBoostingRegressor(random_state=random_state),
2262
2380
  "XGBoost": xgb.XGBRegressor(eval_metric="rmse", random_state=random_state),
2263
2381
  "Linear Regression": LinearRegression(),
2264
- "Lasso": Lasso(random_state=random_state),
2265
2382
  "AdaBoost": AdaBoostRegressor(random_state=random_state),
2266
2383
  # "LightGBM": lgb.LGBMRegressor(random_state=random_state),
2267
2384
  "CatBoost": cb.CatBoostRegressor(verbose=0, random_state=random_state),
@@ -2271,10 +2388,10 @@ def predict(
2271
2388
  "ElasticNet": ElasticNet(random_state=random_state),
2272
2389
  "Ridge": Ridge(),
2273
2390
  "KNN": KNeighborsRegressor(),
2391
+ "TheilSen":TheilSenRegressor(),
2392
+ "Huber":HuberRegressor(),
2393
+ "Poisson":PoissonRegressor()
2274
2394
  }
2275
- # indicate cls:
2276
- if ips.run_once_within(30): # 10 min
2277
- print(f"supported models: {list(model_.keys())}")
2278
2395
  if cls is None:
2279
2396
  models = model_
2280
2397
  else:
@@ -2290,6 +2407,10 @@ def predict(
2290
2407
  ips.df_special_characters_cleaner(x_true) if x_true is not None else None
2291
2408
  )
2292
2409
 
2410
+ # indicate cls:
2411
+ if ips.run_once_within(30): # 10 min
2412
+ print(f"processing: {list(models.keys())}")
2413
+
2293
2414
  if isinstance(y_train, str) and y_train in x_train.columns:
2294
2415
  y_train_col_name = y_train
2295
2416
  y_train = x_train[y_train]
@@ -2311,7 +2432,7 @@ def predict(
2311
2432
 
2312
2433
  # Perform backward feature selection
2313
2434
  if backward:
2314
- selected_features = backward_regression(x_train, y_train, threshold_out=0.05)
2435
+ selected_features = backward_regression(x_train, y_train, thr=backward_thr)
2315
2436
  x_train = x_train[selected_features]
2316
2437
 
2317
2438
  if x_true is None:
@@ -2391,10 +2512,22 @@ def predict(
2391
2512
  if isinstance(y_train, np.ndarray):
2392
2513
  y_train = ips.df_encoder(data=pd.DataFrame(y_train), method="label")
2393
2514
  y_train = np.asarray(y_train)
2394
- if isinstance(y_train, np.ndarray):
2395
- y_true = ips.df_encoder(data=pd.DataFrame(y_true), method="label")
2396
- y_true = np.asarray(y_true)
2515
+ if y_true is not None:
2516
+ if isinstance(y_train, np.ndarray):
2517
+ y_true = ips.df_encoder(data=pd.DataFrame(y_true), method="label")
2518
+ y_true = np.asarray(y_true)
2397
2519
  # Hyperparameter grids for tuning
2520
+ param_grid_common_xgb = {
2521
+ 'learning_rate': [0.01, 0.1, 0.2, 0.3],
2522
+ 'max_depth': [3, 5, 7, 10],
2523
+ 'n_estimators': [50, 100, 200, 300],
2524
+ 'subsample': [0.6, 0.8, 1.0],
2525
+ 'colsample_bytree': [0.6, 0.8, 1.0],
2526
+ 'gamma': [0, 0.1, 0.2, 0.5],
2527
+ 'min_child_weight': [1, 5, 10],
2528
+ 'reg_alpha': [0, 0.1, 0.5, 1], # L1 regularization term
2529
+ 'reg_lambda': [1, 1.5, 2], # L2 regularization term
2530
+ }
2398
2531
  if cv_level in ["low", "simple", "s", "l"]:
2399
2532
  param_grids = {
2400
2533
  "Random Forest": (
@@ -2416,8 +2549,8 @@ def predict(
2416
2549
  }
2417
2550
  ),
2418
2551
  "SVM": {
2419
- "C": [1],
2420
- "gamma": ["scale"],
2552
+ "C": [0.1, 1, 10],
2553
+ "gamma": ["scale", 0.1, 1],
2421
2554
  "kernel": ["rbf"],
2422
2555
  },
2423
2556
  "Lasso": {
@@ -2439,12 +2572,17 @@ def predict(
2439
2572
  "min_samples_split": [2],
2440
2573
  "subsample": [0.8],
2441
2574
  },
2442
- "XGBoost": {
2443
- "n_estimators": [100],
2444
- "max_depth": [3],
2445
- "learning_rate": [0.1],
2446
- "subsample": [0.8],
2447
- "colsample_bytree": [0.8],
2575
+ "XGBoost":{
2576
+ 'learning_rate': [0.01],
2577
+ 'max_depth': [3],
2578
+ 'n_estimators': [50],
2579
+ 'subsample': [0.6],
2580
+ 'colsample_bytree': [0.6],
2581
+ 'gamma': [0, 0.1],
2582
+ 'min_child_weight': [1],
2583
+ 'reg_alpha': [0, 0.1],
2584
+ 'reg_lambda': [1],
2585
+ 'objective': ['binary:logistic'] if purpose == "classification" else ['reg:squarederror']
2448
2586
  },
2449
2587
  "KNN": (
2450
2588
  {
@@ -2551,6 +2689,14 @@ def predict(
2551
2689
  "random_state": [random_state],
2552
2690
  "learning_rate": ["constant"],
2553
2691
  },
2692
+ "TheilSen":{'max_iter': [100],
2693
+ 'tol': [1e-4],
2694
+ 'n_subsamples': [100+x_train.shape[1]]},
2695
+ "Huber":{'epsilon': [1.35],
2696
+ 'alpha': [0.1],
2697
+ 'max_iter': [100],},
2698
+ "Poisson":{'alpha': [0.1],
2699
+ 'max_iter': [100],}
2554
2700
  }
2555
2701
  elif cv_level in ["high", "advanced", "h"]:
2556
2702
  param_grids = {
@@ -2612,12 +2758,30 @@ def predict(
2612
2758
  "subsample": [0.8, 1.0],
2613
2759
  },
2614
2760
  "XGBoost": {
2615
- "n_estimators": [100, 200, 500, 700],
2616
- "max_depth": [3, 5, 7, 10],
2617
- "learning_rate": [0.01, 0.1, 0.2, 0.3],
2618
- "subsample": [0.8, 1.0],
2619
- "colsample_bytree": [0.8, 0.9, 1.0],
2620
- },
2761
+ 'learning_rate': [0.01, 0.1, 0.2, 0.3],
2762
+ 'max_depth': [3, 5, 7, 10],
2763
+ 'n_estimators': [50, 100, 200, 300],
2764
+ 'subsample': [0.6, 0.8, 1.0],
2765
+ 'gamma': [0, 0.1, 0.2, 0.5],
2766
+ 'min_child_weight': [1, 5, 10],
2767
+ 'reg_alpha': [0, 0.1, 0.5, 1],
2768
+ 'reg_lambda': [1, 1.5, 2],
2769
+ **{
2770
+ 'objective': ['binary:logistic', 'multi:softmax', 'multi:softprob'],
2771
+ }} if purpose== "classification"
2772
+ else{
2773
+ 'learning_rate': [0.01, 0.1, 0.2, 0.3],
2774
+ 'max_depth': [3, 5, 7, 10],
2775
+ 'n_estimators': [50, 100, 200, 300],
2776
+ 'subsample': [0.6, 0.8, 1.0],
2777
+ 'colsample_bytree': [0.6, 0.8, 1.0],
2778
+ 'gamma': [0, 0.1, 0.2, 0.5],
2779
+ 'min_child_weight': [1, 5, 10],
2780
+ 'reg_alpha': [0, 0.1, 0.5, 1],
2781
+ 'reg_lambda': [1, 1.5, 2],
2782
+ **{
2783
+ 'objective': ['reg:squarederror', 'reg:squaredlogerror', 'reg:gamma'],
2784
+ }},
2621
2785
  "KNN": (
2622
2786
  {
2623
2787
  "n_neighbors": [1, 3, 5, 10, 15, 20],
@@ -2730,6 +2894,14 @@ def predict(
2730
2894
  ], # If True, the regressors X will be normalized
2731
2895
  }
2732
2896
  ),
2897
+ "TheilSen":{'max_iter': [100, 200, 300],
2898
+ 'tol': [1e-4, 1e-3, 1e-2],
2899
+ 'n_subsamples': [100+x_train.shape[1], 200+x_train.shape[1], 300+x_train.shape[1]]},
2900
+ "Huber":{'epsilon': [1.35, 1.5, 2.0],
2901
+ 'alpha': [0.1, 1.0, 10.0],
2902
+ 'max_iter': [100, 200, 300],},
2903
+ "Poisson":{'alpha': [0.1, 1.0, 10.0],
2904
+ 'max_iter': [100, 200, 300],}
2733
2905
  }
2734
2906
  else: # median level
2735
2907
  param_grids = {
@@ -2789,12 +2961,30 @@ def predict(
2789
2961
  "subsample": [0.8, 1.0],
2790
2962
  },
2791
2963
  "XGBoost": {
2792
- "n_estimators": [100, 200, 500],
2793
- "max_depth": [3, 5, 7],
2794
- "learning_rate": [0.01, 0.1, 0.2],
2795
- "subsample": [0.8, 1.0],
2796
- "colsample_bytree": [0.8, 1.0],
2797
- },
2964
+ 'learning_rate': [0.01, 0.1],
2965
+ 'max_depth': [3, 5],
2966
+ 'n_estimators': [50, 100],
2967
+ 'subsample': [0.6, 0.8],
2968
+ 'gamma': [0, 0.1],
2969
+ 'min_child_weight': [1, 5],
2970
+ 'reg_alpha': [0, 0.1],
2971
+ 'reg_lambda': [1,],
2972
+ **{
2973
+ 'objective': ['binary:logistic', 'multi:softmax'],
2974
+ }} if purpose== "classification"
2975
+ else{
2976
+ 'learning_rate': [0.01, 0.1],
2977
+ 'max_depth': [3, 5,],
2978
+ 'n_estimators': [50, 100],
2979
+ 'subsample': [0.6, 0.8],
2980
+ 'colsample_bytree': [0.6, 0.8],
2981
+ 'gamma': [0, 0.1],
2982
+ 'min_child_weight': [1, 5],
2983
+ 'reg_alpha': [0, 0.1],
2984
+ 'reg_lambda': [1, 1.5],
2985
+ **{
2986
+ 'objective': ['reg:squarederror', 'reg:squaredlogerror'],
2987
+ }},
2798
2988
  "KNN": (
2799
2989
  {
2800
2990
  "n_neighbors": [3, 5, 7, 10],
@@ -2951,6 +3141,14 @@ def predict(
2951
3141
  ], # Solver for optimization
2952
3142
  }
2953
3143
  ),
3144
+ "TheilSen":{'max_iter': [100, 200],
3145
+ 'tol': [1e-4, 1e-3],
3146
+ 'n_subsamples': [100+x_train.shape[1], 200+x_train.shape[1]]},
3147
+ "Huber":{'epsilon': [1.35, 1.5],
3148
+ 'alpha': [0.1, 1.0],
3149
+ 'max_iter': [100, 200],},
3150
+ "Poisson":{'alpha': [0.1, 1.0],
3151
+ 'max_iter': [100, 200],}
2954
3152
  }
2955
3153
 
2956
3154
  results = {}
@@ -3191,12 +3389,18 @@ def predict(
3191
3389
  # Convert results to DataFrame
3192
3390
  df_results = pd.DataFrame.from_dict(results, orient="index")
3193
3391
  # sort
3194
- if y_true is not None and purpose == "classification":
3195
- df_scores = pd.DataFrame(
3196
- df_results["scores"].tolist(), index=df_results["scores"].index
3197
- ).sort_values(by="roc_auc", ascending=False)
3392
+ if y_true is not None:
3393
+ if purpose == "classification":
3394
+ df_scores = pd.DataFrame(
3395
+ df_results["scores"].tolist(), index=df_results["scores"].index
3396
+ ).sort_values(by="roc_auc", ascending=False)
3397
+ elif purpose=='regression':
3398
+ df_scores = rank_models_reg(
3399
+ pd.DataFrame(df_results["scores"].tolist(), index=df_results["scores"].index),
3400
+ ascending=False)
3198
3401
  df_results = df_results.loc[df_scores.index]
3199
3402
 
3403
+ if y_true is not None and purpose == "classification":
3200
3404
  if plot_:
3201
3405
  from datetime import datetime
3202
3406
 
@@ -3214,18 +3418,503 @@ def predict(
3214
3418
  plot.figsets(xangle=30)
3215
3419
  if dir_save:
3216
3420
  ips.figsave(dir_save + f"scores_clus{now_}.pdf")
3421
+ # if all([plot_, y_true is not None, purpose == "classification"]):
3422
+ # # try:
3423
+ # if len(models) > 3:
3424
+ # plot_validate_features(df_results, is_binary=is_binary)
3425
+ # else:
3426
+ # plot_validate_features_single(df_results, is_binary=is_binary)
3427
+ # if dir_save:
3428
+ # ips.figsave(dir_save + f"validate_features{now_}.pdf")
3429
+ # # except Exception as e:
3430
+ # # print(f"Error: 在画图的过程中出现了问题:{e}")
3431
+ if stack:
3432
+ #! stacking classifier/regressor
3433
+ from sklearn.metrics import make_scorer, accuracy_score
3434
+ from sklearn.model_selection import cross_val_score
3435
+
3436
+ #* cap n_top_models so it does not exceed the number of ranked models
3437
+ n_top_models = min(n_top_models, df_results.shape[0])
3438
+
3439
+ #* select the top-n estimators
3440
+ models_selecte = select_top_models(models=list(df_results.index),
3441
+ categories=models_support[purpose],
3442
+ n_top_models=n_top_models,
3443
+ n_models_per_category=n_models_per_category)
3444
+ top_models = df_results.loc[models_selecte]["best_clf"]
3445
+ base_estimators = []
3446
+ for i, j in top_models.to_dict().items():
3447
+ base_estimators.append((i, j))
3448
+ if stacking_cv:
3449
+ print(f" ⤵ stacking_cv is processing...")
3450
+ #* define a few representative candidates for the final_estimator
3451
+ # a few candidates
3452
+ if purpose == "classification":
3453
+ kadt_estimators=["XGBoost","SVM","Logistic Regression","Neural Network"]
3454
+ else:
3455
+ kadt_estimators=["XGBoost","LassoCV"]
3456
+ final_estimators={}
3457
+ for name in kadt_estimators:
3458
+ param_grid=param_grids.get(name, {})
3459
+ print(param_grid)
3460
+ if is_binary:
3461
+ gs = GridSearchCV(
3462
+ model_[name],
3463
+ param_grid=param_grid,
3464
+ scoring=(
3465
+ "roc_auc"
3466
+ if purpose == "classification"
3467
+ else "neg_mean_squared_error"
3468
+ ),
3469
+ cv=cv,
3470
+ n_jobs=n_jobs,
3471
+ verbose=verbose,
3472
+ )
3473
+ else:
3474
+ gs = GridSearchCV(
3475
+ model_[name],
3476
+ param_grid=param_grid,
3477
+ scoring=(
3478
+ "roc_auc_ovr"
3479
+ if purpose == "classification"
3480
+ else "neg_mean_squared_error"
3481
+ ),
3482
+ cv=cv,
3483
+ n_jobs=n_jobs,
3484
+ verbose=verbose,
3485
+ )
3486
+ # Fit GridSearchCV
3487
+ gs.fit(x_train, y_train)
3488
+ final_estimators[name]=gs.best_estimator_
3489
+
3490
+ #* Set up cross-validation and performance evaluation
3491
+ scorer = make_scorer(accuracy_score)
3492
+ cv_results = []
3493
+
3494
+ #*Cross-validate stacking models with different final estimators
3495
+ for final_name, final_estimator in final_estimators.items():
3496
+ print(f"Evaluating Stacking Classifier with {final_name} as final estimator...")
3497
+ if purpose == "classification":
3498
+ stacking_model = StackingClassifier(estimators=base_estimators, final_estimator=final_estimator,cv=cv)
3499
+ else:
3500
+ stacking_model = StackingRegressor(estimators=base_estimators, final_estimator=final_estimator, cv=cv)
3501
+
3502
+ scores = cross_val_score(stacking_model, x_train, y_train, cv=StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_state), scoring=scorer)
3503
+
3504
+ # Store the result
3505
+ cv_results.append({
3506
+ 'final_estimator':final_estimator,
3507
+ 'Final Estimator': final_name,
3508
+ 'Mean Accuracy': np.mean(scores),
3509
+ 'Standard Deviation': np.std(scores)
3510
+ })
3511
+
3512
+ #* Convert the results into a DataFrame for easy comparison
3513
+ cv_results_df = pd.DataFrame(cv_results)
3514
+
3515
+ #* Sort and display the best model
3516
+ cv_results_df = cv_results_df.sort_values(by='Mean Accuracy', ascending=False)
3517
+
3518
+
3519
+ # Optionally: Select the final estimator that gives the best performance
3520
+ best_final_estimator = cv_results_df.iloc[0]['final_estimator']
3521
+ print(f"Best final estimator based on cross-validation: {best_final_estimator}")
3522
+ else:
3523
+ print(f" ⤵ trying to find the best_final_estimator for stacking...")
3524
+ if purpose=="classification":
3525
+ best_final_estimator = LogisticRegression(class_weight=class_weight,
3526
+ random_state=random_state,
3527
+ max_iter=1000)
3528
+ else:
3529
+ best_final_estimator = RidgeCV(cv=5)
3530
+ print(f"⤵ the best best_final_estimator: {best_final_estimator}")
3531
+ #! apply stacking
3532
+ if purpose == "classification":
3533
+ print(f" ⤵ StackingClassifier...")
3534
+ stacking_model = StackingClassifier(estimators=base_estimators,
3535
+ final_estimator=best_final_estimator,
3536
+ cv=cv)
3537
+ else:
3538
+ print(f" ⤵ StackingRegressor...")
3539
+ stacking_model = StackingRegressor(estimators=base_estimators,
3540
+ final_estimator=best_final_estimator,
3541
+ cv=cv)
3542
+
3543
+ # Train the Stacking Classifier
3544
+ print(f" ⤵ fit & predict...")
3545
+ stacking_model.fit(x_train, y_train)
3546
+ y_pred_final = stacking_model.predict(x_true)
3547
+ print(f" ⤵ collecting results...")
3548
+ # pred_proba
3549
+ if is_binary:
3550
+ if hasattr(stacking_model, "predict_proba"):
3551
+ y_pred_proba_final = stacking_model.predict_proba(x_true)
3552
+ print("Shape of predicted probabilities:", y_pred_proba_final.shape)
3553
+ if y_pred_proba_final.shape[1] == 1:
3554
+ y_pred_proba_final = np.hstack(
3555
+ [1 - y_pred_proba_final, y_pred_proba_final]
3556
+ ) # Add missing class probabilities
3557
+ y_pred_proba_final = y_pred_proba_final[:, 1]
3558
+ elif hasattr(stacking_model, "decision_function"):
3559
+ # If predict_proba is not available, use decision_function (e.g., for SVM)
3560
+ y_pred_proba_final = stacking_model.decision_function(x_true)
3561
+ # Ensure y_pred_proba_final is within 0 and 1 bounds
3562
+ y_pred_proba_final = (y_pred_proba_final - y_pred_proba_final.min()) / (
3563
+ y_pred_proba_final.max() - y_pred_proba_final.min()
3564
+ )
3565
+ else:
3566
+ y_pred_proba_final = None # No probability output for certain models
3567
+ if not is_binary:
3568
+ # Handle prediction probabilities for multiclass
3569
+ if hasattr(stacking_model, "predict_proba"):
3570
+ y_pred_proba_final = stacking_model.predict_proba(x_true)
3571
+ elif hasattr(stacking_model, "decision_function"):
3572
+ y_pred_proba_final = stacking_model.decision_function(x_true)
3573
+
3574
+ # Normalize for multiclass if necessary
3575
+ if y_pred_proba_final.ndim == 2:
3576
+ y_pred_proba_final = (
3577
+ y_pred_proba_final - y_pred_proba_final.min(axis=1, keepdims=True)
3578
+ ) / (
3579
+ y_pred_proba_final.max(axis=1, keepdims=True)
3580
+ - y_pred_proba_final.min(axis=1, keepdims=True)
3581
+ )
3582
+ else:
3583
+ y_pred_proba_final = None # No probability output for certain models
3584
+ #! dict_pred_stack
3585
+ dict_pred_stack={}
3586
+ validation_scores_final = {}
3587
+ if y_true is not None and y_pred_proba_final is not None:
3588
+ validation_scores_final = cal_metrics(
3589
+ y_true,
3590
+ y_pred_final,
3591
+ y_pred_proba=y_pred_proba_final,
3592
+ is_binary=is_binary,
3593
+ purpose=purpose,
3594
+ average="weighted",
3595
+ )
3596
+ if is_binary:
3597
+ # Calculate ROC curve
3598
+ # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
3599
+ if y_pred_proba_final is not None:
3600
+ fpr, tpr, _ = roc_curve(y_true, y_pred_proba_final)
3601
+ lower_ci, upper_ci = cal_auc_ci(
3602
+ y_true, y_pred_proba_final, verbose=False, is_binary=is_binary
3603
+ )
3604
+ roc_auc = auc(fpr, tpr)
3605
+ roc_info = {
3606
+ "fpr": fpr.tolist(),
3607
+ "tpr": tpr.tolist(),
3608
+ "auc": roc_auc,
3609
+ "ci95": (lower_ci, upper_ci),
3610
+ }
3611
+ # precision-recall curve
3612
+ precision_, recall_, _ = cal_precision_recall(y_true, y_pred_proba_final)
3613
+ avg_precision_ = average_precision_score(y_true, y_pred_proba_final)
3614
+ pr_info = {
3615
+ "precision": precision_,
3616
+ "recall": recall_,
3617
+ "avg_precision": avg_precision_,
3618
+ }
3619
+ else:
3620
+ roc_info, pr_info = None, None
3621
+ if purpose == "classification":
3622
+ dict_pred_stack = {
3623
+ "best_clf": stacking_model,
3624
+ "best_params": None,
3625
+ "auc_indiv": None,
3626
+ "scores": validation_scores_final,
3627
+ "roc_curve": roc_info,
3628
+ "pr_curve": pr_info,
3629
+ "confusion_matrix": confusion_matrix(y_true, y_pred_final),
3630
+ "predictions": y_pred_final.tolist(),
3631
+ "predictions_proba": (
3632
+ y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
3633
+ ),
3634
+ }
3635
+ else: # "regression"
3636
+ dict_pred_stack = {
3637
+ "best_clf": stacking_model,
3638
+ "best_params": None,
3639
+ "scores": validation_scores_final, # e.g., neg_MSE, R², etc.
3640
+ "predictions": y_pred_final.tolist(),
3641
+ "predictions_proba": (
3642
+ y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
3643
+ ),
3644
+ }
3645
+ else: # multi-classes
3646
+ if y_pred_proba_final is not None:
3647
+ # fpr, tpr, roc_auc = dict(), dict(), dict()
3648
+ # fpr, tpr, _ = roc_curve(y_true, y_pred_proba_final)
3649
+ confidence_intervals = cal_auc_ci(
3650
+ y_true, y_pred_proba_final, verbose=False, is_binary=is_binary
3651
+ )
3652
+ roc_info = {
3653
+ "fpr": validation_scores_final["fpr"],
3654
+ "tpr": validation_scores_final["tpr"],
3655
+ "auc": validation_scores_final["roc_auc_by_class"],
3656
+ "ci95": confidence_intervals,
3657
+ }
3658
+ # precision-recall curve
3659
+ precision_, recall_, avg_precision_ = cal_precision_recall(
3660
+ y_true, y_pred_proba_final, is_binary=is_binary
3661
+ )
3662
+ pr_info = {
3663
+ "precision": precision_,
3664
+ "recall": recall_,
3665
+ "avg_precision": avg_precision_,
3666
+ }
3667
+ else:
3668
+ roc_info, pr_info = None, None
3669
+
3670
+ if purpose == "classification":
3671
+ dict_pred_stack = {
3672
+ "best_clf": stacking_model,
3673
+ "best_params": None,
3674
+ "auc_indiv": None,
3675
+ "scores": validation_scores_final,
3676
+ "roc_curve": roc_info,
3677
+ "pr_curve": pr_info,
3678
+ "confusion_matrix": confusion_matrix(y_true, y_pred_final),
3679
+ "predictions": y_pred_final.tolist(),
3680
+ "predictions_proba": (
3681
+ y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
3682
+ ),
3683
+ }
3684
+ else: # "regression"
3685
+ dict_pred_stack = {
3686
+ "best_clf": stacking_model,
3687
+ "best_params": None,
3688
+ "scores": validation_scores_final, # e.g., neg_MSE, R², etc.
3689
+ "predictions": y_pred_final.tolist(),
3690
+ "predictions_proba": (
3691
+ y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
3692
+ ),
3693
+ }
3694
+
3695
+ else:
3696
+ if y_true is None:
3697
+ validation_scores_final = []
3698
+ else:
3699
+ validation_scores_final = cal_metrics(
3700
+ y_true,
3701
+ y_pred,
3702
+ y_pred_proba=y_pred_proba_final,
3703
+ is_binary=is_binary,
3704
+ purpose=purpose,
3705
+ average="weighted",
3706
+ )
3707
+ dict_pred_stack = {
3708
+ "best_clf": stacking_model,
3709
+ "best_params": None,
3710
+ "scores": validation_scores_final,
3711
+ "predictions": y_pred_final.tolist(),
3712
+ "predictions_proba": (
3713
+ y_pred_proba_final.tolist() if y_pred_proba_final is not None else None
3714
+ ),
3715
+ "y_train": y_train if y_train is not None else [],
3716
+ "y_true": y_true if y_true is not None else [],
3717
+ }
3718
+ # merge together
3719
+ df_pred = pd.DataFrame(
3720
+ [None] * len(df_results.columns), index=df_results.columns, columns=["stack"]).T
3721
+ for k, v in dict_pred_stack.items():
3722
+ if k in df_pred.columns:
3723
+ df_pred[k] = [v]
3724
+
3725
+ # # plot the stacking
3726
+ # if all([plot_, y_true is not None, purpose == "classification"]):
3727
+ # plot_validate_features_single(df_pred, is_binary=is_binary)
3728
+ # if dir_save:
3729
+ # ips.figsave(dir_save + f"validate_features_stacking_{now_}.pdf")
3730
+ if vote:
3731
+ print(f" ⤵ voting...")
3732
+ from sklearn.ensemble import VotingClassifier, VotingRegressor
3733
+ #! Voting
3734
+ n_top_models = min(n_top_models, df_results.shape[0])
3735
+ base_estimators=[]
3736
+ for name, cls in zip(list(df_results.iloc[:n_top_models, :].index),df_results.iloc[:n_top_models, :]["best_clf"].tolist()):
3737
+ base_estimators.append((name,cls))
3738
+ # Apply Voting Classifier/Regressor
3739
+ if purpose == "classification":
3740
+ print(f" ⤵ VotingClassifier...via{votting}")
3741
+ if voting=='hard':
3742
+ # Hard voting does not support `predict_proba`
3743
+ voting_model = VotingClassifier(estimators=base_estimators)
3744
+ else:
3745
+ # Soft voting supports `predict_proba`
3746
+ voting_model = VotingClassifier(estimators=base_estimators, voting="soft")
3747
+ else:
3748
+ print(f" ⤵ VotingRegressor...")
3749
+ voting_model = VotingRegressor(estimators=base_estimators)
3750
+
3751
+ # Train the Voting Classifier/Regressor
3752
+ try:
3753
+ voting_model.fit(x_train, y_train)
3754
+ y_pred_vote = voting_model.predict(x_true)
3755
+ except Exception as e:
3756
+ if purpose == "classification" and not voting=='hard':
3757
+ voting_model = VotingClassifier(estimators=base_estimators)
3758
+ voting_model.fit(x_train, y_train)
3759
+ y_pred_vote = voting_model.predict(x_true)
3760
+
3761
+ # Calculate predicted probabilities if applicable
3762
+ if purpose == "classification":
3763
+ if hasattr(voting_model, "predict_proba"):
3764
+ y_pred_proba_vote = voting_model.predict_proba(x_true)
3765
+ print("Shape of predicted probabilities:", y_pred_proba_vote.shape)
3766
+ if y_pred_proba_vote.shape[1] == 1:
3767
+ y_pred_proba_vote = np.hstack(
3768
+ [1 - y_pred_proba_vote, y_pred_proba_vote]
3769
+ ) # Add missing class probabilities
3770
+ y_pred_proba_vote = y_pred_proba_vote[:, 1]
3771
+ else:
3772
+ y_pred_proba_vote = None
3773
+ else: # Regression
3774
+ y_pred_proba_vote = None
3775
+
3776
+ print(f" ⤵ collecting voting results...")
3777
+ #! dict_pred_vote
3778
+ dict_pred_vote = {}
3779
+ validation_scores_vote = {}
3780
+ if y_true is not None and y_pred_proba_vote is not None:
3781
+ validation_scores_vote = cal_metrics(
3782
+ y_true,
3783
+ y_pred_vote,
3784
+ y_pred_proba=y_pred_proba_vote,
3785
+ is_binary=is_binary,
3786
+ purpose=purpose,
3787
+ average="weighted",
3788
+ )
3789
+
3790
+ if is_binary:
3791
+ if y_pred_proba_vote is not None:
3792
+ fpr, tpr, _ = roc_curve(y_true, y_pred_proba_vote)
3793
+ lower_ci, upper_ci = cal_auc_ci(
3794
+ y_true, y_pred_proba_vote, verbose=False, is_binary=is_binary
3795
+ )
3796
+ roc_auc = auc(fpr, tpr)
3797
+ roc_info = {
3798
+ "fpr": fpr.tolist(),
3799
+ "tpr": tpr.tolist(),
3800
+ "auc": roc_auc,
3801
+ "ci95": (lower_ci, upper_ci),
3802
+ }
3803
+ precision_, recall_, _ = cal_precision_recall(y_true, y_pred_proba_vote)
3804
+ avg_precision_ = average_precision_score(y_true, y_pred_proba_vote)
3805
+ pr_info = {
3806
+ "precision": precision_,
3807
+ "recall": recall_,
3808
+ "avg_precision": avg_precision_,
3809
+ }
3810
+ else:
3811
+ roc_info, pr_info = None, None
3812
+
3813
+ dict_pred_vote = {
3814
+ "best_clf": voting_model,
3815
+ "best_params": None,
3816
+ "auc_indiv": None,
3817
+ "scores": validation_scores_vote,
3818
+ "roc_curve": roc_info,
3819
+ "pr_curve": pr_info,
3820
+ "confusion_matrix": confusion_matrix(y_true, y_pred_vote),
3821
+ "predictions": y_pred_vote.tolist(),
3822
+ "predictions_proba": (
3823
+ y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
3824
+ ),
3825
+ }
3826
+ else: # Multi-class
3827
+ if y_pred_proba_vote is not None:
3828
+ confidence_intervals = cal_auc_ci(
3829
+ y_true, y_pred_proba_vote, verbose=False, is_binary=is_binary
3830
+ )
3831
+ roc_info = {
3832
+ "fpr": validation_scores_vote["fpr"],
3833
+ "tpr": validation_scores_vote["tpr"],
3834
+ "auc": validation_scores_vote["roc_auc_by_class"],
3835
+ "ci95": confidence_intervals,
3836
+ }
3837
+ precision_, recall_, avg_precision_ = cal_precision_recall(
3838
+ y_true, y_pred_proba_vote, is_binary=is_binary
3839
+ )
3840
+ pr_info = {
3841
+ "precision": precision_,
3842
+ "recall": recall_,
3843
+ "avg_precision": avg_precision_,
3844
+ }
3845
+ else:
3846
+ roc_info, pr_info = None, None
3847
+
3848
+ dict_pred_vote = {
3849
+ "best_clf": voting_model,
3850
+ "best_params": None,
3851
+ "scores": validation_scores_vote,
3852
+ "roc_curve": roc_info,
3853
+ "pr_curve": pr_info,
3854
+ "confusion_matrix": confusion_matrix(y_true, y_pred_vote),
3855
+ "predictions": y_pred_vote.tolist(),
3856
+ "predictions_proba": (
3857
+ y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
3858
+ ),
3859
+ }
3860
+ else:
3861
+ if y_true is None:
3862
+ validation_scores_vote = []
3863
+ else:
3864
+ validation_scores_vote = cal_metrics(
3865
+ y_true,
3866
+ y_pred,
3867
+ y_pred_proba=y_pred_proba_vote,
3868
+ is_binary=is_binary,
3869
+ purpose=purpose,
3870
+ average="weighted",
3871
+ )
3872
+ dict_pred_vote = {
3873
+ "best_clf": voting_model,
3874
+ "best_params": None,
3875
+ "scores": validation_scores_vote,
3876
+ "predictions": y_pred_vote.tolist(),
3877
+ "predictions_proba": (
3878
+ y_pred_proba_vote.tolist() if y_pred_proba_vote is not None else None
3879
+ ),
3880
+ "y_train": y_train if y_train is not None else [],
3881
+ "y_true": y_true if y_true is not None else [],
3882
+ }
3883
+ df_vote = pd.DataFrame(
3884
+ [None] * len(df_results.columns), index=df_results.columns, columns=["vote"]).T
3885
+ for k, v in dict_pred_vote.items():
3886
+ if k in df_vote.columns:
3887
+ df_vote[k] = [v]
3888
+
3889
+ # if all([plot_, y_true is not None, purpose == "classification"]):
3890
+ # try:
3891
+ # plot_validate_features_single(df_vote, is_binary=is_binary)
3892
+ # if dir_save:
3893
+ # ips.figsave(dir_save + f"validate_features_vote_{now_}.pdf")
3894
+ # except Exception as e:
3895
+ # print(e)
3896
+ print("Done")
3897
+ if vote and stack:
3898
+ df_res=pd.concat([df_pred,df_vote, df_results],ignore_index=False,axis=0)
3899
+ elif vote:
3900
+ df_res=pd.concat([df_vote, df_results],ignore_index=False,axis=0)
3901
+ elif stack:
3902
+ df_res=pd.concat([df_pred,df_results],ignore_index=False,axis=0)
3903
+
3217
3904
  if all([plot_, y_true is not None, purpose == "classification"]):
3905
+ from datetime import datetime
3906
+
3907
+ now_ = datetime.now().strftime("%y%m%d_%H%M%S")
3218
3908
  # try:
3219
- if len(models) > 3:
3220
- plot_validate_features(df_results, is_binary=is_binary)
3909
+ if df_res.shape[0] > 3:
3910
+ plot_validate_features(df_res, is_binary=is_binary)
3221
3911
  else:
3222
- plot_validate_features_single(df_results, is_binary=is_binary)
3912
+ plot_validate_features_single(df_res, is_binary=is_binary)
3223
3913
  if dir_save:
3224
3914
  ips.figsave(dir_save + f"validate_features{now_}.pdf")
3225
- # except Exception as e:
3226
- # print(f"Error: 在画图的过程中出现了问题:{e}")
3227
- return df_results
3228
-
3915
+ # except Exception as e:
3916
+ # print(f"Error: 在画图的过程中出现了问题:{e}")
3917
+ return df_res
3229
3918
 
3230
3919
  def cal_metrics(
3231
3920
  y_true,
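Stepping back from the diff: the stacking branch added above builds directly on sklearn's StackingClassifier / StackingRegressor. A minimal standalone sketch with toy data; note that the stacker clones and re-fits its base estimators internally, even when already-fitted best_clf objects are passed in, which matches how the code above uses them:

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier, StackingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC

    X, y = make_classification(n_samples=300, n_features=10, random_state=0)
    base_estimators = [
        ("rf", RandomForestClassifier(random_state=1)),
        ("svm", SVC(probability=True, random_state=1)),
    ]
    stacker = StackingClassifier(
        estimators=base_estimators,
        final_estimator=LogisticRegression(max_iter=1000),  # mirrors the default used above
        cv=5,
    )
    stacker.fit(X, y)
    print(stacker.predict_proba(X[:5]))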
@@ -3367,7 +4056,7 @@ def cal_metrics(
3367
4056
 
3368
4057
 
3369
4058
  def plot_trees(
3370
- X, y, cls, max_trees=500, test_size=0.2, random_state=42, early_stopping_rounds=None
4059
+ X, y, cls:str='random', max_trees=500, test_size=0.2, random_state=42, early_stopping_rounds=None
3371
4060
  ):
3372
4061
  """
3373
4062
  # # Example usage:
@@ -3413,10 +4102,14 @@ def plot_trees(
3413
4102
  train_error_rate = []
3414
4103
  test_error_rate = []
3415
4104
  validation_error = None
3416
-
4105
+ if isinstance(cls, str):
4106
+ cls=ips.strcmp(cls, ["RandomForestClassifier","ExtraTreesClassifier","AdaBoostClassifier","GradientBoostingClassifier"])
3417
4107
  # Configure classifier based on type
3418
4108
  oob_enabled = False # Default to no OOB error unless explicitly set
3419
-
4109
+ clf_support = {"RandomForestClassifier":RandomForestClassifier(),
4110
+ "ExtraTreesClassifier":ExtraTreesClassifier(),
4111
+ "AdaBoostClassifier":AdaBoostClassifier(),
4112
+ "GradientBoostingClassifier":GradientBoostingClassifier()}
3420
4113
  if isinstance(cls, (RandomForestClassifier, ExtraTreesClassifier)):
3421
4114
  # Enable OOB if cls supports it and is using bootstrapping
3422
4115
  cls.set_params(warm_start=True, n_estimators=1)
@@ -3678,7 +4371,7 @@ def img_datasets_preprocessing(
3678
4371
 
3679
4372
 
3680
4373
  def backward_regression(
3681
- X: pd.DataFrame, y: pd.Series, initial_list=[], threshold_out=0.05, verbose=True
4374
+ X: pd.DataFrame, y: pd.Series, initial_list=[], thr=0.05, verbose=True
3682
4375
  ):
3683
4376
  """
3684
4377
  # awesome bit of code from https://www.kaggle.com/code/adibouayjan/house-price-step-by-step-modeling
@@ -3690,31 +4383,46 @@ def backward_regression(
3690
4383
  X -- features values
3691
4384
  y -- target variable
3692
4385
  initial_list -- features header
3693
- threshold_out -- pvalue threshold of features to drop
4386
+ thr -- pvalue threshold of features to drop
3694
4387
  verbose -- true to produce lots of logging output
3695
4388
 
3696
4389
  Returns:
3697
4390
  list of selected features for modeling
3698
4391
  """
3699
4392
  import statsmodels.api as sm
3700
-
3701
- if isinstance(y, str) and y in X.columns:
3702
- y_col_name = y
3703
- y = X[y]
3704
- X = X.drop(y_col_name, axis=1)
4393
+ if isinstance(y, str):
4394
+ if y in X.columns:
4395
+ y_col_name = y
4396
+ y = X[y]
4397
+ X = X.drop(y_col_name, axis=1)
4398
+ else:
4399
+ raise ValueError(f"找不到{y},y设置有误")
4400
+ X = X.select_dtypes(include=[np.number])
4401
+
3705
4402
  included = list(X.columns)
4403
+ try:
4404
+ X=X.astype(float)
4405
+ y=y.astype(float)
4406
+ except Exception as e:
4407
+ raise ValueError(f"无法把数据类型转换成float类型,因而无法进一步进行统计分析: {e}")
4408
+
4409
+
3706
4410
  while True:
3707
4411
  changed = False
4412
+ if not included:
4413
+ print("No features remain in the model.")
4414
+ break
4415
+
3708
4416
  model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
3709
4417
  # exclude the intercept for p-value checking
3710
4418
  pvalues = model.pvalues.iloc[1:]
3711
4419
  worst_pval = pvalues.max()
3712
- if worst_pval > threshold_out:
4420
+ if worst_pval > thr:
3713
4421
  changed = True
3714
4422
  worst_feature = pvalues.idxmax()
3715
4423
  included.remove(worst_feature)
3716
4424
  if verbose:
3717
- print(f"Removing feature '{worst_feature}' with p-value {worst_pval}")
4425
+ print(f"Removing '{worst_feature}' with p-value={round(worst_pval,2)}")
3718
4426
  if not changed:
3719
4427
  break
3720
4428
  print(f"\nSelected Features:\n{included}")