PyPI - py2ls - Versions diffs - 0.2.4.14__py3-none-any.whl → 0.2.4.15__py3-none-any.whl - Mend

py2ls 0.2.4.14-py3-none-any.whl → 0.2.4.15-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

py2ls/ml2ls.py +562 -240
py2ls/translator.py +2 -0
{py2ls-0.2.4.14.dist-info → py2ls-0.2.4.15.dist-info}/METADATA +1 -1
{py2ls-0.2.4.14.dist-info → py2ls-0.2.4.15.dist-info}/RECORD +5 -5
{py2ls-0.2.4.14.dist-info → py2ls-0.2.4.15.dist-info}/WHEEL +0 -0

py2ls/ml2ls.py CHANGED Viewed

@@ -616,10 +616,10 @@ def get_features(
     if isinstance(y, str) and y in X.columns:
         y_col_name = y
         y = X[y]
-        y = ips.df_encoder(pd.DataFrame(y), method="dummy")
+        y = ips.df_encoder(pd.DataFrame(y), method="label")
         X = X.drop(y_col_name, axis=1)
     else:
-        y = ips.df_encoder(pd.DataFrame(y), method="dummy").values.ravel()
+        y = ips.df_encoder(pd.DataFrame(y), method="label").values.ravel()
     y = y.loc[X.index]  # Align y with X after dropping rows with missing values in X
     y = y.ravel() if isinstance(y, np.ndarray) else y.values.ravel()
@@ -1217,142 +1217,335 @@ def validate_features(
 # # If you want to access validation scores
 # print(validation_results)
-def plot_validate_features(res_val):
+def plot_validate_features(res_val,is_binary=True,figsize=None):
     """
     plot the results of 'validate_features()'
     """
-    colors = plot.get_color(len(ips.flatten(res_val["pr_curve"].index)))
-    if res_val.shape[0] > 5:
-        alpha = 0
-        figsize = [8, 10]
-        subplot_layout = [1, 2]
-        ncols = 2
-        bbox_to_anchor = [1.5, 0.6]
-    else:
-        alpha = 0.03
-        figsize = [10, 6]
-        subplot_layout = [1, 1]
-        ncols = 1
-        bbox_to_anchor = [1, 1]
-    nexttile = plot.subplot(figsize=figsize)
-    ax = nexttile(subplot_layout[0], subplot_layout[1])
-    for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-        fpr = res_val["roc_curve"][model_name]["fpr"]
-        tpr = res_val["roc_curve"][model_name]["tpr"]
-        (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
-        mean_auc = res_val["roc_curve"][model_name]["auc"]
-        plot_roc_curve(
-            fpr,
-            tpr,
-            mean_auc,
-            lower_ci,
-            upper_ci,
-            model_name=model_name,
-            lw=1.5,
-            color=colors[i],
-            alpha=alpha,
-            ax=ax,
+    if is_binary:
+        colors = plot.get_color(len(ips.flatten(res_val["pr_curve"].index)))
+        if res_val.shape[0] > 5:
+            alpha = 0
+            figsize = [8, 10] if figsize is None else figsize
+            subplot_layout = [1, 2]
+            ncols = 2
+            bbox_to_anchor = [1.5, 0.6]
+        else:
+            alpha = 0.03
+            figsize = [10, 6] if figsize is None else figsize
+            subplot_layout = [1, 1]
+            ncols = 1
+            bbox_to_anchor = [1, 1]
+        nexttile = plot.subplot(figsize=figsize)
+        ax = nexttile(subplot_layout[0], subplot_layout[1])
+        for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
+            fpr = res_val["roc_curve"][model_name]["fpr"]
+            tpr = res_val["roc_curve"][model_name]["tpr"]
+            (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
+            mean_auc = res_val["roc_curve"][model_name]["auc"]
+            plot_roc_curve(
+                fpr,
+                tpr,
+                mean_auc,
+                lower_ci,
+                upper_ci,
+                model_name=model_name,
+                lw=1.5,
+                color=colors[i],
+                alpha=alpha,
+                ax=ax,
+            )
+        plot.figsets(
+            sp=2,
+            legend=dict(
+                loc="upper right",
+                ncols=ncols,
+                fontsize=8,
+                bbox_to_anchor=[1.5, 0.6],
+                markerscale=0.8,
+            ),
         )
-    plot.figsets(
-        sp=2,
-        legend=dict(
-            loc="upper right",
-            ncols=ncols,
-            fontsize=8,
-            bbox_to_anchor=[1.5, 0.6],
-            markerscale=0.8,
-        ),
-    )
-    # plot.split_legend(ax,n=2, loc=["upper left", "lower left"],bbox=[[1,0.5],[1,0.5]],ncols=2,labelcolor="k",fontsize=8)
-    ax = nexttile(subplot_layout[0], subplot_layout[1])
-    for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
-        plot_pr_curve(
-            recall=res_val["pr_curve"][model_name]["recall"],
-            precision=res_val["pr_curve"][model_name]["precision"],
-            avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
-            model_name=model_name,
-            color=colors[i],
-            lw=1.5,
-            alpha=alpha,
-            ax=ax,
+        # plot.split_legend(ax,n=2, loc=["upper left", "lower left"],bbox=[[1,0.5],[1,0.5]],ncols=2,labelcolor="k",fontsize=8)
+        ax = nexttile(subplot_layout[0], subplot_layout[1])
+        for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
+            plot_pr_curve(
+                recall=res_val["pr_curve"][model_name]["recall"],
+                precision=res_val["pr_curve"][model_name]["precision"],
+                avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
+                model_name=model_name,
+                color=colors[i],
+                lw=1.5,
+                alpha=alpha,
+                ax=ax,
+            )
+        plot.figsets(
+            sp=2,
+            legend=dict(loc="upper right", ncols=1, fontsize=8, bbox_to_anchor=[1.5, 0.5]),
         )
-    plot.figsets(
-        sp=2,
-        legend=dict(loc="upper right", ncols=1, fontsize=8, bbox_to_anchor=[1.5, 0.5]),
-    )
-    # plot.split_legend(ax,n=2, loc=["upper left", "lower left"],bbox=[[1,0.5],[1,0.5]],ncols=2,labelcolor="k",fontsize=8)
+        # plot.split_legend(ax,n=2, loc=["upper left", "lower left"],bbox=[[1,0.5],[1,0.5]],ncols=2,labelcolor="k",fontsize=8)
+    else:
+        colors = plot.get_color(len(ips.flatten(res_val["pr_curve"].index)))
+        modname_tmp=ips.flatten(res_val["roc_curve"].index)[0]
+        classes=list(res_val["roc_curve"][modname_tmp]['fpr'].keys())
+        if res_val.shape[0] > 5:
+            alpha = 0
+            figsize = [8, 8*2*(len(classes))]  if figsize is None else figsize
+            subplot_layout = [1, 2]
+            ncols = 2
+            bbox_to_anchor = [1.5, 0.6]
+        else:
+            alpha = 0.03
+            figsize = [10, 6*(len(classes))] if figsize is None else figsize
+            subplot_layout = [1, 1]
+            ncols = 1
+            bbox_to_anchor = [1, 1]
+        nexttile = plot.subplot(2*(len(classes)),2,figsize=figsize)
+        for iclass, class_ in enumerate(classes):
+            ax = nexttile(subplot_layout[0], subplot_layout[1])
+            for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
+                fpr = res_val["roc_curve"][model_name]["fpr"][class_]
+                tpr = res_val["roc_curve"][model_name]["tpr"][class_]
+                (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"][iclass]
+                mean_auc = res_val["roc_curve"][model_name]["auc"][class_]
+                plot_roc_curve(
+                    fpr,
+                    tpr,
+                    mean_auc,
+                    lower_ci,
+                    upper_ci,
+                    model_name=model_name,
+                    lw=1.5,
+                    color=colors[i],
+                    alpha=alpha,
+                    ax=ax,
+                )
+            plot.figsets(
+                sp=2,
+                title=class_,
+                legend=dict(
+                    loc="upper right",
+                    ncols=ncols,
+                    fontsize=8,
+                    bbox_to_anchor=[1.5, 0.6],
+                    markerscale=0.8,
+                ),
+            )
+            # plot.split_legend(ax,n=2, loc=["upper left", "lower left"],bbox=[[1,0.5],[1,0.5]],ncols=2,labelcolor="k",fontsize=8)
+            ax = nexttile(subplot_layout[0], subplot_layout[1])
+            for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
+                plot_pr_curve(
+                    recall=res_val["pr_curve"][model_name]["recall"][iclass],
+                    precision=res_val["pr_curve"][model_name]["precision"][iclass],
+                    avg_precision=res_val["pr_curve"][model_name]["avg_precision"][iclass],
+                    model_name=model_name,
+                    color=colors[i],
+                    lw=1.5,
+                    alpha=alpha,
+                    ax=ax,
+                )
+            plot.figsets(
+                sp=2,
+                title=class_,
+                legend=dict(loc="upper right", ncols=1, fontsize=8, bbox_to_anchor=[1.5, 0.5]),
+            )
+def plot_validate_features_single(res_val, figsize=None,is_binary=True):
+    if is_binary:
+        if figsize is None:
+            nexttile = plot.subplot(len(ips.flatten(res_val["pr_curve"].index)), 3,figsize=[13,4*len(ips.flatten(res_val["pr_curve"].index))])
+        else:
+            nexttile = plot.subplot(
+                len(ips.flatten(res_val["pr_curve"].index)), 3, figsize=figsize
+            )
+        for model_name in ips.flatten(res_val["pr_curve"].index):
+            fpr = res_val["roc_curve"][model_name]["fpr"]
+            tpr = res_val["roc_curve"][model_name]["tpr"]
+            (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
+            mean_auc = res_val["roc_curve"][model_name]["auc"]
+            # Plotting
+            plot_roc_curve(fpr, tpr, mean_auc, lower_ci, upper_ci,
+                model_name=model_name, ax=nexttile())
+            plot.figsets(title=model_name, sp=2)
+            plot_pr_binary(
+                recall=res_val["pr_curve"][model_name]["recall"],
+                precision=res_val["pr_curve"][model_name]["precision"],
+                avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
+                model_name=model_name,
+                ax=nexttile(),
+            )
+            plot.figsets(title=model_name, sp=2)
-def plot_validate_features_single(res_val, figsize=None):
-    if figsize is None:
-        nexttile = plot.subplot(len(ips.flatten(res_val["pr_curve"].index)), 3)
+            # plot cm
+            plot_cm(res_val["confusion_matrix"][model_name], ax=nexttile(), normalize=False)
+            plot.figsets(title=model_name, sp=2)
     else:
-        nexttile = plot.subplot(
-            len(ips.flatten(res_val["pr_curve"].index)), 3, figsize=figsize
-        )
-    for model_name in ips.flatten(res_val["pr_curve"].index):
-        fpr = res_val["roc_curve"][model_name]["fpr"]
-        tpr = res_val["roc_curve"][model_name]["tpr"]
-        (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"]
-        mean_auc = res_val["roc_curve"][model_name]["auc"]
-        # Plotting
-        plot_roc_curve(fpr, tpr, mean_auc, lower_ci, upper_ci,
-            model_name=model_name, ax=nexttile())
-        plot.figsets(title=model_name, sp=2)
-        plot_pr_binary(
-            recall=res_val["pr_curve"][model_name]["recall"],
-            precision=res_val["pr_curve"][model_name]["precision"],
-            avg_precision=res_val["pr_curve"][model_name]["avg_precision"],
-            model_name=model_name,
-            ax=nexttile(),
-        )
-        plot.figsets(title=model_name, sp=2)
+        modname_tmp=ips.flatten(res_val["roc_curve"].index)[0]
+        classes=list(res_val["roc_curve"][modname_tmp]['fpr'].keys())
+        if figsize is None:
+            nexttile = plot.subplot(len(modname_tmp), 3,figsize=[15,len(modname_tmp)*5])
+        else:
+            nexttile = plot.subplot(len(modname_tmp), 3, figsize=figsize)
+        colors = plot.get_color(len(classes))
+        for i, model_name in enumerate(ips.flatten(res_val["pr_curve"].index)):
+            ax = nexttile()
+            for iclass, class_ in enumerate(classes):
+                fpr = res_val["roc_curve"][model_name]["fpr"][class_]
+                tpr = res_val["roc_curve"][model_name]["tpr"][class_]
+                (lower_ci, upper_ci) = res_val["roc_curve"][model_name]["ci95"][iclass]
+                mean_auc = res_val["roc_curve"][model_name]["auc"][class_]
+                plot_roc_curve(
+                    fpr,
+                    tpr,
+                    mean_auc,
+                    lower_ci,
+                    upper_ci,
+                    model_name=class_,
+                    lw=1.5,
+                    color=colors[iclass],
+                    alpha=0.03,
+                    ax=ax,
+                )
+            plot.figsets(
+                sp=2,
+                title=model_name,
+                legend=dict(
+                    loc="best",
+                    fontsize=8,
+                ),
+            )
+            ax = nexttile()
+            for iclass, class_ in enumerate(classes):
+                plot_pr_curve(
+                    recall=res_val["pr_curve"][model_name]["recall"][iclass],
+                    precision=res_val["pr_curve"][model_name]["precision"][iclass],
+                    avg_precision=res_val["pr_curve"][model_name]["avg_precision"][iclass],
+                    model_name=class_,
+                    color=colors[iclass],
+                    lw=1.5,
+                    alpha=0.03,
+                    ax=ax,
+                )
+            plot.figsets(
+                sp=2,
+                title=class_,
+                legend=dict(loc="best", fontsize=8),
+            )
+            plot_cm(res_val["confusion_matrix"][model_name],labels_name=classes, ax=nexttile(), normalize=False)
+            plot.figsets(title=model_name, sp=2)
-        # plot cm
-        plot_cm(res_val["confusion_matrix"][model_name], ax=nexttile(), normalize=False)
-        plot.figsets(title=model_name, sp=2)
+def cal_precision_recall(
+    y_true, y_pred_proba, is_binary=True):
+    if is_binary:
+        precision_, recall_, _ = precision_recall_curve(y_true, y_pred_proba)
+        avg_precision_ = average_precision_score(y_true, y_pred_proba)
+        return precision_, recall_,avg_precision_
+    else:
+        n_classes = y_pred_proba.shape[1]  # Number of classes
+        precision_ = []
+        recall_ = []
+        # One-vs-rest approach for multi-class precision-recall curve
+        for class_idx in range(n_classes):
+            precision, recall, _ = precision_recall_curve(
+                (y_true == class_idx).astype(int),  # Binarize true labels for the current class
+                y_pred_proba[:, class_idx],  # Probabilities for the current class
+            )
+            precision_.append(precision)
+            recall_.append(recall)
+        # Optionally, you can compute average precision for each class
+        avg_precision_ = []
+        for class_idx in range(n_classes):
+            avg_precision = average_precision_score(
+                (y_true == class_idx).astype(int),  # Binarize true labels for the current class
+                y_pred_proba[:, class_idx],  # Probabilities for the current class
+            )
+            avg_precision_.append(avg_precision)
+        return precision_, recall_,avg_precision_
 def cal_auc_ci(
-    y_true, y_pred, n_bootstraps=1000, ci=0.95, random_state=1, verbose=True
+    y_true, y_pred, n_bootstraps=1000, ci=0.95, random_state=1,is_binary=True, verbose=True
 ):
-    y_true = np.asarray(y_true)
-    y_pred = np.asarray(y_pred)
-    bootstrapped_scores = []
-    if verbose:
-        print("auroc score:", roc_auc_score(y_true, y_pred))
-    rng = np.random.RandomState(random_state)
-    for i in range(n_bootstraps):
-        # bootstrap by sampling with replacement on the prediction indices
-        indices = rng.randint(0, len(y_pred), len(y_pred))
-        if len(np.unique(y_true[indices])) < 2:
-            # We need at least one positive and one negative sample for ROC AUC
-            # to be defined: reject the sample
-            continue
-        if isinstance(y_true, np.ndarray):
-            score = roc_auc_score(y_true[indices], y_pred[indices])
-        else:
-            score = roc_auc_score(y_true.iloc[indices], y_pred.iloc[indices])
-        bootstrapped_scores.append(score)
-        # print("Bootstrap #{} ROC area: {:0.3f}".format(i + 1, score))
-    sorted_scores = np.array(bootstrapped_scores)
-    sorted_scores.sort()
-    # Computing the lower and upper bound of the 90% confidence interval
-    # You can change the bounds percentiles to 0.025 and 0.975 to get
-    # a 95% confidence interval instead.
-    confidence_lower = sorted_scores[int((1 - ci) * len(sorted_scores))]
-    confidence_upper = sorted_scores[int(ci * len(sorted_scores))]
-    if verbose:
-        print(
-            "Confidence interval for the score: [{:0.3f} - {:0.3}]".format(
-                confidence_lower, confidence_upper
+    if is_binary:
+        y_true = np.asarray(y_true)
+        y_pred = np.asarray(y_pred)
+        bootstrapped_scores = []
+        if verbose:
+            print("auroc score:", roc_auc_score(y_true, y_pred))
+        rng = np.random.RandomState(random_state)
+        for i in range(n_bootstraps):
+            # bootstrap by sampling with replacement on the prediction indices
+            indices = rng.randint(0, len(y_pred), len(y_pred))
+            if len(np.unique(y_true[indices])) < 2:
+                # We need at least one positive and one negative sample for ROC AUC
+                # to be defined: reject the sample
+                continue
+            if isinstance(y_true, np.ndarray):
+                score = roc_auc_score(y_true[indices], y_pred[indices])
+            else:
+                score = roc_auc_score(y_true.iloc[indices], y_pred.iloc[indices])
+            bootstrapped_scores.append(score)
+            # print("Bootstrap #{} ROC area: {:0.3f}".format(i + 1, score))
+        sorted_scores = np.array(bootstrapped_scores)
+        sorted_scores.sort()
+        # Computing the lower and upper bound of the 90% confidence interval
+        # You can change the bounds percentiles to 0.025 and 0.975 to get
+        # a 95% confidence interval instead.
+        confidence_lower = sorted_scores[int((1 - ci) * len(sorted_scores))]
+        confidence_upper = sorted_scores[int(ci * len(sorted_scores))]
+        if verbose:
+            print(
+                "Confidence interval for the score: [{:0.3f} - {:0.3}]".format(
+                    confidence_lower, confidence_upper
+                )
             )
-        )
-    return confidence_lower, confidence_upper
+        return confidence_lower, confidence_upper
+    else:
+        from sklearn.preprocessing import label_binarize
+        # Multi-class classification case
+        y_true = np.asarray(y_true)
+        y_pred = np.asarray(y_pred)
+        # Binarize the multi-class labels for OvR computation
+        y_true_bin = label_binarize(y_true, classes=np.unique(y_true))  # One-vs-Rest transformation
+        n_classes = y_true_bin.shape[1]  # Number of classes
+        bootstrapped_scores = np.zeros((n_classes, n_bootstraps))  # Store scores for each class
+        if verbose:
+            print("AUROC scores for each class:")
+            for i in range(n_classes):
+                print(f"Class {i}: {roc_auc_score(y_true_bin[:, i], y_pred[:, i])}")
+        rng = np.random.RandomState(random_state)
+        for i in range(n_bootstraps):
+            indices = rng.randint(0, len(y_pred), len(y_pred))
+            for class_idx in range(n_classes):
+                if len(np.unique(y_true_bin[indices, class_idx])) < 2:
+                    continue  # Reject if the class doesn't have both positive and negative samples
+                score = roc_auc_score(y_true_bin[indices, class_idx], y_pred[indices, class_idx])
+                bootstrapped_scores[class_idx, i] = score
+        # Calculating the confidence intervals for each class
+        confidence_intervals = []
+        for class_idx in range(n_classes):
+            sorted_scores = np.sort(bootstrapped_scores[class_idx])
+            confidence_lower = sorted_scores[int((1 - ci) * len(sorted_scores))]
+            confidence_upper = sorted_scores[int(ci * len(sorted_scores))]
+            confidence_intervals.append((confidence_lower, confidence_upper))
+            if verbose:
+                print(f"Class {class_idx} - Confidence interval: [{confidence_lower:.3f} - {confidence_upper:.3f}]")
+        return confidence_intervals
 def plot_roc_curve(
@@ -1517,7 +1710,7 @@ def plot_pr_binary(
     pr_boundary = interp1d(recall, precision, kind="linear", fill_value="extrapolate")
     for f_score in f_scores:
-        x_vals = np.linspace(0.01, 1, 10000)
+        x_vals = np.linspace(0.01, 1, 20000)
         y_vals = f_score * x_vals / (2 * x_vals - f_score)
         y_vals_clipped = np.minimum(y_vals, pr_boundary(x_vals))
         y_vals_clipped = np.clip(y_vals_clipped, 1e-3, None)  # Prevent going to zero
@@ -1553,7 +1746,7 @@ def plot_pr_binary(
 def plot_cm(
     cm,
     labels_name=None,
-    thresh=0.8,
+    thresh=0.8, # for set color
     axis_labels=None,
     cmap="Reds",
     normalize=True,
@@ -2029,10 +2222,16 @@ def predict(
     if isinstance(y_train, str) and y_train in x_train.columns:
         y_train_col_name = y_train
         y_train = x_train[y_train]
-        y_train = ips.df_encoder(pd.DataFrame(y_train), method="dummy")
+        # y_train = ips.df_encoder(pd.DataFrame(y_train), method="dummy")
         x_train = x_train.drop(y_train_col_name, axis=1)
-    else:
-        y_train = ips.df_encoder(pd.DataFrame(y_train), method="dummy").values.ravel()
+    # else:
+    #     y_train = ips.df_encoder(pd.DataFrame(y_train), method="dummy").values.ravel()
+    y_train=pd.DataFrame(y_train)
+    y_train_=ips.df_encoder(y_train, method="dummy",drop=None)
+    is_binary = False if y_train_.shape[1] >2 else True
+    # if is_binary:
+    #     y_train = ips.df_encoder(pd.DataFrame(y_train), method="label").values.ravel()
     if x_true is None:
         x_train, x_true, y_train, y_true = train_test_split(
@@ -2042,23 +2241,27 @@ def predict(
             random_state=random_state,
             stratify=y_train if purpose == "classification" else None,
         )
         if isinstance(y_train, str) and y_train in x_train.columns:
             y_train_col_name = y_train
             y_train = x_train[y_train]
-            y_train = ips.df_encoder(pd.DataFrame(y_train), method="dummy")
+            y_train = ips.df_encoder(pd.DataFrame(y_train), method="label") if is_binary else y_train
             x_train = x_train.drop(y_train_col_name, axis=1)
-        else:
+        if is_binary:
             y_train = ips.df_encoder(
-                pd.DataFrame(y_train), method="dummy"
-            ).values.ravel()
+                pd.DataFrame(y_train), method="label"
+            ).values.ravel()
     if y_true is not None:
         if isinstance(y_true, str) and y_true in x_true.columns:
             y_true_col_name = y_true
             y_true = x_true[y_true]
-            y_true = ips.df_encoder(pd.DataFrame(y_true), method="dummy")
+            y_true = ips.df_encoder(pd.DataFrame(y_true), method="label") if is_binary else y_true
+            y_true =  pd.DataFrame(y_true)
             x_true = x_true.drop(y_true_col_name, axis=1)
-        else:
-            y_true = ips.df_encoder(pd.DataFrame(y_true), method="dummy").values.ravel()
+        if is_binary:
+            y_true = ips.df_encoder(pd.DataFrame(y_true), method="label").values.ravel()
+            y_true =  pd.DataFrame(y_true)
     # to convert the 2D to 1D: 2D column-vector format (like [[1], [0], [1], ...]) instead of a 1D array ([1, 0, 1, ...]
@@ -2068,7 +2271,6 @@ def predict(
         y_train.ravel() if isinstance(y_train, np.ndarray) else y_train.values.ravel()
     )
     y_true = y_true.ravel() if isinstance(y_true, np.ndarray) else y_true.values.ravel()
     # Ensure common features are selected
     if common_features is not None:
         x_train, x_true = x_train[common_features], x_true[common_features]
@@ -2077,10 +2279,7 @@ def predict(
         x_train, x_true = x_train[share_col_names], x_true[share_col_names]
     x_train, x_true = ips.df_scaler(x_train), ips.df_scaler(x_true)
-    x_train, x_true = ips.df_encoder(x_train, method="dummy"), ips.df_encoder(
-        x_true, method="dummy"
-    )
+    x_train, x_true = ips.df_encoder(x_train, method="dummy"), ips.df_encoder(x_true, method="dummy")
     # Handle class imbalance using SMOTE (only for classification)
     if (
         smote
@@ -2091,7 +2290,13 @@ def predict(
         smote_sampler = SMOTE(random_state=random_state)
         x_train, y_train = smote_sampler.fit_resample(x_train, y_train)
+    if not is_binary:
+        if isinstance(y_train, np.ndarray):
+            y_train = ips.df_encoder(data=pd.DataFrame(y_train),method='label')
+            y_train=np.asarray(y_train)
+        if isinstance(y_train, np.ndarray):
+            y_true = ips.df_encoder(data=pd.DataFrame(y_true),method='label')
+            y_true=np.asarray(y_true)
     # Hyperparameter grids for tuning
     if cv_level in ["low", "simple", "s", "l"]:
         param_grids = {
@@ -2670,95 +2875,177 @@ def predict(
             print(f"\nTraining and validating {name}:")
         # Grid search with KFold or StratifiedKFold
-        gs = GridSearchCV(
-            clf,
-            param_grid=param_grids.get(name, {}),
-            scoring=(
-                "roc_auc" if purpose == "classification" else "neg_mean_squared_error"
-            ),
-            cv=cv,
-            n_jobs=n_jobs,
-            verbose=verbose,
-        )
-        gs.fit(x_train, y_train)
-        best_clf = gs.best_estimator_
-        # make sure x_train and x_test has the same name
-        x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
-        y_pred = best_clf.predict(x_true)
-        # y_pred_proba
-        if hasattr(best_clf, "predict_proba"):
-            y_pred_proba = best_clf.predict_proba(x_true)[:, 1]
-        elif hasattr(best_clf, "decision_function"):
-            # If predict_proba is not available, use decision_function (e.g., for SVM)
-            y_pred_proba = best_clf.decision_function(x_true)
-            # Ensure y_pred_proba is within 0 and 1 bounds
-            y_pred_proba = (y_pred_proba - y_pred_proba.min()) / (
-                y_pred_proba.max() - y_pred_proba.min()
+        if is_binary:
+            gs = GridSearchCV(
+                clf,
+                param_grid=param_grids.get(name, {}),
+                scoring=(
+                    "roc_auc" if purpose == "classification" else "neg_mean_squared_error"
+                ),
+                cv=cv,
+                n_jobs=n_jobs,
+                verbose=verbose,
             )
+            gs.fit(x_train, y_train)
+            best_clf = gs.best_estimator_
+            # make sure x_train and x_test has the same name
+            x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
+            y_pred = best_clf.predict(x_true)
+            if hasattr(best_clf, "predict_proba"):
+                y_pred_proba = best_clf.predict_proba(x_true)[:, 1]
+            elif hasattr(best_clf, "decision_function"):
+                # If predict_proba is not available, use decision_function (e.g., for SVM)
+                y_pred_proba = best_clf.decision_function(x_true)
+                # Ensure y_pred_proba is within 0 and 1 bounds
+                y_pred_proba = (y_pred_proba - y_pred_proba.min()) / (
+                    y_pred_proba.max() - y_pred_proba.min()
+                )
+            else:
+                y_pred_proba = None  # No probability output for certain models
         else:
-            y_pred_proba = None  # No probability output for certain models
+            gs = GridSearchCV(
+                clf,
+                param_grid=param_grids.get(name, {}),
+                scoring=(
+                    "roc_auc_ovr" if purpose == "classification" else "neg_mean_squared_error"
+                ),
+                cv=cv,
+                n_jobs=n_jobs,
+                verbose=verbose,
+            )
+            # Fit GridSearchCV
+            gs.fit(x_train, y_train)
+            best_clf = gs.best_estimator_
+            # Ensure x_true aligns with x_train columns
+            x_true = x_true.reindex(columns=x_train.columns, fill_value=0)
+            y_pred = best_clf.predict(x_true)
+            # Handle prediction probabilities for multiclass
+            if hasattr(best_clf, "predict_proba"):
+                y_pred_proba = best_clf.predict_proba(x_true)
+            elif hasattr(best_clf, "decision_function"):
+                y_pred_proba = best_clf.decision_function(x_true)
+                # Normalize for multiclass if necessary
+                if y_pred_proba.ndim == 2:
+                    y_pred_proba = (y_pred_proba - y_pred_proba.min(axis=1, keepdims=True)) / \
+                                (y_pred_proba.max(axis=1, keepdims=True) - y_pred_proba.min(axis=1, keepdims=True))
+            else:
+                y_pred_proba = None  # No probability output for certain models
         validation_scores = {}
-        if y_true is not None:
+        if y_true is not None and y_pred_proba is not None:
             validation_scores = cal_metrics(
                 y_true,
                 y_pred,
                 y_pred_proba=y_pred_proba,
+                is_binary=is_binary,
                 purpose=purpose,
                 average="weighted",
             )
-            # Calculate ROC curve
-            # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
-            if y_pred_proba is not None:
-                # fpr, tpr, roc_auc = dict(), dict(), dict()
-                fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
-                lower_ci, upper_ci = cal_auc_ci(y_true, y_pred_proba, verbose=False)
-                roc_auc = auc(fpr, tpr)
-                roc_info = {
-                    "fpr": fpr.tolist(),
-                    "tpr": tpr.tolist(),
-                    "auc": roc_auc,
-                    "ci95": (lower_ci, upper_ci),
-                }
-                # precision-recall curve
-                precision_, recall_, _ = precision_recall_curve(y_true, y_pred_proba)
-                avg_precision_ = average_precision_score(y_true, y_pred_proba)
-                pr_info = {
-                    "precision": precision_,
-                    "recall": recall_,
-                    "avg_precision": avg_precision_,
-                }
-            else:
-                roc_info, pr_info = None, None
-            if purpose == "classification":
-                results[name] = {
-                    "best_clf": gs.best_estimator_,
-                    "best_params": gs.best_params_,
-                    "auc_indiv": [
-                        gs.cv_results_[f"split{i}_test_score"][gs.best_index_]
-                        for i in range(cv_folds)
-                    ],
-                    "scores": validation_scores,
-                    "roc_curve": roc_info,
-                    "pr_curve": pr_info,
-                    "confusion_matrix": confusion_matrix(y_true, y_pred),
-                    "predictions": y_pred.tolist(),
-                    "predictions_proba": (
-                        y_pred_proba.tolist() if y_pred_proba is not None else None
-                    ),
-                }
-            else:  # "regression"
-                results[name] = {
-                    "best_clf": gs.best_estimator_,
-                    "best_params": gs.best_params_,
-                    "scores": validation_scores,  # e.g., neg_MSE, R², etc.
-                    "predictions": y_pred.tolist(),
-                    "predictions_proba": (
-                        y_pred_proba.tolist() if y_pred_proba is not None else None
-                    ),
-                }
+            if is_binary:
+                # Calculate ROC curve
+                # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
+                if y_pred_proba is not None:
+                    # fpr, tpr, roc_auc = dict(), dict(), dict()
+                    fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
+                    lower_ci, upper_ci = cal_auc_ci(y_true, y_pred_proba, verbose=False,is_binary=is_binary)
+                    roc_auc = auc(fpr, tpr)
+                    roc_info = {
+                        "fpr": fpr.tolist(),
+                        "tpr": tpr.tolist(),
+                        "auc": roc_auc,
+                        "ci95": (lower_ci, upper_ci),
+                    }
+                    # precision-recall curve
+                    precision_, recall_, _ = cal_precision_recall(y_true, y_pred_proba)
+                    avg_precision_ = average_precision_score(y_true, y_pred_proba)
+                    pr_info = {
+                        "precision": precision_,
+                        "recall": recall_,
+                        "avg_precision": avg_precision_,
+                    }
+                else:
+                    roc_info, pr_info = None, None
+                if purpose == "classification":
+                    results[name] = {
+                        "best_clf": gs.best_estimator_,
+                        "best_params": gs.best_params_,
+                        "auc_indiv": [
+                            gs.cv_results_[f"split{i}_test_score"][gs.best_index_]
+                            for i in range(cv_folds)
+                        ],
+                        "scores": validation_scores,
+                        "roc_curve": roc_info,
+                        "pr_curve": pr_info,
+                        "confusion_matrix": confusion_matrix(y_true, y_pred),
+                        "predictions": y_pred.tolist(),
+                        "predictions_proba": (
+                            y_pred_proba.tolist() if y_pred_proba is not None else None
+                        ),
+                    }
+                else:  # "regression"
+                    results[name] = {
+                        "best_clf": gs.best_estimator_,
+                        "best_params": gs.best_params_,
+                        "scores": validation_scores,  # e.g., neg_MSE, R², etc.
+                        "predictions": y_pred.tolist(),
+                        "predictions_proba": (
+                            y_pred_proba.tolist() if y_pred_proba is not None else None
+                        ),
+                    }
+            else: # multi-classes
+                if y_pred_proba is not None:
+                    # fpr, tpr, roc_auc = dict(), dict(), dict()
+                    # fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
+                    confidence_intervals = cal_auc_ci(y_true, y_pred_proba, verbose=False,is_binary=is_binary)
+                    roc_info = {
+                        "fpr": validation_scores["fpr"],
+                        "tpr": validation_scores["tpr"],
+                        "auc": validation_scores["roc_auc_by_class"],
+                        "ci95": confidence_intervals,
+                    }
+                    # precision-recall curve
+                    precision_, recall_, avg_precision_ = cal_precision_recall(y_true, y_pred_proba,is_binary=is_binary)
+                    pr_info = {
+                        "precision": precision_,
+                        "recall": recall_,
+                        "avg_precision": avg_precision_,
+                    }
+                else:
+                    roc_info, pr_info = None, None
+                if purpose == "classification":
+                    results[name] = {
+                        "best_clf": gs.best_estimator_,
+                        "best_params": gs.best_params_,
+                        "auc_indiv": [
+                            gs.cv_results_[f"split{i}_test_score"][gs.best_index_]
+                            for i in range(cv_folds)
+                        ],
+                        "scores": validation_scores,
+                        "roc_curve": roc_info,
+                        "pr_curve": pr_info,
+                        "confusion_matrix": confusion_matrix(y_true, y_pred),
+                        "predictions": y_pred.tolist(),
+                        "predictions_proba": (
+                            y_pred_proba.tolist() if y_pred_proba is not None else None
+                        ),
+                    }
+                else:  # "regression"
+                    results[name] = {
+                        "best_clf": gs.best_estimator_,
+                        "best_params": gs.best_params_,
+                        "scores": validation_scores,  # e.g., neg_MSE, R², etc.
+                        "predictions": y_pred.tolist(),
+                        "predictions_proba": (
+                            y_pred_proba.tolist() if y_pred_proba is not None else None
+                        ),
+                    }
         else:
             results[name] = {
@@ -2773,7 +3060,6 @@ def predict(
     # Convert results to DataFrame
     df_results = pd.DataFrame.from_dict(results, orient="index")
     # sort
     if y_true is not None and purpose == "classification":
         df_scores = pd.DataFrame(
@@ -2790,26 +3076,29 @@ def predict(
             plot.figsets(xangle=30)
             if dir_save:
                 ips.figsave(dir_save + f"scores_sorted_heatmap{now_}.pdf")
+            df_scores=df_scores.select_dtypes(include=np.number)
+            display(df_scores)
             if df_scores.shape[0] > 1:  # draw cluster
                 plot.heatmap(df_scores, kind="direct", cluster=True)
                 plot.figsets(xangle=30)
                 if dir_save:
                     ips.figsave(dir_save + f"scores_clus{now_}.pdf")
     if all([plot_, y_true is not None, purpose == "classification"]):
-        try:
-            if len(models) > 3:
-                plot_validate_features(df_results)
-            else:
-                plot_validate_features_single(df_results, figsize=(12, 4 * len(models)))
-            if dir_save:
-                ips.figsave(dir_save + f"validate_features{now_}.pdf")
-        except Exception as e:
-            print(f"Error: 在画图的过程中出现了问题:{e}")
+        # try:
+        if len(models) > 3:
+            plot_validate_features(df_results,is_binary=is_binary)
+        else:
+            plot_validate_features_single(df_results, is_binary=is_binary)
+        if dir_save:
+            ips.figsave(dir_save + f"validate_features{now_}.pdf")
+        # except Exception as e:
+        #     print(f"Error: 在画图的过程中出现了问题:{e}")
     return df_results
 def cal_metrics(
-    y_true, y_pred, y_pred_proba=None, purpose="regression", average="weighted"
+    y_true, y_pred, y_pred_proba=None, is_binary=True,purpose="regression", average="weighted"
 ):
     """
     Calculate regression or classification metrics based on the purpose.
@@ -2879,16 +3168,49 @@ def cal_metrics(
         }
         # Confusion matrix to calculate specificity
-        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
-        validation_scores["specificity"] = (
-            tn / (tn + fp) if (tn + fp) > 0 else 0
-        )  # Specificity calculation
+        if is_binary:
+            tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
+            # Specificity calculation
+            validation_scores["specificity"] = (
+                tn / (tn + fp) if (tn + fp) > 0 else 0
+            )
+            if y_pred_proba is not None:
+                # Calculate ROC-AUC
+                validation_scores["roc_auc"] = roc_auc_score(y_true, y_pred_proba)
+                # PR-AUC (Precision-Recall AUC) calculation
+                validation_scores["pr_auc"] = average_precision_score(y_true, y_pred_proba)
+        else: # multi-class
+            from sklearn.preprocessing import label_binarize
+            #* Multi-class ROC calculation
+            y_pred_proba = np.asarray(y_pred_proba)
+            classes = np.unique(y_true)
+            y_true_bin = label_binarize(y_true, classes=classes)
+            if isinstance(y_true, np.ndarray):
+                y_true = ips.df_encoder(data=pd.DataFrame(y_true), method='dum',prefix='Label')
+            # Initialize dictionaries to store FPR, TPR, and AUC for each class
+            fpr = dict()
+            tpr = dict()
+            roc_auc = dict()
+            for i, class_label in enumerate(classes):
+                fpr[class_label], tpr[class_label], _ = roc_curve(y_true_bin[:, i], y_pred_proba[:, i])
+                roc_auc[class_label] = auc(fpr[class_label], tpr[class_label])
+            # Store the mean ROC AUC
+            try:
+                validation_scores["roc_auc"] = roc_auc_score(
+                    y_true, y_pred_proba, multi_class="ovr", average=average
+                )
+            except Exception as e:
+                y_pred_proba = y_pred_proba / y_pred_proba.sum(axis=1, keepdims=True)
+                validation_scores["roc_auc"] = roc_auc_score(
+                    y_true, y_pred_proba, multi_class="ovr", average=average
+                )
+            validation_scores["roc_auc_by_class"] = roc_auc  # Individual class AUCs
+            validation_scores["fpr"] = fpr
+            validation_scores["tpr"] = tpr
-        if y_pred_proba is not None:
-            # Calculate ROC-AUC
-            validation_scores["roc_auc"] = roc_auc_score(y_true, y_pred_proba)
-            # PR-AUC (Precision-Recall AUC) calculation
-            validation_scores["pr_auc"] = average_precision_score(y_true, y_pred_proba)
     else:
         raise ValueError(
             "Invalid purpose specified. Choose 'regression' or 'classification'."

py2ls/translator.py CHANGED Viewed

@@ -586,6 +586,8 @@ def replace_text(text, dict_replace=None, robust=True):
     Returns:
         str: The text after replacements have been made.
     """
+    if not all(text):
+        return ''
     # Default replacements for newline and tab characters
     default_replacements = {
         "\a": "",

{py2ls-0.2.4.14.dist-info → py2ls-0.2.4.15.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: py2ls
-Version: 0.2.4.14
+Version: 0.2.4.15
 Summary: py(thon)2(too)ls
 Author: Jianfeng
 Author-email: Jianfeng.Liu0413@gmail.com

{py2ls-0.2.4.14.dist-info → py2ls-0.2.4.15.dist-info}/RECORD RENAMED Viewed

@@ -235,7 +235,7 @@ py2ls/fetch_update.py,sha256=9LXj661GpCEFII2wx_99aINYctDiHni6DOruDs_fdt8,4752
 py2ls/freqanalysis.py,sha256=F4218VSPbgL5tnngh6xNCYuNnfR-F_QjECUUxrPYZss,32594
 py2ls/ich2ls.py,sha256=3E9R8oVpyYZXH5PiIQgT3CN5NxLe4Dwtm2LwaeacE6I,21381
 py2ls/ips.py,sha256=O2QdLo6-vPbHvWtlVdtMA49LAn2y0CNVM27cxLbqqYA,271496
-py2ls/ml2ls.py,sha256=LovnWDV9ptdWuWwJF5EEdf3sGY4EniGBBNxRJJbzStw,112784
+py2ls/ml2ls.py,sha256=LutEbrIF2KcBdz8jnbR3EZ4WTjRTuVGPvskUsuX2ZoA,128551
 py2ls/mol.py,sha256=AZnHzarIk_MjueKdChqn1V6e4tUle3X1NnHSFA6n3Nw,10645
 py2ls/netfinder.py,sha256=R70NkrnO8LlXjT1y7bf2TN-yE4yOeAYhb0jDBiNp8XA,57536
 py2ls/ocr.py,sha256=5lhUbJufIKRSOL6wAWVLEo8TqMYSjoI_Q-IO-_4u3DE,31419
@@ -243,8 +243,8 @@ py2ls/plot.py,sha256=X0R1KK_UTdeJazjnqTqYvP-uWu6wY8szQHyJMsDDz2s,171515
 py2ls/setuptools-70.1.0-py3-none-any.whl,sha256=2bi3cUVal8ip86s0SOvgspteEF8SKLukECi-EWmFomc,882588
 py2ls/sleep_events_detectors.py,sha256=bQA3HJqv5qnYKJJEIhCyhlDtkXQfIzqksnD0YRXso68,52145
 py2ls/stats.py,sha256=qBn2rJmNa_QLLUqjwYqXUlGzqmW94sgA1bxJU2FC3r0,39175
-py2ls/translator.py,sha256=zBeq4pYZeroqw3DT-5g7uHfVqKd-EQptT6LJ-Adi8JY,34244
+py2ls/translator.py,sha256=77Tp_GjmiiwFbEIJD_q3VYpQ43XL9ZeJo6Mhl44mvh8,34284
 py2ls/wb_detector.py,sha256=7y6TmBUj9exCZeIgBAJ_9hwuhkDh1x_-yg4dvNY1_GQ,6284
-py2ls-0.2.4.14.dist-info/METADATA,sha256=SSjNh_FXmxwIF_Xx2fZvSGKZaX997x4sfJxUQckMuGY,20046
-py2ls-0.2.4.14.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
-py2ls-0.2.4.14.dist-info/RECORD,,
+py2ls-0.2.4.15.dist-info/METADATA,sha256=MbwWj3zOohusA3UxDrIgR6S3Zms5tdWbcWjw9-dA57U,20046
+py2ls-0.2.4.15.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
+py2ls-0.2.4.15.dist-info/RECORD,,

{py2ls-0.2.4.14.dist-info → py2ls-0.2.4.15.dist-info}/WHEEL RENAMED Viewed

File without changes

py2ls-0.2.4.14-py3-none-any.whl → py2ls-0.2.4.15-py3-none-any.whl

py2ls-0.2.4.14-py3-none-any.whl → py2ls-0.2.4.15-py3-none-any.whl