PyPI - pg-sui - Versions diffs - 1.0.2.1__py3-none-any.whl → 1.6.8__py3-none-any.whl - Mend

pg-sui 1.0.2.1__py3-none-any.whl → 1.6.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pg-sui might be problematic. Click here for more details.

Files changed (112)

{pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info}/METADATA +51 -70
pg_sui-1.6.8.dist-info/RECORD +78 -0
{pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info}/WHEEL +1 -1
pg_sui-1.6.8.dist-info/entry_points.txt +4 -0
pg_sui-1.6.8.dist-info/top_level.txt +1 -0
pgsui/__init__.py +35 -54
pgsui/_version.py +34 -0
pgsui/cli.py +635 -0
pgsui/data_processing/config.py +576 -0
pgsui/data_processing/containers.py +1782 -0
pgsui/data_processing/transformers.py +121 -1103
pgsui/electron/app/__main__.py +5 -0
pgsui/electron/app/icons/icons/1024x1024.png +0 -0
pgsui/electron/app/icons/icons/128x128.png +0 -0
pgsui/electron/app/icons/icons/16x16.png +0 -0
pgsui/electron/app/icons/icons/24x24.png +0 -0
pgsui/electron/app/icons/icons/256x256.png +0 -0
pgsui/electron/app/icons/icons/32x32.png +0 -0
pgsui/electron/app/icons/icons/48x48.png +0 -0
pgsui/electron/app/icons/icons/512x512.png +0 -0
pgsui/electron/app/icons/icons/64x64.png +0 -0
pgsui/electron/app/icons/icons/icon.icns +0 -0
pgsui/electron/app/icons/icons/icon.ico +0 -0
pgsui/electron/app/main.js +189 -0
pgsui/electron/app/package-lock.json +6893 -0
pgsui/electron/app/package.json +50 -0
pgsui/electron/app/preload.js +15 -0
pgsui/electron/app/server.py +146 -0
pgsui/electron/app/ui/logo.png +0 -0
pgsui/electron/app/ui/renderer.js +130 -0
pgsui/electron/app/ui/styles.css +59 -0
pgsui/electron/app/ui/ui_shim.js +72 -0
pgsui/electron/bootstrap.py +43 -0
pgsui/electron/launch.py +59 -0
pgsui/electron/package.json +14 -0
pgsui/example_data/popmaps/{test.popmap → phylogen_nomx.popmap} +185 -99
pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz +0 -0
pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz.tbi +0 -0
pgsui/impute/deterministic/imputers/allele_freq.py +691 -0
pgsui/impute/deterministic/imputers/mode.py +679 -0
pgsui/impute/deterministic/imputers/nmf.py +221 -0
pgsui/impute/deterministic/imputers/phylo.py +971 -0
pgsui/impute/deterministic/imputers/ref_allele.py +530 -0
pgsui/impute/supervised/base.py +339 -0
pgsui/impute/supervised/imputers/hist_gradient_boosting.py +293 -0
pgsui/impute/supervised/imputers/random_forest.py +287 -0
pgsui/impute/unsupervised/base.py +924 -0
pgsui/impute/unsupervised/callbacks.py +89 -263
pgsui/impute/unsupervised/imputers/autoencoder.py +972 -0
pgsui/impute/unsupervised/imputers/nlpca.py +1264 -0
pgsui/impute/unsupervised/imputers/ubp.py +1288 -0
pgsui/impute/unsupervised/imputers/vae.py +957 -0
pgsui/impute/unsupervised/loss_functions.py +158 -0
pgsui/impute/unsupervised/models/autoencoder_model.py +208 -558
pgsui/impute/unsupervised/models/nlpca_model.py +149 -468
pgsui/impute/unsupervised/models/ubp_model.py +198 -1317
pgsui/impute/unsupervised/models/vae_model.py +259 -618
pgsui/impute/unsupervised/nn_scorers.py +215 -0
pgsui/utils/classification_viz.py +591 -0
pgsui/utils/misc.py +35 -480
pgsui/utils/plotting.py +514 -824
pgsui/utils/scorers.py +212 -438
pg_sui-1.0.2.1.dist-info/RECORD +0 -75
pg_sui-1.0.2.1.dist-info/top_level.txt +0 -3
pgsui/example_data/phylip_files/test_n10.phy +0 -118
pgsui/example_data/phylip_files/test_n100.phy +0 -118
pgsui/example_data/phylip_files/test_n2.phy +0 -118
pgsui/example_data/phylip_files/test_n500.phy +0 -118
pgsui/example_data/structure_files/test.nopops.1row.10sites.str +0 -117
pgsui/example_data/structure_files/test.nopops.2row.100sites.str +0 -234
pgsui/example_data/structure_files/test.nopops.2row.10sites.str +0 -234
pgsui/example_data/structure_files/test.nopops.2row.30sites.str +0 -234
pgsui/example_data/structure_files/test.nopops.2row.allsites.str +0 -234
pgsui/example_data/structure_files/test.pops.1row.10sites.str +0 -117
pgsui/example_data/structure_files/test.pops.2row.10sites.str +0 -234
pgsui/example_data/trees/test.iqtree +0 -376
pgsui/example_data/trees/test.qmat +0 -5
pgsui/example_data/trees/test.rate +0 -2033
pgsui/example_data/trees/test.tre +0 -1
pgsui/example_data/trees/test_n10.rate +0 -19
pgsui/example_data/trees/test_n100.rate +0 -109
pgsui/example_data/trees/test_n500.rate +0 -509
pgsui/example_data/trees/test_siterates.txt +0 -2024
pgsui/example_data/trees/test_siterates_n10.txt +0 -10
pgsui/example_data/trees/test_siterates_n100.txt +0 -100
pgsui/example_data/trees/test_siterates_n500.txt +0 -500
pgsui/example_data/vcf_files/test.vcf +0 -244
pgsui/example_data/vcf_files/test.vcf.gz +0 -0
pgsui/example_data/vcf_files/test.vcf.gz.tbi +0 -0
pgsui/impute/estimators.py +0 -735
pgsui/impute/impute.py +0 -1486
pgsui/impute/simple_imputers.py +0 -1439
pgsui/impute/supervised/iterative_imputer_fixedparams.py +0 -785
pgsui/impute/supervised/iterative_imputer_gridsearch.py +0 -1027
pgsui/impute/unsupervised/keras_classifiers.py +0 -702
pgsui/impute/unsupervised/models/in_development/cnn_model.py +0 -486
pgsui/impute/unsupervised/neural_network_imputers.py +0 -1424
pgsui/impute/unsupervised/neural_network_methods.py +0 -1549
pgsui/pg_sui.py +0 -261
pgsui/utils/sequence_tools.py +0 -407
simulation/sim_benchmarks.py +0 -333
simulation/sim_treeparams.py +0 -475
test/__init__.py +0 -0
test/pg_sui_simtest.py +0 -215
test/pg_sui_testing.py +0 -523
test/test.py +0 -297
test/test_pgsui.py +0 -374
test/test_tkc.py +0 -214
{pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info/licenses}/LICENSE +0 -0
pgsui/{example_data/trees → electron/app}/__init__.py +0 -0
pgsui/impute/{unsupervised/models/in_development → supervised/imputers}/__init__.py +0 -0
{simulation → pgsui/impute/unsupervised/imputers}/__init__.py +0 -0

pgsui/utils/plotting.py CHANGED

@@ -1,920 +1,610 @@
-import os
-import sys
-from itertools import cycle
-from pathlib import Path
+import logging
 import warnings
+from pathlib import Path
+from typing import Dict, List, Literal, Optional, Sequence
-warnings.simplefilter(action="ignore", category=FutureWarning)
+import matplotlib as mpl
+# Use Agg backend for headless plotting
+mpl.use("Agg")
+import matplotlib.pyplot as plt
 import numpy as np
+import optuna
 import pandas as pd
-import matplotlib.pyplot as plt
 import seaborn as sns
-import plotly.express as px
-from sklearn.decomposition import PCA
-from sklearn.preprocessing import StandardScaler
-from sklearn_genetic.utils import logbook_to_pandas
-from sklearn.metrics import ConfusionMatrixDisplay
-try:
-    from . import misc
-except (ModuleNotFoundError, ValueError, ImportError):
-    from utils import misc
+import torch
+from optuna.exceptions import ExperimentalWarning
+from sklearn.metrics import (
+    ConfusionMatrixDisplay,
+    auc,
+    average_precision_score,
+    precision_recall_curve,
+    roc_curve,
+)
+from sklearn.preprocessing import label_binarize
+from snpio.utils.logging import LoggerManager
+from pgsui.utils import misc
+# Quiet Matplotlib/fontTools INFO logging when saving PDF/SVG
+for name in (
+    "fontTools",
+    "fontTools.subset",
+    "fontTools.ttLib",
+    "matplotlib.font_manager",
+):
+    lg = logging.getLogger(name)
+    lg.setLevel(logging.WARNING)
+    lg.propagate = False
 class Plotting:
-    """Functions for plotting imputer scoring and results."""
-    @staticmethod
-    def plot_grid_search(cv_results, nn_method, prefix):
-        """Plot cv_results\_ from a grid search for each parameter.
-        Saves a figure to disk.
+    """Class for plotting imputer scoring and results.
+    This class is used to plot the performance metrics of imputation models. It can plot ROC and Precision-Recall curves, model history, and the distribution of genotypes in the dataset.
+    Example:
+        >>> from pgsui import Plotting
+        >>> plotter = Plotting(model_name="ImputeVAE")
+        >>> plotter.plot_metrics(metrics, num_classes)
+        >>> plotter.plot_history(history)
+        >>> plotter.plot_certainty_heatmap(y_certainty)
+        >>> plotter.plot_confusion_matrix(y_true_1d, y_pred_1d)
+        >>> plotter.plot_gt_distribution(df)
+        >>> plotter.plot_label_clusters(z_mean, z_log_var)
+    Attributes:
+        model_name (str): Name of the model.
+        prefix (str): Prefix for the output directory.
+        plot_format (Literal["pdf", "png", "jpeg", "jpg"]): Format for the plots ('pdf', 'png', 'jpeg', 'jpg').
+        plot_fontsize (int): Font size for the plots.
+        plot_dpi (int): Dots per inch for the plots.
+        title_fontsize (int): Font size for the plot titles.
+        show_plots (bool): Whether to display the plots.
+        output_dir (Path): Directory where plots will be saved.
+        logger (logging.Logger): Logger instance for logging messages.
+    """
+    def __init__(
+        self,
+        model_name: str,
+        *,
+        prefix: str = "pgsui",
+        plot_format: Literal["pdf", "png", "jpeg", "jpg"] = "pdf",
+        plot_fontsize: int = 18,
+        plot_dpi: int = 300,
+        title_fontsize: int = 20,
+        despine: bool = True,
+        show_plots: bool = False,
+        verbose: int = 0,
+        debug: bool = False,
+    ) -> None:
+        """Initialize the Plotting object.
+        This class is used to plot the performance metrics of imputation models. It can plot ROC and Precision-Recall curves, model history, and the distribution of genotypes in the dataset.
         Args:
-            cv_results (np.ndarray): the cv_results\_ attribute from a trained grid search object.
-            nn_method (str): Neural network algorithm name.
-            prefix (str): Prefix to use for saving the plot to file.
+            model_name (str): Name of the model.
+            prefix (str, optional): Prefix for the output directory. Defaults to 'pgsui'.
+            plot_format (Literal["pdf", "png", "jpeg", "jpg"]): Format for the plots ('pdf', 'png', 'jpeg', 'jpg'). Defaults to 'pdf'.
+            plot_fontsize (int): Font size for the plots. Defaults to 18.
+            plot_dpi (int): Dots per inch for the plots. Defaults to 300.
+            title_fontsize (int): Font size for the plot titles. Defaults to 20.
+            despine (bool): Whether to remove the top and right spines from the plots. Defaults to True.
+            show_plots (bool): Whether to display the plots. Defaults to False.
+            verbose (int): Verbosity level for logging. Defaults to 0.
+            debug (bool): Whether to enable debug mode. Defaults to False.
         """
-        ## Results from grid search
-        results = pd.DataFrame(cv_results)
-        means_test = [col for col in results if col.startswith("mean_test_")]
-        filter_col = [col for col in results if col.startswith("param_")]
-        params_df = results[filter_col].astype(str)
-        for i, col in enumerate(means_test):
-            params_df[col] = results[means_test[i]]
-        # Get number of needed subplot rows.
-        tot = len(filter_col)
-        cols = 4
-        rows = int(np.ceil(tot / cols))
-        fig = plt.figure(1, figsize=(20, 10))
-        fig.tight_layout(pad=3.0)
-        # Set font properties.
-        font = {"size": 12}
-        plt.rc("font", **font)
-        for i, p in enumerate(filter_col, start=1):
-            ax = fig.add_subplot(rows, cols, i)
-            # Plot each metric.
-            for col in means_test:
-                # Get maximum score for each parameter setting.
-                df_plot = params_df.groupby(p)[col].agg("max")
-                # Convert to float if not supposed to be string.
-                try:
-                    df_plot.index = df_plot.index.astype(float)
-                except TypeError:
-                    pass
-                # Sort by index (numerically if possible).
-                df_plot = df_plot.sort_index()
-                # Remove prefix from score name.
-                col_new_name = col[len("mean_test_") :]
-                ax.plot(
-                    df_plot.index.astype(str),
-                    df_plot.values,
-                    "-o",
-                    label=col_new_name,
-                )
-                ax.legend(loc="best")
-            param_new_name = p[len("param_") :]
-            ax.set_xlabel(param_new_name.lower())
-            ax.set_ylabel("Max Score")
-            ax.set_ylim([0, 1])
-        fig.savefig(
-            os.path.join(
-                f"{prefix}_output",
-                "plots",
-                "Unsupervised",
-                nn_method,
-                "gridsearch_metrics.pdf",
-            ),
-            bbox_inches="tight",
-            facecolor="white",
+        logman = LoggerManager(
+            name=__name__, prefix=prefix, verbose=verbose, debug=debug
         )
+        self.logger = logman.get_logger()
+        self.model_name = model_name
+        self.prefix = prefix
+        self.plot_format = plot_format
+        self.plot_fontsize = plot_fontsize
+        self.plot_dpi = plot_dpi
+        self.title_fontsize = title_fontsize
+        self.show_plots = show_plots
+        if self.plot_format.startswith("."):
+            self.plot_format = self.plot_format.lstrip(".")
+        self.param_dict = {
+            "axes.labelsize": self.plot_fontsize,
+            "axes.titlesize": self.title_fontsize,
+            "axes.spines.top": despine,
+            "axes.spines.right": despine,
+            "xtick.labelsize": self.plot_fontsize,
+            "ytick.labelsize": self.plot_fontsize,
+            "legend.fontsize": self.plot_fontsize,
+            "legend.facecolor": "white",
+            "figure.titlesize": self.title_fontsize,
+            "figure.dpi": self.plot_dpi,
+            "figure.facecolor": "white",
+            "axes.linewidth": 2.0,
+            "lines.linewidth": 2.0,
+            "font.size": self.plot_fontsize,
+            "savefig.bbox": "tight",
+            "savefig.facecolor": "white",
+            "savefig.dpi": self.plot_dpi,
+        }
-    @staticmethod
-    def plot_metrics(metrics, num_classes, prefix, nn_method):
-        """Plot AUC-ROC and Precision-Recall performance metrics for neural network classifier.
+        mpl.rcParams.update(self.param_dict)
-        Saves plot to PDF file on disk.
+        unsuper = {"ImputeVAE", "ImputeNLPCA", "ImputeAutoencoder", "ImputeUBP"}
+        det = {
+            "ImputeRefAllele",
+            "ImputeMostFrequent",
+            "ImputeMostFrequentPerPop",
+            "ImputePhylo",
+        }
+        sup = {"ImputeRandomForest", "ImputeHistGradientBoosting"}
+        if model_name in unsuper:
+            plot_dir = "Unsupervised"
+        elif model_name in det:
+            plot_dir = "Deterministic"
+        elif model_name in sup:
+            plot_dir = "Supervised"
+        else:
+            msg = f"model_name '{model_name}' not recognized."
+            self.logger.error(msg)
+            raise ValueError(msg)
-        Args:
-            metrics (Dict[str, Any]): Per-class, micro, and macro-averaged metrics including accuracy, ROC-AUC, and Precision-Recall with Average Precision scores.
+        self.output_dir = Path(f"{self.prefix}_output", plot_dir)
+        self.output_dir = self.output_dir / "plots" / model_name
+        self.output_dir.mkdir(parents=True, exist_ok=True)
-            num_classes (int): Number of classes evaluated.
+    def plot_tuning(
+        self,
+        study: optuna.study.Study,
+        model_name: str,
+        target_name: str = "Objective Value",
+    ) -> None:
+        """Plot the optimization history of a study.
-            prefix (str): Prefix to use for output plot.
+        This method plots the optimization history of a study. The plot is saved to disk as a ``<plot_format>`` file.
-            nn_method (str): Neural network algorithm being used.
+        Args:
+            study (optuna.study.Study): Optuna study object.
+            model_name (str): Name of the model.
+            target_name (str, optional): Name of the target value. Defaults to 'Objective Value'.
         """
-        # Set font properties.
-        font = {"size": 12}
-        plt.rc("font", **font)
-        fn = os.path.join(
-            f"{prefix}_output",
-            "plots",
-            "Unsupervised",
-            nn_method,
-            f"auc_pr_curves.pdf",
-        )
-        fig = plt.figure(figsize=(20, 10))
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", category=UserWarning)
+            warnings.filterwarnings("ignore", category=FutureWarning)
+            warnings.filterwarnings("ignore", category=ExperimentalWarning)
-        acc = round(metrics["accuracy"] * 100, 2)
-        ham = round(metrics["hamming"], 2)
+            od = self.output_dir / "optimize"
+            target_name = target_name.title()
-        fig.suptitle(
-            f"Performance Metrics\nAccuracy: {acc}\nHamming Loss: {ham}"
-        )
-        axs = fig.subplots(nrows=1, ncols=2)
-        plt.subplots_adjust(hspace=0.5)
-        # Line weight
-        lw = 2
-        roc_auc = metrics["roc_auc"]
-        pr_ap = metrics["precision_recall"]
-        metric_list = [roc_auc, pr_ap]
-        for metric, ax in zip(metric_list, axs):
-            if "fpr_micro" in metric:
-                prefix1 = "fpr"
-                prefix2 = "tpr"
-                lab1 = "ROC"
-                lab2 = "AUC"
-                xlab = "False Positive Rate"
-                ylab = "True Positive Rate"
-                title = "Receiver Operating Characteristic (ROC)"
-                baseline = [0, 1]
-            elif "recall_micro" in metric:
-                prefix1 = "recall"
-                prefix2 = "precision"
-                lab1 = "Precision-Recall"
-                lab2 = "AP"
-                xlab = "Recall"
-                ylab = "Precision"
-                title = "Precision-Recall"
-                baseline = [metric["baseline"], metric["baseline"]]
-                # Plot iso-f1 curves.
-                f_scores = np.linspace(0.2, 0.8, num=4)
-                for i, f_score in enumerate(f_scores):
-                    x = np.linspace(0.01, 1)
-                    y = f_score * x / (2 * x - f_score)
-                    ax.plot(
-                        x[y >= 0],
-                        y[y >= 0],
-                        color="gray",
-                        alpha=0.2,
-                        linewidth=lw,
-                        label="Iso-F1 Curves" if i == 0 else "",
-                    )
-                    ax.annotate(f"F1={f_score:0.1f}", xy=(0.9, y[45] + 0.02))
-            # Plot ROC curves.
-            ax.plot(
-                metric[f"{prefix1}_micro"],
-                metric[f"{prefix2}_micro"],
-                label=f"Micro-averaged {lab1} Curve ({lab2} = {metric['micro']:.2f})",
-                color="deeppink",
-                linestyle=":",
-                linewidth=4,
+            ax = optuna.visualization.matplotlib.plot_optimization_history(
+                study, target_name=target_name
             )
-            ax.plot(
-                metric[f"{prefix1}_macro"],
-                metric[f"{prefix2}_macro"],
-                label=f"Macro-averaged {lab1} Curve ({lab2} = {metric['macro']:.2f})",
-                color="navy",
-                linestyle=":",
-                linewidth=4,
+            ax.set_title(f"{model_name} Optimization History")
+            ax.set_xlabel("Trial")
+            ax.set_ylabel(target_name)
+            ax.legend(
+                loc="best",
+                shadow=True,
+                fancybox=True,
+                fontsize=mpl.rcParamsDefault["legend.fontsize"],
             )
-            colors = cycle(["aqua", "darkorange", "cornflowerblue"])
-            for i, color in zip(range(num_classes), colors):
-                if f"{prefix1}_{i}" in metric:
-                    ax.plot(
-                        metric[f"{prefix1}_{i}"],
-                        metric[f"{prefix2}_{i}"],
-                        color=color,
-                        lw=lw,
-                        label=f"{lab1} Curve of class {i} ({lab2} = {metric[i]:.2f})",
-                    )
-            if "fpr_micro" in metric:
-                # Make center baseline
-                ax.plot(
-                    baseline,
-                    baseline,
-                    "k--",
-                    linewidth=lw,
-                    label="No Classification Skill",
-                )
-            else:
-                ax.plot(
-                    [0, 1],
-                    baseline,
-                    "k--",
-                    linewidth=lw,
-                    label="No Classification Skill",
-                )
-            ax.set_xlim(0.0, 1.0)
-            ax.set_ylim(0.0, 1.05)
-            ax.set_xlabel(f"{xlab}")
-            ax.set_ylabel(f"{ylab}")
-            ax.set_title(f"{title}")
-            ax.legend(loc="best")
-        fig.savefig(fn, bbox_inches="tight", facecolor="white")
-        plt.close()
-        plt.clf()
-        plt.cla()
-    @staticmethod
-    def plot_search_space(
-        estimator,
-        height=2,
-        s=25,
-        features=None,
-    ):
-        """Make density and contour plots for showing search space during grid search.
-        Modified from sklearn-genetic-opt function to implement exception handling.
+            fn = od / f"optuna_optimization_history.{self.plot_format}"
-        Args:
-            estimator (sklearn estimator object): A fitted estimator from :class:`~sklearn_genetic.GASearchCV`.
-            height (float, optional): Height of each facet. Defaults to 2.
-            s (float, optional): Size of the markers in scatter plot. Defaults to 5.
-            features (list, optional): Subset of features to plot, if ``None`` it plots all the features by default. Defaults to None.
-        Returns:
-            g (seaborn.PairGrid): Pair plot of the used hyperparameters during the search.
-        """
-        sns.set_style("white")
-        df = logbook_to_pandas(estimator.logbook)
-        if features:
-            _stats = df[features]
-        else:
-            variables = [*estimator.space.parameters, "score"]
-            _stats = df[variables]
-        g = sns.PairGrid(_stats, diag_sharey=False, height=height)
+            if not fn.parent.exists():
+                fn.parent.mkdir(parents=True, exist_ok=True)
-        g = g.map_upper(sns.scatterplot, s=s, color="r", alpha=0.2)
+            plt.savefig(fn)
+            plt.close()
-        try:
-            g = g.map_lower(
-                sns.kdeplot,
-                shade=True,
-                cmap=sns.color_palette("ch:s=.25,rot=-.25", as_cmap=True),
+            ax = optuna.visualization.matplotlib.plot_edf(
+                study, target_name=target_name
             )
-        except np.linalg.LinAlgError as err:
-            if "singular matrix" in str(err).lower():
-                g = g.map_lower(sns.scatterplot, s=s, color="b", alpha=1.0)
-            else:
-                raise
-        try:
-            g = g.map_diag(
-                sns.kdeplot,
-                shade=True,
-                palette="crest",
-                alpha=0.2,
-                color="red",
+            ax.set_title(f"{model_name} Empirical Distribution Function (EDF)")
+            ax.set_xlabel(target_name)
+            ax.set_ylabel(f"{model_name} Cumulative Probability")
+            ax.legend(
+                loc="best",
+                shadow=True,
+                fancybox=True,
+                fontsize=mpl.rcParamsDefault["legend.fontsize"],
             )
-        except np.linalg.LinAlgError as err:
-            if "singular matrix" in str(err).lower():
-                g = g.map_diag(sns.histplot, color="red", alpha=1.0, kde=False)
-        return g
-    @staticmethod
-    def visualize_missingness(
-        genotype_data,
-        df,
-        zoom=True,
-        prefix="imputer",
-        horizontal_space=0.6,
-        vertical_space=0.6,
-        bar_color="gray",
-        heatmap_palette="magma",
-        plot_format="pdf",
-        dpi=300,
-    ):
-        """Make multiple plots to visualize missing data.
-        Args:
-            genotype_data (GenotypeData): Initialized GentoypeData object.
-            df (pandas.DataFrame): DataFrame with snps to visualize.
-            zoom (bool, optional): If True, zooms in to the missing proportion range on some of the plots. If False, the plot range is fixed at [0, 1]. Defaults to True.
+            plt.savefig(fn.with_stem("optuna_edf_plot"))
+            plt.close()
-            prefix (str, optional): Prefix for output directory and files. Plots and files will be written to a directory called <prefix>_reports. The report directory will be created if it does not already exist. If prefix is None, then the reports directory will not have a prefix. Defaults to 'imputer'.
+            ax = optuna.visualization.matplotlib.plot_param_importances(
+                study, target_name=target_name
+            )
+            ax.set_xlabel("Parameter Importance")
+            ax.set_ylabel("Parameter")
+            ax.legend(loc="best", shadow=True, fancybox=True)
-            horizontal_space (float, optional): Set width spacing between subplots. If your plot are overlapping horizontally, increase horizontal_space. If your plots are too far apart, decrease it. Defaults to 0.6.
+            plt.savefig(fn.with_stem("optuna_param_importances_plot"))
+            plt.close()
-            vertical_space (float, optioanl): Set height spacing between subplots. If your plots are overlapping vertically, increase vertical_space. If your plots are too far apart, decrease it. Defaults to 0.6.
+            ax = optuna.visualization.matplotlib.plot_timeline(study)
+            ax.set_title(f"{model_name} Timeline Plot")
+            ax.set_xlabel("Datetime")
+            ax.set_ylabel("Trial")
+            plt.savefig(fn.with_stem("optuna_timeline_plot"))
+            plt.close()
-            bar_color (str, optional): Color of the bars on the non-stacked barplots. Can be any color supported by matplotlib. See matplotlib.pyplot.colors documentation. Defaults to 'gray'.
+            # Reset the style from Optuna's plotting.
+            sns.set_style("white", rc=self.param_dict)
+            mpl.rcParams.update(self.param_dict)
-            heatmap_palette (str, optional): Palette to use for heatmap plot. Can be any palette supported by seaborn. See seaborn documentation. Defaults to 'magma'.
+    def plot_metrics(
+        self,
+        y_true: np.ndarray,
+        y_pred_proba: np.ndarray,
+        metrics: Dict[str, float],
+        label_names: Optional[Sequence[str]] = None,
+        prefix: str = "",
+    ) -> None:
+        """Plot multi-class ROC-AUC and Precision-Recall curves.
-            plot_format (str, optional): Format to save plots. Can be any of the following: "pdf", "png", "svg", "ps", "eps". Defaults to "pdf".
+        This method plots the multi-class ROC-AUC and Precision-Recall curves. The plot is saved to disk as a ``<plot_format>`` file.
-            dpi (int): The resolution in dots per inch. Defaults to 300.
+        Args:
+            y_true (np.ndarray): 1D array of true integer labels in [0, n_classes-1].
+            y_pred_proba (np.ndarray): (n_samples, n_classes) array of predicted probabilities.
+            metrics (Dict[str, float]): Dict of summary metrics to annotate the figure.
+            label_names (Optional[Sequence[str]]): Optional sequence of class names (length must equal n_classes).
+                If provided, legends will use these names instead of 'Class i'.
+            prefix (str): Optional prefix for the output filename.
-        Returns:
-            pandas.DataFrame: Per-locus missing data proportions.
-            pandas.DataFrame: Per-individual missing data proportions.
-            pandas.DataFrame: Per-population + per-locus missing data proportions.
-            pandas.DataFrame: Per-population missing data proportions.
-            pandas.DataFrame: Per-individual and per-population missing data proportions.
+        Raises:
+            ValueError: If model_name is not recognized (legacy guard).
         """
+        num_classes = y_pred_proba.shape[1]
-        loc, ind, poploc, poptotal, indpop = genotype_data.calc_missing(df)
-        ncol = 3
-        nrow = 1 if genotype_data.pops is None else 2
-        fig, axes = plt.subplots(nrow, ncol, figsize=(8, 11))
-        plt.subplots_adjust(wspace=horizontal_space, hspace=vertical_space)
-        fig.suptitle("Missingness Report")
-        ax = axes[0, 0]
-        ax.set_title("Per-Individual")
-        ax.barh(genotype_data.samples, ind, color=bar_color, height=1.0)
-        if not zoom:
-            ax.set_xlim([0, 1])
-        ax.set_ylabel("Sample")
-        ax.set_xlabel("Missing Prop.")
-        ax.tick_params(
-            axis="y",
-            which="both",
-            left=False,
-            right=False,
-            labelleft=False,
-        )
-        ax = axes[0, 1]
+        # Validate/normalize label names
+        if label_names is not None and len(label_names) != num_classes:
+            self.logger.warning(
+                f"plot_metrics: len(label_names)={len(label_names)} "
+                f"!= n_classes={num_classes}. Ignoring label_names."
+            )
+            label_names = None
+        if label_names is None:
+            label_names = [f"Class {i}" for i in range(num_classes)]
+        # Binarize y_true for one-vs-rest curves
+        y_true_bin = label_binarize(y_true, classes=np.arange(num_classes))
+        # Containers
+        fpr, tpr, roc_auc = {}, {}, {}
+        precision, recall, average_precision = {}, {}, {}
+        # Per-class ROC & PR
+        for i in range(num_classes):
+            fpr[i], tpr[i], _ = roc_curve(y_true_bin[:, i], y_pred_proba[:, i])
+            roc_auc[i] = auc(fpr[i], tpr[i])
+            precision[i], recall[i], _ = precision_recall_curve(
+                y_true_bin[:, i], y_pred_proba[:, i]
+            )
+            average_precision[i] = average_precision_score(
+                y_true_bin[:, i], y_pred_proba[:, i]
+            )
-        ax.set_title("Per-Locus")
-        ax.barh(
-            range(genotype_data.num_snps), loc, color=bar_color, height=1.0
+        # Micro-averages
+        fpr["micro"], tpr["micro"], _ = roc_curve(
+            y_true_bin.ravel(), y_pred_proba.ravel()
         )
-        if not zoom:
-            ax.set_xlim([0, 1])
-        ax.set_ylabel("Locus")
-        ax.set_xlabel("Missing Prop.")
-        ax.tick_params(
-            axis="y",
-            which="both",
-            left=False,
-            right=False,
-            labelleft=False,
+        roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
+        precision["micro"], recall["micro"], _ = precision_recall_curve(
+            y_true_bin.ravel(), y_pred_proba.ravel()
+        )
+        average_precision["micro"] = average_precision_score(
+            y_true_bin, y_pred_proba, average="micro"
         )
-        id_vars = ["SampleID"]
-        if poptotal is not None:
-            ax = axes[0, 2]
-            ax.set_title("Per-Population Total")
-            ax.barh(poptotal.index, poptotal, color=bar_color, height=1.0)
-            if not zoom:
-                ax.set_xlim([0, 1])
-            ax.set_xlabel("Missing Prop.")
-            ax.set_ylabel("Population")
-            ax = axes[1, 0]
-            ax.set_title("Per-Population + Per-Locus")
-            npops = len(poploc.columns)
-            vmax = None if zoom else 1.0
-            sns.heatmap(
-                poploc,
-                vmin=0.0,
-                vmax=vmax,
-                cmap=sns.color_palette(heatmap_palette, as_cmap=True),
-                yticklabels=False,
-                cbar_kws={"label": "Missing Prop."},
-                ax=ax,
-            )
-            ax.set_xlabel("Population")
-            ax.set_ylabel("Locus")
-            id_vars.append("Population")
-        melt_df = indpop.isna()
-        melt_df["SampleID"] = genotype_data.samples
-        indpop["SampleID"] = genotype_data.samples
-        if poptotal is not None:
-            melt_df["Population"] = genotype_data.pops
-            indpop["Population"] = genotype_data.pops
-        melt_df = melt_df.melt(value_name="Missing", id_vars=id_vars)
-        melt_df.sort_values(by=id_vars[::-1], inplace=True)
-        melt_df["Missing"].replace(False, "Present", inplace=True)
-        melt_df["Missing"].replace(True, "Missing", inplace=True)
+        # Macro-average ROC
+        all_fpr = np.unique(np.concatenate([fpr[i] for i in range(num_classes)]))
+        mean_tpr = np.zeros_like(all_fpr)
+        for i in range(num_classes):
+            mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
+        mean_tpr /= num_classes
+        fpr["macro"], tpr["macro"] = all_fpr, mean_tpr
+        roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
+        # Macro-average PR
+        all_recall = np.unique(np.concatenate([recall[i] for i in range(num_classes)]))
+        mean_precision = np.zeros_like(all_recall)
+        for i in range(num_classes):
+            # recall[i] increases, but precision[i] is given over decreasing thresholds
+            mean_precision += np.interp(all_recall, recall[i][::-1], precision[i][::-1])
+        mean_precision /= num_classes
+        average_precision["macro"] = average_precision_score(
+            y_true_bin, y_pred_proba, average="macro"
+        )
-        ax = axes[0, 2] if poptotal is None else axes[1, 1]
+        # Plot
+        fig, axes = plt.subplots(1, 2, figsize=(14, 6))
-        ax.set_title("Per-Individual")
-        g = sns.histplot(
-            data=melt_df,
-            y="variable",
-            hue="Missing",
-            multiple="fill",
-            ax=ax,
+        # ROC
+        axes[0].plot(
+            fpr["micro"],
+            tpr["micro"],
+            label=f"Micro-average ROC (AUC = {roc_auc['micro']:.2f})",
+            linestyle=":",
+            linewidth=4,
         )
-        ax.tick_params(
-            axis="y",
-            which="both",
-            left=False,
-            right=False,
-            labelleft=False,
+        axes[0].plot(
+            fpr["macro"],
+            tpr["macro"],
+            label=f"Macro-average ROC (AUC = {roc_auc['macro']:.2f})",
+            linestyle="--",
+            linewidth=4,
         )
-        g.get_legend().set_title(None)
-        if poptotal is not None:
-            ax = axes[1, 2]
-            ax.set_title("Per-Population")
-            g = sns.histplot(
-                data=melt_df,
-                y="Population",
-                hue="Missing",
-                multiple="fill",
-                ax=ax,
+        for i in range(num_classes):
+            axes[0].plot(
+                fpr[i], tpr[i], label=f"{label_names[i]} ROC (AUC = {roc_auc[i]:.2f})"
             )
-            g.get_legend().set_title(None)
-        fig.savefig(
-            os.path.join(
-                f"{prefix}_output", "plots", f"missingness.{plot_format}"
-            ),
-            bbox_inches="tight",
-            facecolor="white",
+        axes[0].plot([0, 1], [0, 1], linestyle="--", color="black", label="Random")
+        axes[0].set_xlabel("False Positive Rate")
+        axes[0].set_ylabel("True Positive Rate")
+        axes[0].set_title("Multi-class ROC-AUC Curve")
+        axes[0].legend(
+            loc="upper center",
+            bbox_to_anchor=(0.5, -0.15),
+            fancybox=True,
+            shadow=True,
+            ncol=2,
         )
-        plt.cla()
-        plt.clf()
-        plt.close()
-        return loc, ind, poploc, poptotal, indpop
-    @staticmethod
-    def run_and_plot_pca(
-        original_genotype_data,
-        imputer_object,
-        prefix="imputer",
-        n_components=3,
-        center=True,
-        scale=False,
-        n_axes=2,
-        point_size=15,
-        font_size=15,
-        plot_format="pdf",
-        bottom_margin=0,
-        top_margin=0,
-        left_margin=0,
-        right_margin=0,
-        width=1088,
-        height=700,
-    ):
-        """Runs PCA and makes scatterplot with colors showing missingness.
-        Genotypes are plotted as separate shapes per population and colored according to missingness per individual.
-        This function is run at the end of each imputation method, but can be run independently to change plot and PCA parameters such as ``n_axes=3`` or ``scale=True``.
-        The imputed and original GenotypeData objects need to be passed to the function as positional arguments.
-        PCA (principal component analysis) scatterplot can have either two or three axes, set with the n_axes parameter.
-        The plot is saved as both an interactive HTML file and as a static image. Each population is represented by point shapes. The interactive plot has associated metadata when hovering over the points.
-        Files are saved to a reports directory as <prefix>_output/imputed_pca.<plot_format|html>. Supported image formats include: "pdf", "svg", "png", and "jpeg" (or "jpg").
-        Args:
-            original_genotype_data (GenotypeData): Original GenotypeData object that was input into the imputer.
-            imputer_object (Any imputer instance): Imputer object created when imputing. Can be any of the imputers, such as: ``ImputePhylo()``, ``ImputeUBP()``, and ``ImputeRandomForest()``.
-            original_012 (pandas.DataFrame, numpy.ndarray, or List[List[int]], optional): Original 012-encoded genotypes (before imputing). Missing values are encoded as -9. This object can be obtained as ``df = GenotypeData.genotypes012_df``.
-            prefix (str, optional): Prefix for report directory. Plots will be save to a directory called <prefix>_output/imputed_pca<html|plot_format>. Report directory will be created if it does not already exist. Defaults to "imputer".
-            n_components (int, optional): Number of principal components to include in the PCA. Defaults to 3.
-            center (bool, optional): If True, centers the genotypes to the mean before doing the PCA. If False, no centering is done. Defaults to True.
-            scale (bool, optional): If True, scales the genotypes to unit variance before doing the PCA. If False, no scaling is done. Defaults to False.
-            n_axes (int, optional): Number of principal component axes to plot. Must be set to either 2 or 3. If set to 3, a 3-dimensional plot will be made. Defaults to 2.
-            point_size (int, optional): Point size for scatterplot points. Defaults to 15.
-            plot_format (str, optional): Plot file format to use. Supported formats include: "pdf", "svg", "png", and "jpeg" (or "jpg"). An interactive HTML file is also created regardless of this setting. Defaults to "pdf".
-            bottom_margin (int, optional): Adjust bottom margin. If whitespace cuts off some of your plot, lower the corresponding margins. The default corresponds to that of plotly update_layout(). Defaults to 0.
-            top (int, optional): Adjust top margin. If whitespace cuts off some of your plot, lower the corresponding margins. The default corresponds to that of plotly update_layout(). Defaults to 0.
-            left_margin (int, optional): Adjust left margin. If whitespace cuts off some of your plot, lower the corresponding margins. The default corresponds to that of plotly update_layout(). Defaults to 0.
-            right_margin (int, optional): Adjust right margin. If whitespace cuts off some of your plot, lower the corresponding margins. The default corresponds to that of plotly update_layout(). Defaults to 0.
-            width (int, optional): Width of plot space. If your plot is cut off at the edges, even after adjusting the margins, increase the width and height. Try to keep the aspect ratio similar. Defaults to 1088.
-            height (int, optional): Height of plot space. If your plot is cut off at the edges, even after adjusting the margins, increase the width and height. Try to keep the aspect ratio similar. Defaults to 700.
-        Returns:
-            numpy.ndarray: PCA data as a numpy array with shape (n_samples, n_components).
-            sklearn.decomposision.PCA: Scikit-learn PCA object from sklearn.decomposision.PCA. Any of the sklearn.decomposition.PCA attributes can be accessed from this object. See sklearn documentation.
-        Examples:
-            >>>data = GenotypeData(
-            >>>    filename="snps.str",
-            >>>    filetype="structure2row",
-            >>>    popmapfile="popmap.txt",
-            >>>)
-            >>>
-            >>>ubp = ImputeUBP(genotype_data=data)
-            >>>
-            >>>components, pca = run_and_plot_pca(
-            >>>    data,
-            >>>    ubp,
-            >>>    scale=True,
-            >>>    center=True,
-            >>>    plot_format="png"
-            >>>)
-            >>>
-            >>>explvar = pca.explained_variance_ratio\_
-        """
-        report_path = os.path.join(f"{prefix}_output", "plots")
-        Path(report_path).mkdir(parents=True, exist_ok=True)
-        if n_axes > 3:
-            raise ValueError(
-                ">3 axes is not supported; n_axes must be either 2 or 3."
-            )
-        if n_axes < 2:
-            raise ValueError(
-                "<2 axes is not supported; n_axes must be either 2 or 3."
-            )
-        imputer = imputer_object.imputed
-        df = misc.validate_input_type(
-            imputer.genotypes012_df, return_type="df"
+        # PR
+        axes[1].plot(
+            recall["micro"],
+            precision["micro"],
+            label=f"Micro-average PR (AP = {average_precision['micro']:.2f})",
+            linestyle=":",
+            linewidth=4,
         )
-        original_df = misc.validate_input_type(
-            original_genotype_data.genotypes_012(fmt="pandas"),
-            return_type="df",
+        axes[1].plot(
+            all_recall,
+            mean_precision,
+            label=f"Macro-average PR (AP = {average_precision['macro']:.2f})",
+            linestyle="--",
+            linewidth=4,
         )
-        original_df.replace(-9, np.nan, inplace=True)
-        if center or scale:
-            # Center data to mean. Scaling to unit variance is off.
-            scaler = StandardScaler(with_mean=center, with_std=scale)
-            pca_df = scaler.fit_transform(df)
-        else:
-            pca_df = df.copy()
-        # Run PCA.
-        model = PCA(n_components=n_components)
-        components = model.fit_transform(pca_df)
-        df_pca = pd.DataFrame(
-            components[:, [0, 1, 2]], columns=["Axis1", "Axis2", "Axis3"]
+        for i in range(num_classes):
+            axes[1].plot(
+                recall[i],
+                precision[i],
+                label=f"{label_names[i]} PR (AP = {average_precision[i]:.2f})",
+            )
+        axes[1].plot([0, 1], [1, 0], linestyle="--", color="black", label="Random")
+        axes[1].set_xlabel("Recall")
+        axes[1].set_ylabel("Precision")
+        axes[1].set_title("Multi-class Precision-Recall Curve")
+        axes[1].legend(
+            loc="upper center",
+            bbox_to_anchor=(0.5, -0.15),
+            fancybox=True,
+            shadow=True,
+            ncol=2,
         )
-        df_pca["SampleID"] = original_genotype_data.samples
-        df_pca["Population"] = original_genotype_data.pops
-        df_pca["Size"] = point_size
-        _, ind, _, _, _ = imputer.calc_missing(original_df, use_pops=False)
-        df_pca["missPerc"] = ind
-        my_scale = [("rgb(19, 43, 67)"), ("rgb(86,177,247)")]  # ggplot default
-        z = "Axis3" if n_axes == 3 else None
-        labs = {
-            "Axis1": f"PC1 ({round(model.explained_variance_ratio_[0] * 100, 2)}%)",
-            "Axis2": f"PC2 ({round(model.explained_variance_ratio_[1] * 100, 2)}%)",
-            "missPerc": "Missing Prop.",
-            "Population": "Population",
-        }
+        # Title & save
+        fig.suptitle("\n".join([f"{k}: {v:.2f}" for k, v in metrics.items()]), y=1.35)
-        if z is not None:
-            labs[
-                "Axis3"
-            ] = f"PC3 ({round(model.explained_variance_ratio_[2] * 100, 2)}%)"
-            fig = px.scatter_3d(
-                df_pca,
-                x="Axis1",
-                y="Axis2",
-                z="Axis3",
-                color="missPerc",
-                symbol="Population",
-                color_continuous_scale=my_scale,
-                custom_data=["Axis3", "SampleID", "Population", "missPerc"],
-                size="Size",
-                size_max=point_size,
-                labels=labs,
-            )
-        else:
-            fig = px.scatter(
-                df_pca,
-                x="Axis1",
-                y="Axis2",
-                color="missPerc",
-                symbol="Population",
-                color_continuous_scale=my_scale,
-                custom_data=["Axis3", "SampleID", "Population", "missPerc"],
-                size="Size",
-                size_max=point_size,
-                labels=labs,
-            )
-        fig.update_traces(
-            hovertemplate="<br>".join(
-                [
-                    "Axis 1: %{x}",
-                    "Axis 2: %{y}",
-                    "Axis 3: %{customdata[0]}",
-                    "Sample ID: %{customdata[1]}",
-                    "Population: %{customdata[2]}",
-                    "Missing Prop.: %{customdata[3]}",
-                ]
-            ),
-        )
-        fig.update_layout(
-            showlegend=True,
-            margin=dict(
-                b=bottom_margin,
-                t=top_margin,
-                l=left_margin,
-                r=right_margin,
-            ),
-            width=width,
-            height=height,
-            legend_orientation="h",
-            legend_title="Population",
-            legend_title_font=dict(size=font_size),
-            legend_title_side="top",
-            font=dict(size=font_size),
-        )
-        fig.write_html(os.path.join(report_path, "imputed_pca.html"))
-        fig.write_image(
-            os.path.join(report_path, f"imputed_pca.{plot_format}"),
-        )
+        if prefix != "":
+            prefix = f"{prefix}_"
-        return components, model
+        out_name = f"{self.model_name}_{prefix}roc_pr_curves.{self.plot_format}"
+        fig.savefig(self.output_dir / out_name, bbox_inches="tight")
+        if self.show_plots:
+            plt.show()
+        plt.close(fig)
-    @staticmethod
-    def plot_history(lod, nn_method, prefix="imputer"):
+    def plot_history(self, history: Dict[str, List[float]]) -> None:
         """Plot model history traces. Will be saved to file.
+        This method plots the deep learning model history traces. The plot is saved to disk as a ``<plot_format>`` file.
         Args:
-            lod (List[tf.keras.callbacks.History]): List of history objects.
-            nn_method (str): Neural network method to plot. Possible options include: 'NLPCA', 'UBP', or 'VAE'. NLPCA and VAE get plotted the same, but UBP does it differently due to its three phases.
-            prefix (str, optional): Prefix to use for output directory. Defaults to 'imputer'.
+            history (Dict[str, List[float]]): Dictionary with lists of history objects. Keys should be "Train" and "Validation".
         Raises:
-            ValueError: nn_method must be either 'NLPCA', 'UBP', or 'VAE'.
+            ValueError: nn_method must be either 'ImputeNLPCA', 'ImputeUBP', 'ImputeAutoencoder', 'ImputeVAE'.
         """
-        if nn_method == "NLPCA" or nn_method == "VAE" or nn_method == "SAE":
-            title = nn_method
-            fn = os.path.join(
-                f"{prefix}_output",
-                "plots",
-                "Unsupervised",
-                nn_method,
-                "histplot.pdf",
-            )
-            # if nn_method == "VAE":
-            fig, axes = plt.subplots(1, 2)
-            ax1 = axes[0]
-            ax2 = axes[1]
-            fig.suptitle(title)
-            fig.tight_layout(h_pad=3.0, w_pad=3.0)
-            history = lod[0]
-            acctrain = "binary_accuracy"
-            accval = "val_binary_accuracy"
-            lossval = "val_loss"
+        if self.model_name not in {
+            "ImputeNLPCA",
+            "ImputeVAE",
+            "ImputeAutoencoder",
+            "ImputeUBP",
+        }:
+            msg = "nn_method must be either 'ImputeNLPCA', 'ImputeVAE', 'ImputeAutoencoder', 'ImputeUBP'."
+            self.logger.error(msg)
+            raise ValueError(msg)
+        if self.model_name != "ImputeUBP":
+            fig, ax = plt.subplots(1, 1, figsize=(12, 8))
+            df = pd.DataFrame(history)
+            df = df.iloc[1:]
             # Plot train accuracy
-            ax1.plot(history[acctrain])
-            ax1.set_title("Model Accuracy")
-            ax1.set_ylabel("Accuracy")
-            ax1.set_xlabel("Epoch")
-            ax1.set_ylim(bottom=0.0, top=1.0)
-            ax1.set_yticks([0.0, 0.25, 0.5, 0.75, 1.0])
-            labels = ["Train"]
-            # Plot validation accuracy
-            ax1.plot(history[accval])
-            labels.append("Validation")
-            ax1.legend(labels, loc="best")
+            ax.plot(df["Train"], c="blue", lw=3)
-            # else:
-            ax2.plot(history["loss"])
-            ax2.plot(history[lossval])
+            ax.set_title(f"{self.model_name} Loss per Epoch")
+            ax.set_ylabel("Loss")
+            ax.set_xlabel("Epoch")
+            ax.legend(["Train"], loc="best", shadow=True, fancybox=True)
-            ax2.set_title("Total Loss")
-            ax2.set_ylabel("Loss")
-            ax2.set_xlabel("Epoch")
-            ax2.legend(labels, loc="best")
+        else:
+            fig, ax = plt.subplots(3, 1, figsize=(12, 8))
-            fig.savefig(fn, bbox_inches="tight", facecolor="white")
+            for i, phase in enumerate(range(1, 4)):
+                train = pd.Series(history["Train"][f"Phase {phase}"])
+                train = train.iloc[1:]  # ignore first epoch
-            plt.close()
-            plt.clf()
-        elif nn_method == "UBP":
-            fig = plt.figure(figsize=(12, 16))
-            fig.suptitle(nn_method)
-            fig.tight_layout(h_pad=2.0, w_pad=2.0)
-            fn = os.path.join(
-                f"{prefix}_output",
-                "plots",
-                "Unsupervised",
-                nn_method,
-                "histplot.pdf",
-            )
+                # Plot train accuracy
+                ax[i].plot(train, c="blue", lw=3)
+                ax[i].set_title(f"{self.model_name}: Phase {phase} Loss per Epoch")
+                ax[i].set_ylabel("Loss")
+                ax[i].set_xlabel("Epoch")
+                ax[i].legend([f"Phase {phase}"], loc="best", shadow=True, fancybox=True)
-            idx = 1
-            for i, history in enumerate(lod, start=1):
-                plt.subplot(3, 2, idx)
-                title = f"Phase {i}"
+        fn = f"{self.model_name.lower()}_history_plot.{self.plot_format}"
+        fn = self.output_dir / fn
+        fig.savefig(fn)
-                # Plot model accuracy
-                ax = plt.gca()
-                ax.plot(history["binary_accuracy"])
-                ax.set_title(f"{title} Accuracy")
-                ax.set_ylabel("Accuracy")
-                ax.set_xlabel("Epoch")
-                ax.set_yticks([0.0, 0.25, 0.5, 0.75, 1.0])
+        if self.show_plots:
+            plt.show()
+        plt.close(fig)
-                # Plot validation accuracy
-                ax.plot(history["val_binary_accuracy"])
-                ax.legend(["Train", "Validation"], loc="best")
+    def plot_confusion_matrix(
+        self,
+        y_true_1d: np.ndarray,
+        y_pred_1d: np.ndarray,
+        label_names: Sequence[str] | None = None,
+        prefix: str = "",
+    ) -> None:
+        """Plot a confusion matrix with optional class labels.
-                # Plot model loss
-                plt.subplot(3, 2, idx + 1)
-                ax = plt.gca()
-                ax.plot(history["loss"])
-                ax.set_title(f"{title} Loss")
-                ax.set_ylabel("Loss (MSE)")
-                ax.set_xlabel("Epoch")
+        This method plots a confusion matrix using true and predicted labels. The plot is saved to disk as a ``<plot_format>`` file.
-                # Plot validation loss
-                ax.plot(history["val_loss"])
-                ax.legend(["Train", "Validation"], loc="best")
+        Args:
+            y_true_1d (np.ndarray): 1D array of true integer labels in [0, n_classes-1].
+            y_pred_1d (np.ndarray): 1D array of predicted integer labels in [0, n_classes-1].
+            label_names (Sequence[str] | None): Optional sequence of class names (length must equal n_classes). If provided, both the internal label order and displayed tick labels will respect this order (assumed to be 0..n-1).
+            prefix (str): Optional prefix for the output filename.
-                idx += 2
+        Notes:
+            - If `label_names` is None, the display labels default to the numeric class indices inferred from `y_true_1d ∪ y_pred_1d`.
+        """
+        y_true_1d = misc.validate_input_type(y_true_1d, return_type="array")
+        y_pred_1d = misc.validate_input_type(y_pred_1d, return_type="array")
-            plt.savefig(fn, bbox_inches="tight", facecolor="white")
+        if y_true_1d.ndim > 1:
+            y_true_1d = y_true_1d.flatten()
-            plt.close()
-            plt.clf()
+        if y_pred_1d.ndim > 1:
+            y_pred_1d = y_pred_1d.flatten()
+        # Determine class count/order
+        if label_names is not None:
+            n_classes = len(label_names)
+            labels = np.arange(n_classes)  # our y_* are ints 0..n-1
+            display_labels = list(map(str, label_names))
         else:
-            raise ValueError(
-                f"nn_method must be either 'NLPCA', 'UBP', or 'VAE', but got {nn_method}"
-            )
-    @staticmethod
-    def plot_certainty_heatmap(
-        y_certainty, sample_ids=None, nn_method="VAE", prefix="imputer"
-    ):
-        fig = plt.figure()
-        hm = sns.heatmap(
-            data=y_certainty,
-            cmap="viridis",
-            vmin=0.0,
-            vmax=1.0,
-            cbar_kws={"label": "Prob."},
-        )
-        hm.set_xlabel("Site")
-        hm.set_ylabel("Sample")
-        hm.set_title("Probabilities of Uncertain Sites")
-        fig.tight_layout()
-        fig.savefig(
-            os.path.join(
-                f"{prefix}_output",
-                "plots",
-                "Unsupervised",
-                nn_method,
-                "uncertainty_plot.png",
-            ),
-            bbox_inches="tight",
-            facecolor="white",
-        )
+            # Infer labels from data to keep matrix tight
+            labels = np.unique(np.concatenate([y_true_1d, y_pred_1d]))
+            display_labels = labels  # sklearn will convert to strings
-    @staticmethod
-    def plot_confusion_matrix(
-        y_true_1d, y_pred_1d, nn_method, prefix="imputer"
-    ):
         fig, ax = plt.subplots(1, 1, figsize=(15, 15))
-        ConfusionMatrixDisplay.from_predictions(
-            y_true=y_true_1d, y_pred=y_pred_1d, ax=ax
-        )
-        outfile = os.path.join(
-            f"{prefix}_output",
-            "plots",
-            "Unsupervised",
-            nn_method,
-            f"confusion_matrix_{nn_method}.png",
+        ConfusionMatrixDisplay.from_predictions(
+            y_true=y_true_1d,
+            y_pred=y_pred_1d,
+            labels=labels,
+            display_labels=display_labels,
+            ax=ax,
+            cmap="viridis",
+            colorbar=True,
         )
-        if os.path.isfile(outfile):
-            os.remove(outfile)
+        if prefix != "":
+            prefix = f"{prefix}_"
-        fig.savefig(outfile, facecolor="white")
+        out_name = (
+            f"{self.model_name.lower()}_{prefix}confusion_matrix.{self.plot_format}"
+        )
+        fig.savefig(self.output_dir / out_name, bbox_inches="tight")
+        if self.show_plots:
+            plt.show()
+        plt.close(fig)
-    @staticmethod
-    def plot_gt_distribution(df, plot_path):
-        df = misc.validate_input_type(df, return_type="df")
-        df_melt = pd.melt(df, value_name="Count")
-        cnts = df_melt["Count"].value_counts()
-        cnts.index.names = ["Genotype"]
-        cnts = pd.DataFrame(cnts).reset_index()
-        cnts.sort_values(by="Genotype", inplace=True)
-        cnts["Genotype"] = cnts["Genotype"].astype(str)
+    def plot_gt_distribution(
+        self,
+        X: np.ndarray | pd.DataFrame | list | torch.Tensor,
+        is_imputed: bool = False,
+    ) -> None:
+        """Plot genotype distribution (IUPAC or integer-encoded).
-        fig, ax = plt.subplots(1, 1, figsize=(15, 15))
-        g = sns.barplot(x="Genotype", y="Count", data=cnts, ax=ax)
-        g.set_xlabel("Integer-encoded Genotype")
-        g.set_ylabel("Count")
-        g.set_title("Genotype Counts")
-        for p in g.patches:
-            g.annotate(
-                f"{p.get_height():.1f}",
-                (p.get_x() + 0.25, p.get_height() + 0.01),
-                xytext=(0, 1),
-                textcoords="offset points",
-                va="bottom",
-            )
+        This plots counts for all genotypes present in X. It supports IUPAC single-letter genotypes and integer encodings. Missing markers '-', '.', '?' are normalized to 'N'. Bars are annotated with counts and percentages.
-        fig.savefig(
-            os.path.join(plot_path, "genotype_distributions.png"),
-            bbox_inches="tight",
-            facecolor="white",
+        Args:
+            X (np.ndarray | pd.DataFrame | list | torch.Tensor): Array-like genotype matrix. Rows=loci, cols=samples (any orientation is OK). Elements are IUPAC one-letter genotypes (e.g., 'A','C','G','T','N','R',...) or integers (e.g., 0/1/2[/3]).
+            is_imputed (bool): Whether these genotypes are imputed. Affects the title only. Defaults to False.
+        """
+        # Flatten X to a 1D Series
+        if isinstance(X, pd.DataFrame):
+            arr = X.values
+        elif torch.is_tensor(X):
+            arr = X.detach().cpu().numpy()
+        else:
+            arr = np.asarray(X)
+        s = pd.Series(arr.ravel())
+        # Detect string vs numeric encodings and normalize
+        if s.dtype.kind in ("O", "U", "S"):  # string-like → IUPAC path
+            s = s.astype(str).str.upper().replace({"-": "N", ".": "N", "?": "N"})
+            x_label = "Genotype (IUPAC)"
+            # Define canonical order: N, A/C/T/G, then IUPAC ambiguity codes.
+            canonical = ["A", "C", "T", "G"]
+            iupac_ambiguity = sorted(["M", "R", "W", "S", "Y", "K", "V", "H", "D", "B"])
+            base_order = ["N"] + canonical + iupac_ambiguity
+        else:  # numeric path (e.g., 0/1/2/[3], -1 for missing)
+            # Map common missing sentinels to 'N', keep others as strings for
+            # labeling
+            s = s.astype(float)  # allow NaN comparisons
+            s = s.where(~np.isin(s, [-1, np.nan]), other=np.nan)
+            s = s.fillna("N").astype(int, errors="ignore").astype(str)
+            x_label = "Genotype (Integer-encoded)"
+            # Support both ternary and quaternary encodings; keep a stable order
+            base_order = ["N", "0", "1", "2", "3"]
+        # Include any unexpected symbols at the end (sorted) so nothing is
+        # dropped
+        extras = sorted(set(s.unique()) - set(base_order))
+        full_order = base_order + [e for e in extras if e not in base_order]
+        # Count and reindex to show zero-count categories
+        counts = s.value_counts().reindex(full_order, fill_value=0)
+        df = counts.rename_axis("Genotype").reset_index(name="Count")
+        df["Percent"] = df["Count"] / df["Count"].sum() * 100
+        title = "Imputed Genotype Counts" if is_imputed else "Genotype Counts"
+        # --- Plot ---
+        fig, ax = plt.subplots(figsize=(8, 5))
+        sns.despine(fig=fig)
+        ax = sns.barplot(
+            data=df,
+            x="Genotype",
+            y="Percent",
+            hue="Genotype",
+            order=full_order,
+            errorbar=None,
+            ax=ax,
+            palette="Set1",
+            legend=False,
+            fill=True,
         )
-        plt.close()
-    @staticmethod
-    def plot_label_clusters(z_mean, labels, prefix="imputer"):
-        """Display a 2D plot of the classes in the latent space."""
-        fig, ax = plt.subplots(1, 1, figsize=(15, 15))
-        sns.scatterplot(x=z_mean[:, 0], y=z_mean[:, 1], ax=ax)
-        ax.set_xlabel("Latent Dimension 1")
-        ax.set_ylabel("Latent Dimension 2")
+        ax.set_xlabel(x_label)
+        ax.set_ylabel("Percent")
+        ax.set_title(title)
+        ax.set_ylim([0, 50])
-        outfile = os.path.join(
-            f"{prefix}_output",
-            "plots",
-            "Unsupervised",
-            "VAE",
-            "label_clusters.png",
-        )
+        fig.tight_layout()
-        if os.path.isfile(outfile):
-            os.remove(outfile)
+        suffix = "imputed" if is_imputed else "original"
+        fn = self.output_dir / f"gt_distributions_{suffix}.{self.plot_format}"
+        fig.savefig(fn, dpi=300)
-        fig.savefig(outfile, facecolor="white", bbox_inches="tight")
+        if self.show_plots:
+            plt.show()
+        plt.close(fig)

pg-sui 1.0.2.1__py3-none-any.whl → 1.6.8__py3-none-any.whl

Potentially problematic release.

pg-sui 1.0.2.1py3-none-any.whl → 1.6.8py3-none-any.whl