pg-sui 1.6.16a3__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/METADATA +26 -30
  2. {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/RECORD +29 -33
  3. pgsui/__init__.py +0 -8
  4. pgsui/_version.py +2 -2
  5. pgsui/cli.py +577 -125
  6. pgsui/data_processing/config.py +1 -2
  7. pgsui/data_processing/containers.py +203 -530
  8. pgsui/data_processing/transformers.py +44 -20
  9. pgsui/impute/deterministic/imputers/mode.py +475 -182
  10. pgsui/impute/deterministic/imputers/ref_allele.py +454 -147
  11. pgsui/impute/supervised/imputers/hist_gradient_boosting.py +4 -3
  12. pgsui/impute/supervised/imputers/random_forest.py +3 -2
  13. pgsui/impute/unsupervised/base.py +1269 -534
  14. pgsui/impute/unsupervised/callbacks.py +28 -33
  15. pgsui/impute/unsupervised/imputers/autoencoder.py +870 -841
  16. pgsui/impute/unsupervised/imputers/vae.py +931 -787
  17. pgsui/impute/unsupervised/loss_functions.py +156 -202
  18. pgsui/impute/unsupervised/models/autoencoder_model.py +7 -49
  19. pgsui/impute/unsupervised/models/vae_model.py +40 -221
  20. pgsui/impute/unsupervised/nn_scorers.py +53 -13
  21. pgsui/utils/classification_viz.py +240 -97
  22. pgsui/utils/misc.py +201 -3
  23. pgsui/utils/plotting.py +73 -58
  24. pgsui/utils/pretty_metrics.py +2 -6
  25. pgsui/utils/scorers.py +39 -0
  26. pgsui/impute/unsupervised/imputers/nlpca.py +0 -1666
  27. pgsui/impute/unsupervised/imputers/ubp.py +0 -1660
  28. pgsui/impute/unsupervised/models/nlpca_model.py +0 -206
  29. pgsui/impute/unsupervised/models/ubp_model.py +0 -200
  30. {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/WHEEL +0 -0
  31. {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/entry_points.txt +0 -0
  32. {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/licenses/LICENSE +0 -0
  33. {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/top_level.txt +0 -0
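The diff below covers pgsui/impute/deterministic/imputers/mode.py (file 9 above); note that the NLPCA and UBP imputers and their models (files 26-29) are removed entirely in 1.7.0. For orientation, the config-driven API implied by the diff looks roughly like the following sketch. This is illustrative only: VCFReader is snpio's documented reader, but the ImputeMostFrequent call signature here is an assumption inferred from the docstrings in the diff, not verified against the released wheel.

# Hypothetical usage sketch inferred from the diff below; the
# ImputeMostFrequent signature is an assumption, not verified API.
from snpio import VCFReader

from pgsui.impute.deterministic.imputers.mode import ImputeMostFrequent

gd = VCFReader(filename="example.vcf.gz", popmapfile="example.popmap")
imputer = ImputeMostFrequent(
    gd,  # GenotypeData; assumed to be the first positional argument
    overrides={"algo.by_populations": True, "split.test_size": 0.3},
    simulate_missing=True,
    sim_strategy="random",
    sim_prop=0.2,
)
imputer.fit()
X_iupac = imputer.transform()  # 2D array of IUPAC strings (see decode_012)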
pgsui/impute/deterministic/imputers/mode.py
@@ -1,7 +1,8 @@
  # Standard library imports
+ import copy
  import json
  from pathlib import Path
- from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union
+ from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union
 
  # Third-party imports
  import matplotlib.pyplot as plt
@@ -11,14 +12,14 @@ from matplotlib.figure import Figure
  from plotly.graph_objs import Figure as PlotlyFigure
  from sklearn.exceptions import NotFittedError
  from sklearn.metrics import (
- accuracy_score,
+ average_precision_score,
  classification_report,
- f1_score,
- precision_score,
- recall_score,
+ jaccard_score,
+ matthews_corrcoef,
  )
  from snpio import GenotypeEncoder
  from snpio.utils.logging import LoggerManager
+ from snpio.utils.misc import validate_input_type
 
  from pgsui.data_processing.config import apply_dot_overrides, load_yaml_to_dataclass
  from pgsui.data_processing.containers import MostFrequentConfig
@@ -54,6 +55,7 @@ def ensure_mostfrequent_config(
  if isinstance(config, str):
  return load_yaml_to_dataclass(config, MostFrequentConfig)
  if isinstance(config, dict):
+ config = copy.deepcopy(config) # copy
  base = MostFrequentConfig()
  # honor optional top-level 'preset'
  preset = config.pop("preset", None)
@@ -77,9 +79,9 @@ def ensure_mostfrequent_config(
 
 
  class ImputeMostFrequent:
- """Most-frequent (mode) imputer that mirrors DL evaluation on 0/1/2.
+ """Most-frequent (mode) deterministic imputer for 0/1/2 genotypes.
 
- This imputer computes the most frequent genotype (mode) for each locus based on the training set and uses it to fill in missing values. It supports both global modes and population-specific modes if population data is provided. The imputer follows an evaluation protocol similar to deep learning models, including splitting the data into training and testing sets, masking observed cells in the test set for evaluation, and producing detailed classification reports and plots. It handles both diploid and haploid data, with special considerations for haploid scenarios. The imputer is designed to work seamlessly with genotype data encoded in 0/1/2 format, where -1 indicates missing values.
+ Computes the per-locus mode (globally or per population) from the training set and uses it to fill missing values. The evaluation protocol mirrors the DL imputers: train/test split with evaluation on either all observed test cells or a simulated-missing subset (depending on config), plus classification reports and plots. It handles both diploid and haploid data. Input genotypes are expected in 0/1/2 encoding with missing values represented by any negative integer. Output is returned as IUPAC strings via ``decode_012``.
  """
 
  def __init__(
@@ -109,14 +111,14 @@ class ImputeMostFrequent:
  tree_parser (TreeParser | None): Optional SNPio phylogenetic tree parser for nonrandom sim_strategy modes.
  config (MostFrequentConfig | dict | str | None): Configuration as a dataclass,
  nested dict, or YAML path. If None, defaults are used.
- overrides (dict | None): Flat dot-key overrides applied last with highest precedence, e.g. {'algo.by_populations': True, 'split.test_size': 0.3}.
+ overrides (Optional[dict]): Flat dot-key overrides applied last with highest precedence, e.g. {'algo.by_populations': True, 'split.test_size': 0.3}.
  simulate_missing (bool): Whether to simulate missing data if enabled in config. Defaults to True.
- sim_strategy (Literal): Strategy for simulating missing data if enabled in config.
- sim_prop (float): Proportion of data to simulate as missing if enabled in config.
+ sim_strategy (Literal["random", "random_weighted", "random_weighted_inv", "nonrandom", "nonrandom_weighted"]): Strategy for simulating missing data if enabled in config.
+ sim_prop (float): Proportion of data to simulate as missing if enabled in config. Default is 0.2.
  sim_kwargs (Optional[dict]): Additional keyword arguments for the simulated missing data transformer.
 
  Notes:
- - This mirrors other config-driven models (AE/VAE/NLPCA/UBP).
+ - This mirrors other config-driven models (AE/VAE).
  - Evaluation split behavior uses cfg.split; plotting uses cfg.plot.
  - I/O/logging seeds and verbosity use cfg.io.
  """
@@ -151,18 +153,19 @@ class ImputeMostFrequent:
  self.rng = np.random.default_rng(cfg.io.seed)
  self.encoder = GenotypeEncoder(self.genotype_data)
 
- # Work in 0/1/2 with -1 for missing (parity with DL modules)
- X012 = self.encoder.genotypes_012.astype(np.int16, copy=False)
+ self.missing_internal = -1
 
- # 2. In-place replacement of NaNs
- # NOTE: X012 will be consumed to make ground_truth_
- np.nan_to_num(X012, nan=-1.0, copy=False)
+ # include common missing value aliases
+ self.missing_aliases = {int(cfg.algo.missing), -9, -1}
 
- X012[X012 < 0] = -1
- self.X012_ = X012
- self.num_features_ = X012.shape[1]
+ X = np.asarray(self.encoder.genotypes_012)
+ Xf = X.astype(np.float32, copy=False)
+ Xf = np.where(np.isnan(Xf), -1.0, Xf)
+ Xf[Xf < 0] = -1.0
+ self.X012_ = Xf.astype(np.int8, copy=False)
+ self.num_features_ = self.X012_.shape[1]
 
- # Simulated-missing controls (mirror VAE/AE/NLPCA semantics where possible)
+ # Simulated-missing controls (mirror VAE/AE semantics where possible)
  sim_cfg = getattr(self.cfg, "sim", None)
  sim_cfg_kwargs = dict(getattr(sim_cfg, "sim_kwargs", {}) or {})
@@ -226,12 +229,11 @@ class ImputeMostFrequent:
  self.test_idx_: Optional[np.ndarray] = None
  self.X_train_df_: Optional[pd.DataFrame] = None
  self.ground_truth012_: Optional[np.ndarray] = None
- self.metrics_: Dict[str, int | float] = {}
  self.X_imputed012_: Optional[np.ndarray] = None
 
  # Ploidy heuristic for 0/1/2 scoring parity
- uniq = np.unique(self.X012_[self.X012_ != -1])
- self.is_haploid_ = np.array_equal(np.sort(uniq), np.array([0, 2]))
+ self.ploidy = self.cfg.io.ploidy
+ self.is_haploid_ = self.ploidy == 1
 
  # Plotting (use config, not genotype_data fields)
  self.plot_format = cfg.plot.fmt
@@ -243,6 +245,11 @@ class ImputeMostFrequent:
  self.model_name = (
  "ImputeMostFrequentPerPop" if self.by_populations else "ImputeMostFrequent"
  )
+
+ # Output dirs
+ dirs = ["models", "plots", "metrics", "optimize", "parameters"]
+ self._create_model_directories(self.prefix, dirs)
+
  self.plotter_ = Plotting(
  self.model_name,
  prefix=self.prefix,
@@ -258,10 +265,6 @@ class ImputeMostFrequent:
  multiqc_section=f"PG-SUI: {self.model_name} Model Imputation",
  )
 
- # Output dirs
- dirs = ["models", "plots", "metrics", "optimize", "parameters"]
- self._create_model_directories(self.prefix, dirs)
-
  if self.tree_parser is None and self.sim_strategy.startswith("nonrandom"):
  msg = "tree_parser is required for nonrandom and nonrandom_weighted simulated missing strategies."
  self.logger.error(msg)
@@ -280,14 +283,21 @@ class ImputeMostFrequent:
 
  # Work in DataFrame with NaN as missing for mode computation
  df_all = pd.DataFrame(self.ground_truth012_, dtype=np.float32)
- df_all = df_all.replace(self.missing, np.nan)
- df_all = df_all.replace(-9, np.nan) # Just in case
+ df_all[df_all < 0] = np.nan
 
  # Modes from TRAIN rows only (per-locus)
  df_train = df_all.iloc[self.train_idx_].copy()
- self.global_modes_ = {
- col: self._series_mode(df_train[col]) for col in df_train.columns
- }
+
+ modes = {}
+ for col in df_train.columns:
+ s = df_train[col].dropna()
+ if s.empty:
+ modes[col] = self.default
+ else:
+ vc = s.value_counts()
+ # deterministic tie-break: smallest genotype among ties
+ modes[col] = int(vc.index[vc.to_numpy() == vc.to_numpy().max()].min())
+ self.global_modes_ = modes
 
  self.group_modes_.clear()
  if self.by_populations:
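The hunk above replaces the old `_series_mode`-based dict comprehension with an inline loop that makes the tie-break explicit: among equally frequent genotypes, the smallest 0/1/2 code wins. A standalone illustration of that rule (my example, not package code):

# Deterministic tie-break sketch: 0 and 2 tie at two counts each, so 0 wins.
import pandas as pd

s = pd.Series([0.0, 0.0, 2.0, 2.0, 1.0])
vc = s.value_counts()
mode = int(vc.index[vc.to_numpy() == vc.to_numpy().max()].min())
assert mode == 0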
@@ -304,13 +314,14 @@ class ImputeMostFrequent:
  self.logger.error(msg)
  raise ValueError(msg)
 
- # ------------------------------
  # Simulated-missing mask (global → test-only)
- # ------------------------------
  obs_mask = df_all.notna().to_numpy() # observed = not NaN
- n_samples, n_loci = obs_mask.shape
+ n_samples = obs_mask.shape[0]
 
  if self.simulate_missing:
+ X_for_sim = self.ground_truth012_.astype(np.float32, copy=True)
+ X_for_sim[X_for_sim < 0] = -9.0
+
  # Use the same transformer as VAE
  tr = SimMissingTransformer(
  genotype_data=self.genotype_data,
@@ -322,11 +333,7 @@ class ImputeMostFrequent:
  verbose=self.verbose,
  **self.sim_kwargs,
  )
- # Fit on 0/1/2 with -1 for missing, like VAE
- X_for_sim = self.ground_truth012_.astype(float, copy=True)
- X_for_sim[X_for_sim < 0] = np.nan
  tr.fit(X_for_sim)
-
  sim_mask_global = tr.sim_missing_mask_.astype(bool)
 
  # Don't simulate on already-missing cells
@@ -359,7 +366,7 @@ class ImputeMostFrequent:
  self.X_train_df_ = df_sim
  self.is_fit_ = True
 
- # Save parameters (unchanged)
+ # Save parameters
  best_params = self.cfg.to_dict()
  params_fp = self.parameters_dir / "best_parameters.json"
  with open(params_fp, "w") as f:
@@ -389,11 +396,14 @@ class ImputeMostFrequent:
  msg = "Model is not fitted. Call fit() before transform()."
  self.logger.error(msg)
  raise NotFittedError(msg)
- assert self.X_train_df_ is not None
+
+ assert (
+ self.X_train_df_ is not None
+ ), f"[{self.model_name}] X_train_df_ is not set after fit()."
 
  # 1) Impute the evaluation-masked copy (to compute metrics)
  imputed_eval_df = self._impute_df(self.X_train_df_)
- X_imputed_eval = imputed_eval_df.to_numpy(dtype=np.int16)
+ X_imputed_eval = imputed_eval_df.to_numpy(dtype=np.int8)
  self.X_imputed012_ = X_imputed_eval
 
  # Evaluate like DL models (0/1/2, then 10-class from decoded strings)
@@ -401,22 +411,31 @@ class ImputeMostFrequent:
 
  # 2) Impute the FULL dataset (only true missings)
  df_missingonly = pd.DataFrame(self.ground_truth012_, dtype=np.float32)
- df_missingonly.replace(self.missing, np.nan, inplace=True)
+ df_missingonly[df_missingonly < 0] = np.nan
+
  imputed_full_df = self._impute_df(df_missingonly)
- X_imputed_full_012 = imputed_full_df.to_numpy(dtype=np.int16)
+ X_imputed_full_012 = imputed_full_df.to_numpy(dtype=np.int8)
+
+ neg = int(np.count_nonzero(X_imputed_full_012 < 0))
+ if neg:
+ msg = f"{neg} negative entries remain after REF imputation. Unique: {np.unique(X_imputed_full_012[X_imputed_full_012 < 0])[:10]}"
+ self.logger.error(msg)
+ raise RuntimeError(msg)
 
  # Plot distributions (parity with DL transform())
  if self.ground_truth012_ is None:
- raise NotFittedError(
- "ground_truth012_ is not set; cannot plot distributions."
- )
+ msg = "ground_truth012_ is not set; cannot plot distributions."
+ self.logger.error(msg)
+ raise NotFittedError(msg)
+
+ imp_decoded = self.decode_012(X_imputed_full_012)
 
- gt_decoded = self.encoder.decode_012(self.ground_truth012_)
- imp_decoded = self.encoder.decode_012(X_imputed_full_012)
- self.plotter_.plot_gt_distribution(gt_decoded, is_imputed=False)
- self.plotter_.plot_gt_distribution(imp_decoded, is_imputed=True)
+ if self.show_plots:
+ gt_decoded = self.decode_012(self.ground_truth012_)
+ self.plotter_.plot_gt_distribution(gt_decoded, is_imputed=False)
+ self.plotter_.plot_gt_distribution(imp_decoded, is_imputed=True)
 
- # Return IUPAC strings (same as DL .transform())
+ # Return IUPAC strings
  return imp_decoded
 
  def _impute_df(self, df_in: pd.DataFrame) -> pd.DataFrame:
@@ -452,7 +471,7 @@ class ImputeMostFrequent:
  df = df_in.fillna(modes)
  else:
  df = df_in.copy()
- return df.astype(np.int16)
+ return df.astype(np.int8)
 
  def _impute_by_population_mode(self, df_in: pd.DataFrame) -> pd.DataFrame:
  """Impute missing cells in df_in using population-specific modes.
@@ -466,7 +485,7 @@ class ImputeMostFrequent:
  pd.DataFrame: DataFrame with missing values imputed.
  """
  if not df_in.isnull().values.any():
- return df_in.astype(np.int16)
+ return df_in.astype(np.int8)
 
  df = df_in.copy()
  pops = pd.Series(self.pops, index=df.index)
@@ -489,7 +508,7 @@ class ImputeMostFrequent:
  mask = np.isnan(values)
  values[mask] = replacements[mask]
 
- return pd.DataFrame(values, columns=df.columns, index=df.index).astype(np.int16)
+ return pd.DataFrame(values, columns=df.columns, index=df.index).astype(np.int8)
 
  def _series_mode(self, s: pd.Series) -> int:
  """Compute the mode of a pandas Series, ignoring NaNs.
@@ -505,11 +524,13 @@ class ImputeMostFrequent:
  s_valid = s.dropna().astype(int)
  if s_valid.empty:
  return self.default
+
  # Mode among {0,1,2}; if ties, pandas picks the smallest (okay)
  mode_val = int(s_valid.mode().iloc[0])
  if mode_val not in (0, 1, 2):
  # Safety: clamp to valid zygosity in case of odd inputs
  mode_val = self.default if self.default in (0, 1, 2) else 0
+
  return mode_val
 
  def _evaluate_and_report(self) -> None:
@@ -540,8 +561,8 @@ class ImputeMostFrequent:
  X_pred_eval = self.ground_truth012_.copy()
  X_pred_eval[self.sim_mask_] = self.X_imputed012_[self.sim_mask_]
 
- y_true_dec = self.encoder.decode_012(self.ground_truth012_)
- y_pred_dec = self.encoder.decode_012(X_pred_eval)
+ y_true_dec = self.decode_012(self.ground_truth012_)
+ y_pred_dec = self.decode_012(X_pred_eval)
 
  encodings_dict = {
  "A": 0,
@@ -565,43 +586,35 @@ class ImputeMostFrequent:
 
  y_true_10 = y_true_int[self.sim_mask_]
  y_pred_10 = y_pred_int[self.sim_mask_]
+
+ m = (y_true_10 >= 0) & (y_pred_10 >= 0)
+ y_true_10, y_pred_10 = y_true_10[m], y_pred_10[m]
+ if y_true_10.size == 0:
+ self.logger.warning(
+ "No valid IUPAC test cells; skipping 10-class evaluation."
+ )
+ return
+
  self._evaluate_iupac10_and_plot(y_true_10, y_pred_10)
 
  def _evaluate_012_and_plot(self, y_true: np.ndarray, y_pred: np.ndarray) -> None:
  """0/1/2 zygosity report & confusion matrix.
 
- This method generates a classification report and confusion matrix for genotypes encoded as 0 (REF), 1 (HET), and 2 (ALT). If the data is determined to be haploid (only 0 and 2 present), it folds the ALT genotype (2) into HET (1) for evaluation purposes. The method computes various performance metrics, logs the classification report, and creates visualizations of the results.
+ This method generates a classification report and confusion matrix for genotypes encoded as 0 (REF), 1 (HET), and 2 (ALT). If the data is haploid (only 0 and 2 present), it folds ALT (2) into the binary ALT/PRESENT class (1) for evaluation. The method computes metrics, logs the report, and creates visualizations of the results.
 
  Args:
  y_true (np.ndarray): True genotypes (0/1/2) for masked
  y_pred (np.ndarray): Predicted genotypes (0/1/2) for masked
-
- Raises:
- NotFittedError: If fit() and transform() have not been called.
  """
- labels = [0, 1, 2]
+ labels: list[int] = [0, 1, 2]
+ report_names: list[str] = ["REF", "HET", "ALT"]
+
  # Haploid parity: fold ALT (2) into ALT/Present (1)
  if self.is_haploid_:
- y_true[y_true == 2] = 1
- y_pred[y_pred == 2] = 1
- labels = [0, 1]
-
- metrics = {
- "n_masked_test": int(y_true.size),
- "accuracy": accuracy_score(y_true, y_pred),
- "f1": f1_score(
- y_true, y_pred, average="macro", labels=labels, zero_division=0
- ),
- "precision": precision_score(
- y_true, y_pred, average="macro", labels=labels, zero_division=0
- ),
- "recall": recall_score(
- y_true, y_pred, average="macro", labels=labels, zero_division=0
- ),
- }
- self.metrics_.update({f"zygosity_{k}": v for k, v in metrics.items()})
-
- report_names = ["REF", "HET"] if self.is_haploid_ else ["REF", "HET", "ALT"]
+ y_true = np.where(y_true == 2, 1, y_true)
+ y_pred = np.where(y_pred == 2, 1, y_pred)
+ labels: list[int] = [0, 1]
+ report_names: list[str] = ["REF", "ALT"]
 
  report: dict | str = classification_report(
  y_true,
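The haploid fold in the hunk above now uses np.where instead of in-place masked assignment, so the caller's arrays are no longer mutated. A standalone illustration (my example, not package code):

# np.where returns a new array, leaving the caller's input untouched.
import numpy as np

y = np.array([0, 2, 2, 1])
folded = np.where(y == 2, 1, y)  # ALT (2) folded into class 1
assert folded.tolist() == [0, 1, 1, 1]
assert y.tolist() == [0, 2, 2, 1]  # original unchanged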
@@ -617,50 +630,46 @@ class ImputeMostFrequent:
  self.logger.error(msg)
  raise TypeError(msg)
 
- report_subset = {}
- for k, v in report.items():
- tmp = {}
- if isinstance(v, dict) and "support" in v:
- for k2, v2 in v.items():
- if k2 != "support":
- tmp[k2] = v2
- if tmp:
- report_subset[k] = tmp
-
- if report_subset:
- pm = PrettyMetrics(
- report_subset,
- precision=3,
- title=f"{self.model_name} Zygosity Report",
+ if self.show_plots:
+ viz = ClassificationReportVisualizer(reset_kwargs=self.plotter_.param_dict)
+
+ plots = viz.plot_all(
+ report,
+ title_prefix=f"{self.model_name} Zygosity Report",
+ show=self.show_plots,
+ heatmap_classes_only=True,
  )
- pm.render()
 
- viz = ClassificationReportVisualizer(reset_kwargs=self.plotter_.param_dict)
+ for name, fig in plots.items():
+ fout = self.plots_dir / f"zygosity_report_{name}.{self.plot_format}"
+ if hasattr(fig, "savefig") and isinstance(fig, Figure):
+ fig.savefig(fout, dpi=300, facecolor="#111122")
+ plt.close(fig)
+ elif isinstance(fig, PlotlyFigure):
+ fig.write_html(file=fout.with_suffix(".html"))
 
- plots = viz.plot_all(
- report,
- title_prefix=f"{self.model_name} Zygosity Report",
- show=getattr(self, "show_plots", False),
- heatmap_classes_only=True,
- )
+ viz._reset_mpl_style()
 
- for name, fig in plots.items():
- fout = self.plots_dir / f"zygosity_report_{name}.{self.plot_format}"
- if hasattr(fig, "savefig") and isinstance(fig, Figure):
- fig.savefig(fout, dpi=300, facecolor="#111122")
- plt.close(fig)
- elif isinstance(fig, PlotlyFigure):
- fig.write_html(file=fout.with_suffix(".html"))
+ # Confusion matrix
+ self.plotter_.plot_confusion_matrix(
+ y_true, y_pred, label_names=report_names, prefix="zygosity"
+ )
 
- viz._reset_mpl_style()
+ # ------ Additional metrics ------
+ report_full = self._additional_metrics(
+ y_true, y_pred, labels, report_names, report
+ )
 
- # Save JSON
- self._save_report(report, suffix="zygosity")
+ if self.verbose or self.debug:
+ pm = PrettyMetrics(
+ report_full,
+ precision=2,
+ title=f"{self.model_name} Zygosity Report",
+ )
+ pm.render()
 
- # Confusion matrix
- self.plotter_.plot_confusion_matrix(
- y_true, y_pred, label_names=report_names, prefix="zygosity"
- )
+ # Save JSON
+ self._save_report(report_full, suffix="zygosity")
 
  def _evaluate_iupac10_and_plot(
  self, y_true: np.ndarray, y_pred: np.ndarray
@@ -672,32 +681,18 @@ class ImputeMostFrequent:
  Args:
  y_true (np.ndarray): True genotypes (0-9) for masked
  y_pred (np.ndarray): Predicted genotypes (0-9) for masked
-
- Raises:
- NotFittedError: If fit() and transform() have not been called.
  """
  labels_idx = list(range(10))
- labels_names = ["A", "C", "G", "T", "W", "R", "M", "K", "Y", "S"]
-
- metrics = {
- "accuracy": accuracy_score(y_true, y_pred),
- "f1": f1_score(
- y_true, y_pred, average="macro", labels=labels_idx, zero_division=0
- ),
- "precision": precision_score(
- y_true, y_pred, average="macro", labels=labels_idx, zero_division=0
- ),
- "recall": recall_score(
- y_true, y_pred, average="macro", labels=labels_idx, zero_division=0
- ),
- }
- self.metrics_.update({f"iupac_{k}": v for k, v in metrics.items()})
+ report_names = ["A", "C", "G", "T", "W", "R", "M", "K", "Y", "S"]
+
+ # Create an identity matrix and use the targets array as indices
+ y_score = np.eye(len(report_names))[y_pred]
 
  report: dict | str = classification_report(
  y_true,
  y_pred,
  labels=labels_idx,
- target_names=labels_names,
+ target_names=report_names,
  zero_division=0,
  output_dict=True,
  )
@@ -707,54 +702,50 @@ class ImputeMostFrequent:
  self.logger.error(msg)
  raise TypeError(msg)
 
- report_subset = {}
- for k, v in report.items():
- tmp = {}
- if isinstance(v, dict) and "support" in v:
- for k2, v2 in v.items():
- if k2 != "support":
- tmp[k2] = v2
- if tmp:
- report_subset[k] = tmp
-
- if report_subset:
- pm = PrettyMetrics(
- report_subset,
- precision=3,
- title=f"{self.model_name} IUPAC 10-Class Report",
+ if self.show_plots:
+ viz = ClassificationReportVisualizer(reset_kwargs=self.plotter_.param_dict)
+
+ plots = viz.plot_all(
+ report,
+ title_prefix=f"{self.model_name} IUPAC Report",
+ show=self.show_plots,
+ heatmap_classes_only=True,
  )
- pm.render()
 
- viz = ClassificationReportVisualizer(reset_kwargs=self.plotter_.param_dict)
+ # Reset the style from Optuna's plotting.
+ plt.rcParams.update(self.plotter_.param_dict)
 
- plots = viz.plot_all(
- report,
- title_prefix=f"{self.model_name} IUPAC Report",
- show=getattr(self, "show_plots", False),
- heatmap_classes_only=True,
- )
+ for name, fig in plots.items():
+ fout = self.plots_dir / f"iupac_report_{name}.{self.plot_format}"
+ if hasattr(fig, "savefig") and isinstance(fig, Figure):
+ fig.savefig(fout, dpi=300, facecolor="#111122")
+ plt.close(fig)
+ elif isinstance(fig, PlotlyFigure):
+ fig.write_html(file=fout.with_suffix(".html"))
 
- # Reset the style from Optuna's plotting.
- plt.rcParams.update(self.plotter_.param_dict)
+ # Reset the style
+ viz._reset_mpl_style()
 
- for name, fig in plots.items():
- fout = self.plots_dir / f"iupac_report_{name}.{self.plot_format}"
- if hasattr(fig, "savefig") and isinstance(fig, Figure):
- fig.savefig(fout, dpi=300, facecolor="#111122")
- plt.close(fig)
- elif isinstance(fig, PlotlyFigure):
- fig.write_html(file=fout.with_suffix(".html"))
+ # Confusion matrix
+ self.plotter_.plot_confusion_matrix(
+ y_true, y_pred, label_names=report_names, prefix="iupac"
+ )
 
- # Reset the style
- viz._reset_mpl_style()
+ # ------ Additional metrics ------
+ report_full = self._additional_metrics(
+ y_true, y_pred, labels_idx, report_names, report
+ )
 
- # Save JSON
- self._save_report(report, suffix="iupac")
+ if self.verbose or self.debug:
+ pm = PrettyMetrics(
+ report_full,
+ precision=2,
+ title=f"{self.model_name} IUPAC 10-Class Report",
+ )
+ pm.render()
 
- # Confusion matrix
- self.plotter_.plot_confusion_matrix(
- y_true, y_pred, label_names=labels_names, prefix="iupac"
- )
+ # Save JSON
+ self._save_report(report_full, suffix="iupac")
 
  def _make_train_test_split(self) -> Tuple[np.ndarray, np.ndarray]:
  """Create train/test split indices.
@@ -780,14 +771,14 @@ class ImputeMostFrequent:
  buckets = []
  for pop in np.unique(self.pops):
  rows = np.where(self.pops == pop)[0]
- k = int(round(self.test_size * rows.size))
+ k = max(1, int(round(self.test_size * rows.size)))
  if k > 0:
  buckets.append(self.rng.choice(rows, size=k, replace=False))
  test_idx = (
  np.sort(np.concatenate(buckets)) if buckets else np.array([], dtype=int)
  )
  else:
- k = int(round(self.test_size * n))
+ k = max(1, int(round(self.test_size * n)))
  test_idx = (
  self.rng.choice(n, size=k, replace=False)
  if k > 0
@@ -797,13 +788,13 @@ class ImputeMostFrequent:
  train_idx = np.setdiff1d(all_idx, test_idx, assume_unique=False)
  return train_idx, test_idx
 
- def _save_report(self, report_dict: Dict[str, float], suffix: str) -> None:
+ def _save_report(self, report_dict: Dict[str, Any], suffix: str) -> None:
  """Save classification report dictionary as a JSON file.
 
  This method saves the provided classification report dictionary to a JSON file in the metrics directory, appending the specified suffix to the filename.
 
  Args:
- report_dict (Dict[str, float]): The classification report dictionary to save.
+ report_dict (Dict[str, Any]): The classification report dictionary to save.
  suffix (str): Suffix to append to the filename (e.g., 'zygosity' or 'iupac').
 
  Raises:
@@ -842,3 +833,305 @@ class ImputeMostFrequent:
  msg = f"Failed to create directory {getattr(self, f'{d}_dir')}: {e}"
  self.logger.error(msg)
  raise Exception(msg)
+
+ def decode_012(
+ self, X: np.ndarray | pd.DataFrame | list[list[int]], is_nuc: bool = False
+ ) -> np.ndarray:
+ """Decode 012-encodings to IUPAC chars with metadata repair.
+
+ This method converts genotype calls encoded as integers (0, 1, 2, etc.) into their corresponding IUPAC nucleotide codes. It supports two modes of decoding:
+ 1. Nucleotide mode (`is_nuc=True`): Decodes integer codes (0-9) directly to IUPAC nucleotide codes.
+ 2. Metadata mode (`is_nuc=False`): Uses reference and alternate allele metadata to determine the appropriate IUPAC codes. If metadata is missing or inconsistent, the method attempts to repair the decoding by scanning the source SNP data for valid IUPAC codes.
+
+ Args:
+ X (np.ndarray | pd.DataFrame | list[list[int]]): Input genotype calls as integers. Can be a NumPy array, Pandas DataFrame, or nested list.
+ is_nuc (bool): If True, decode 0-9 nucleotide codes; else use ref/alt metadata. Defaults to False.
+
+ Returns:
+ np.ndarray: IUPAC strings as a 2D array of shape (n_samples, n_snps).
+
+ Notes:
+ - The method normalizes input values to handle various formats, including strings, lists, and arrays.
+ - It uses a predefined mapping of IUPAC codes to nucleotide bases and vice versa.
+ - Missing or invalid codes are represented as 'N' if they can't be resolved.
+ - The method includes repair logic to infer missing metadata from the source SNP data when necessary.
+
+ Raises:
+ ValueError: If input is not a DataFrame.
+ """
+ df = validate_input_type(X, return_type="df")
+
+ if not isinstance(df, pd.DataFrame):
+ msg = f"Expected a pandas.DataFrame in 'decode_012', but got: {type(df)}."
+ self.logger.error(msg)
+ raise ValueError(msg)
+
+ # IUPAC Definitions
+ iupac_to_bases: dict[str, set[str]] = {
+ "A": {"A"},
+ "C": {"C"},
+ "G": {"G"},
+ "T": {"T"},
+ "R": {"A", "G"},
+ "Y": {"C", "T"},
+ "S": {"G", "C"},
+ "W": {"A", "T"},
+ "K": {"G", "T"},
+ "M": {"A", "C"},
+ "B": {"C", "G", "T"},
+ "D": {"A", "G", "T"},
+ "H": {"A", "C", "T"},
+ "V": {"A", "C", "G"},
+ "N": set(),
+ }
+ bases_to_iupac = {
+ frozenset(v): k for k, v in iupac_to_bases.items() if k != "N"
+ }
+ missing_codes = {"", ".", "N", "NONE", "-", "?", "./.", ".|.", "NAN", "nan"}
+
+ def _normalize_iupac(value: object) -> str | None:
+ """Normalize an input into a single IUPAC code token or None."""
+ if value is None:
+ return None
+
+ # Bytes -> str (make type narrowing explicit)
+ if isinstance(value, (bytes, np.bytes_)):
+ value = bytes(value).decode("utf-8", errors="ignore")
+
+ # Handle list/tuple/array/Series: take first valid
+ if isinstance(value, (list, tuple, pd.Series, np.ndarray)):
+ # Convert Series to numpy array for consistent behavior
+ if isinstance(value, pd.Series):
+ arr = value.to_numpy()
+ else:
+ arr = value
+
+ # Scalar numpy array fast path
+ if isinstance(arr, np.ndarray) and arr.ndim == 0:
+ return _normalize_iupac(arr.item())
+
+ # Empty sequence/array
+ if len(arr) == 0:
+ return None
+
+ # First valid element wins
+ for item in arr:
+ code = _normalize_iupac(item)
+ if code is not None:
+ return code
+ return None
+
+ s = str(value).upper().strip()
+ if not s or s in missing_codes:
+ return None
+
+ if "," in s:
+ for tok in (t.strip() for t in s.split(",")):
+ if tok and tok not in missing_codes and tok in iupac_to_bases:
+ return tok
+ return None
+
+ return s if s in iupac_to_bases else None
+
+ codes_df = df.apply(pd.to_numeric, errors="coerce")
+ codes = codes_df.fillna(-1).astype(np.int8).to_numpy()
+ n_rows, n_cols = codes.shape
+
+ if is_nuc:
+ iupac_list = np.array(
+ ["A", "C", "G", "T", "W", "R", "M", "K", "Y", "S"], dtype="<U1"
+ )
+ out = np.full((n_rows, n_cols), "N", dtype="<U1")
+ mask = (codes >= 0) & (codes <= 9)
+ out[mask] = iupac_list[codes[mask]]
+ return out
+
+ # Metadata fetch
+ ref_alleles = getattr(self.genotype_data, "ref", [])
+ alt_alleles = getattr(self.genotype_data, "alt", [])
+
+ if len(ref_alleles) != n_cols:
+ ref_alleles = getattr(self, "_ref", [None] * n_cols)
+ if len(alt_alleles) != n_cols:
+ alt_alleles = getattr(self, "_alt", [None] * n_cols)
+
+ # Ensure list length matches
+ if len(ref_alleles) != n_cols:
+ ref_alleles = [None] * n_cols
+ if len(alt_alleles) != n_cols:
+ alt_alleles = [None] * n_cols
+
+ out = np.full((n_rows, n_cols), "N", dtype="<U1")
+ source_snp_data = None
+
+ for j in range(n_cols):
+ ref = _normalize_iupac(ref_alleles[j])
+ alt = _normalize_iupac(alt_alleles[j])
+
+ # --- REPAIR LOGIC ---
+ # If metadata is missing, scan the source column.
+ if ref is None or alt is None:
+ if source_snp_data is None and self.genotype_data.snp_data is not None:
+ try:
+ source_snp_data = np.asarray(self.genotype_data.snp_data)
+ except Exception:
+ pass # if lazy loading fails
+
+ if source_snp_data is not None:
+ try:
+ col_data = source_snp_data[:, j]
+ uniques = set()
+ # Optimization: check up to 200 non-empty values
+ count = 0
+ for val in col_data:
+ norm = _normalize_iupac(val)
+ if norm:
+ uniques.add(norm)
+ count += 1
+ if len(uniques) >= 2 or count > 200:
+ break
+
+ sorted_u = sorted(list(uniques))
+ if len(sorted_u) >= 1 and ref is None:
+ ref = sorted_u[0]
+ if len(sorted_u) >= 2 and alt is None:
+ alt = sorted_u[1]
+ except Exception:
+ pass
+
+ # --- DEFAULTS FOR MISSING ---
+ # If still missing, we cannot decode.
+ if ref is None and alt is None:
+ ref = "N"
+ alt = "N"
+ elif ref is None:
+ ref = alt
+ elif alt is None:
+ alt = ref # Monomorphic site: ALT becomes REF
+
+ # --- COMPUTE HET CODE ---
+ if ref == alt:
+ het_code = ref
+ else:
+ ref_set = iupac_to_bases.get(ref, set()) if ref is not None else set()
+ alt_set = iupac_to_bases.get(alt, set()) if alt is not None else set()
+ union_set = frozenset(ref_set | alt_set)
+ het_code = bases_to_iupac.get(union_set, "N")
+
+ # --- ASSIGNMENT WITH SAFETY FALLBACKS ---
+ col_codes = codes[:, j]
+
+ # Case 0: REF
+ if ref != "N":
+ out[col_codes == 0, j] = ref
+
+ # Case 1: HET
+ if het_code != "N":
+ out[col_codes == 1, j] = het_code
+ else:
+ # If HET code is invalid (e.g. ref='A', alt='N'),
+ # fallback to REF
+ # Fix for an issue where a HET prediction at a monomorphic site
+ # produced 'N'
+ if ref != "N":
+ out[col_codes == 1, j] = ref
+
+ # Case 2: ALT
+ if alt != "N":
+ out[col_codes == 2, j] = alt
+ else:
+ # If ALT is invalid (e.g. ref='A', alt='N'), fallback to REF
+ # Fix for an issue where an ALT prediction on a monomorphic site
+ # produced 'N'
+ if ref != "N":
+ out[col_codes == 2, j] = ref
+
+ return out
+
+ def _additional_metrics(
+ self,
+ y_true: np.ndarray,
+ y_pred: np.ndarray,
+ labels: list[int],
+ report_names: list[str],
+ report: dict[str, dict[str, float] | float],
+ ) -> dict[str, dict[str, float] | float]:
+ """Compute additional metrics and augment the report dictionary.
+
+ Args:
+ y_true (np.ndarray): True genotypes.
+ y_pred (np.ndarray): Predicted genotypes.
+ labels (list[int]): List of label indices.
+ report_names (list[str]): List of report names corresponding to labels.
+ report (dict[str, dict[str, float] | float]): Classification report dictionary to augment.
+
+ Returns:
+ dict[str, dict[str, float] | float]: Augmented report dictionary with additional metrics.
+ """
+ # Create an identity matrix and use the targets array as indices
+ y_score = np.eye(len(report_names))[y_pred]
+
+ # Per-class metrics
+ ap_pc = average_precision_score(y_true, y_score, average=None)
+ jaccard_pc = jaccard_score(
+ y_true, y_pred, average=None, labels=labels, zero_division=0
+ )
+
+ # Macro/weighted metrics
+ ap_macro = average_precision_score(y_true, y_score, average="macro")
+ ap_weighted = average_precision_score(y_true, y_score, average="weighted")
+ jaccard_macro = jaccard_score(y_true, y_pred, average="macro", zero_division=0)
+ jaccard_weighted = jaccard_score(
+ y_true, y_pred, average="weighted", zero_division=0
+ )
+
+ # Matthews correlation coefficient (MCC)
+ mcc = matthews_corrcoef(y_true, y_pred)
+
+ if not isinstance(ap_pc, np.ndarray):
+ msg = "average_precision_score or f1_score did not return np.ndarray as expected."
+ self.logger.error(msg)
+ raise TypeError(msg)
+
+ if not isinstance(jaccard_pc, np.ndarray):
+ msg = "jaccard_score did not return np.ndarray as expected."
+ self.logger.error(msg)
+ raise TypeError(msg)
+
+ # Add per-class metrics
+ report_full = {}
+ dd_subset = {
+ k: v for k, v in report.items() if k in report_names and isinstance(v, dict)
+ }
+ for i, class_name in enumerate(report_names):
+ class_report: dict[str, float] = {}
+ if class_name in dd_subset:
+ class_report = dd_subset[class_name]
+
+ if isinstance(class_report, float) or not class_report:
+ continue
+
+ report_full[class_name] = dict(class_report)
+ report_full[class_name]["average-precision"] = float(ap_pc[i])
+ report_full[class_name]["jaccard"] = float(jaccard_pc[i])
+
+ macro_avg = report.get("macro avg")
+ if isinstance(macro_avg, dict):
+ report_full["macro avg"] = dict(macro_avg)
+ report_full["macro avg"]["average-precision"] = float(ap_macro)
+ report_full["macro avg"]["jaccard"] = float(jaccard_macro)
+
+ weighted_avg = report.get("weighted avg")
+ if isinstance(weighted_avg, dict):
+ report_full["weighted avg"] = dict(weighted_avg)
+ report_full["weighted avg"]["average-precision"] = float(ap_weighted)
+ report_full["weighted avg"]["jaccard"] = float(jaccard_weighted)
+
+ # Add scalar summary metrics
+ report_full["mcc"] = float(mcc)
+ accuracy_val = report.get("accuracy")
+
+ if isinstance(accuracy_val, (int, float)):
+ report_full["accuracy"] = float(accuracy_val)
+
+ return report_full
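The new _additional_metrics feeds hard class predictions to average_precision_score by one-hot indexing an identity matrix (np.eye(len(report_names))[y_pred]). A standalone sketch of the same metric calls on toy data (my example, not package code; note that sklearn's average_precision_score expects indicator-format targets, so the sketch binarizes y_true the same way):

# One-hot "scores" from hard predictions, plus the other 1.7.0 metrics.
import numpy as np
from sklearn.metrics import average_precision_score, jaccard_score, matthews_corrcoef

y_true = np.array([0, 1, 2, 2, 1])
y_pred = np.array([0, 1, 2, 1, 1])

y_score = np.eye(3)[y_pred]     # rows are one-hot over the 3 classes
y_true_ind = np.eye(3)[y_true]  # indicator-format targets for AP

ap_macro = average_precision_score(y_true_ind, y_score, average="macro")
jac_macro = jaccard_score(y_true, y_pred, average="macro", zero_division=0)
mcc = matthews_corrcoef(y_true, y_pred)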