PyPI - pg-sui - Versions diffs - 1.6.16a3__py3-none-any.whl → 1.7.0__py3-none-any.whl - Mend

pg-sui: pg_sui-1.6.16a3-py3-none-any.whl → pg_sui-1.7.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33)

{pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/METADATA +26 -30
{pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/RECORD +29 -33
pgsui/__init__.py +0 -8
pgsui/_version.py +2 -2
pgsui/cli.py +577 -125
pgsui/data_processing/config.py +1 -2
pgsui/data_processing/containers.py +203 -530
pgsui/data_processing/transformers.py +44 -20
pgsui/impute/deterministic/imputers/mode.py +475 -182
pgsui/impute/deterministic/imputers/ref_allele.py +454 -147
pgsui/impute/supervised/imputers/hist_gradient_boosting.py +4 -3
pgsui/impute/supervised/imputers/random_forest.py +3 -2
pgsui/impute/unsupervised/base.py +1269 -534
pgsui/impute/unsupervised/callbacks.py +28 -33
pgsui/impute/unsupervised/imputers/autoencoder.py +870 -841
pgsui/impute/unsupervised/imputers/vae.py +931 -787
pgsui/impute/unsupervised/loss_functions.py +156 -202
pgsui/impute/unsupervised/models/autoencoder_model.py +7 -49
pgsui/impute/unsupervised/models/vae_model.py +40 -221
pgsui/impute/unsupervised/nn_scorers.py +53 -13
pgsui/utils/classification_viz.py +240 -97
pgsui/utils/misc.py +201 -3
pgsui/utils/plotting.py +73 -58
pgsui/utils/pretty_metrics.py +2 -6
pgsui/utils/scorers.py +39 -0
pgsui/impute/unsupervised/imputers/nlpca.py +0 -1666
pgsui/impute/unsupervised/imputers/ubp.py +0 -1660
pgsui/impute/unsupervised/models/nlpca_model.py +0 -206
pgsui/impute/unsupervised/models/ubp_model.py +0 -200
{pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/WHEEL +0 -0
{pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/entry_points.txt +0 -0
{pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/licenses/LICENSE +0 -0
{pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/top_level.txt +0 -0

pgsui/impute/deterministic/imputers/ref_allele.py CHANGED Viewed

@@ -1,4 +1,5 @@
 # Standard library
+import copy
 import json
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union
@@ -11,16 +12,16 @@ from matplotlib.figure import Figure
 from plotly.graph_objs import Figure as PlotlyFigure
 from sklearn.exceptions import NotFittedError
 from sklearn.metrics import (
-    accuracy_score,
+    average_precision_score,
     classification_report,
-    f1_score,
-    precision_score,
-    recall_score,
+    jaccard_score,
+    matthews_corrcoef,
 )
 # Project
 from snpio import GenotypeEncoder
 from snpio.utils.logging import LoggerManager
+from snpio.utils.misc import validate_input_type
 from pgsui.data_processing.config import apply_dot_overrides, load_yaml_to_dataclass
 from pgsui.data_processing.containers import RefAlleleConfig
@@ -58,6 +59,7 @@ def ensure_refallele_config(
     if isinstance(config, str):
         return load_yaml_to_dataclass(config, RefAlleleConfig)
     if isinstance(config, dict):
+        config = copy.deepcopy(config)  # copy
         base = RefAlleleConfig()
         # honor optional top-level 'preset'
         preset = config.pop("preset", None)
@@ -82,9 +84,9 @@ def ensure_refallele_config(
 class ImputeRefAllele:
-    """Deterministic imputer that replaces all missing 0/1/2 genotype values with the REF genotype (0).
+    """Deterministic imputer that fills missing genotypes with REF (0).
-    The imputer works on 0/1/2 with -1 as missing. Evaluation splits samples into TRAIN/TEST once. Masks ALL originally observed cells on TEST rows for eval. Produces: 0/1/2 (zygosity) classification report + confusion matrix 10-class IUPAC classification report (via decode_012) + confusion matrix. Plots genotype distribution before/after imputation.
+    Operates on 0/1/2 encodings with missing values represented by any negative integer. Evaluation splits samples into TRAIN/TEST once, then evaluates on either all observed test cells or a simulated-missing subset (depending on config). Produces 0/1/2 (zygosity) and 10-class IUPAC reports plus confusion matrices, and plots genotype distributions before/after imputation. Output is returned as IUPAC strings via ``decode_012``.
     """
     def __init__(
@@ -107,16 +109,16 @@ class ImputeRefAllele:
     ) -> None:
         """Initialize the Ref-Allele imputer from a unified config.
-        This constructor ensures that the provided configuration is valid and initializes the imputer's internal state. It sets up logging, random number generation, genotype encoding, and various parameters based on the configuration. The imputer is prepared to handle population-specific modes if specified in the configuration.
+        This constructor ensures that the provided configuration is valid and initializes the imputer's internal state. It sets up logging, random number generation, genotype encoding, and simulated-missing controls.
         Args:
             genotype_data (GenotypeData): Backing genotype data.
-            tree_parser (Optional[TreeParser]): Optional SNPio phylogenetic tree parser for population-specific modes.
+            tree_parser (Optional[TreeParser]): Optional SNPio tree parser for nonrandom simulated-missing modes.
             config (RefAlleleConfig | dict | str | None): Configuration as a dataclass, nested dict, or YAML path. If None, defaults are used.
-            overrides (dict | None): Flat dot-key overrides applied last with highest precedence, e.g. {'split.test_size': 0.25, 'algo.missing': -1}.
+            overrides (Optional[dict]): Flat dot-key overrides applied last with highest precedence, e.g. {'split.test_size': 0.25, 'algo.missing': -1}.
             simulate_missing (bool): Whether to simulate missing data during evaluation. Default is True.
-            sim_strategy (Literal): Strategy for simulating missing data if enabled in config.
-            sim_prop (float): Proportion of data to simulate as missing if enabled in config.
+            sim_strategy (Literal["random", "random_weighted", "random_weighted_inv", "nonrandom", "nonrandom_weighted"]): Strategy for simulating missing data if enabled in config.
+            sim_prop (float): Proportion of data to simulate as missing if enabled in config. Default is 0.2.
             sim_kwargs (Optional[dict]): Additional keyword arguments for the simulated missing data transformer.
         """
         # Normalize config then apply highest-precedence overrides
@@ -153,7 +155,7 @@ class ImputeRefAllele:
         self.plots_dir: Path
         self.metrics_dir: Path
         self.parameters_dir: Path
-        self.model_dir: Path
+        self.models_dir: Path
         self.optimize_dir: Path
         # Logger
@@ -174,7 +176,7 @@ class ImputeRefAllele:
         self.encoder = GenotypeEncoder(self.genotype_data)
         # Work in 0/1/2 with -1 for missing
-        X012 = self.encoder.genotypes_012.astype(np.int16, copy=True)
+        X012 = self.encoder.genotypes_012.astype(np.int8, copy=True)
         X012[X012 < 0] = -1
         self.X012_ = X012
         self.num_features_ = X012.shape[1]
@@ -199,8 +201,8 @@ class ImputeRefAllele:
         self.metrics_: Dict[str, int | float] = {}
         # Ploidy heuristic for 0/1/2 scoring parity
-        uniq = np.unique(self.X012_[self.X012_ != -1])
-        self.is_haploid_ = np.array_equal(np.sort(uniq), np.array([0, 2]))
+        self.ploidy = self.cfg.io.ploidy
+        self.is_haploid_ = self.ploidy == 1
         # Plotting (use config)
         self.plot_format = cfg.plot.fmt
@@ -243,8 +245,7 @@ class ImputeRefAllele:
         # Use NaN for missing inside a DataFrame to leverage fillna
         df_all = pd.DataFrame(self.ground_truth012_, dtype=np.float32)
-        df_all = df_all.replace(self.missing, np.nan)
-        df_all = df_all.replace(-9, np.nan)  # Just in case
+        df_all[df_all < 0] = np.nan
         # Observed mask in the ORIGINAL data (before any simulated-missing)
         obs_mask = df_all.notna().to_numpy()  # shape (n_samples, n_loci)
@@ -256,6 +257,9 @@ class ImputeRefAllele:
         # Decide how to build the sim mask: legacy vs simulated-missing
         if getattr(self, "simulate_missing", False):
+            X_for_sim = self.ground_truth012_.astype(np.float32, copy=True)
+            X_for_sim[X_for_sim < 0] = -9.0
             # Simulate missing on the full matrix; we only use the mask.
             tr = SimMissingTransformer(
                 genotype_data=self.genotype_data,
@@ -267,7 +271,7 @@ class ImputeRefAllele:
                 verbose=self.verbose,
                 **(self.sim_kwargs or {}),
             )
-            tr.fit(self.ground_truth012_.copy())
+            tr.fit(X_for_sim)
             sim_mask_global = tr.sim_missing_mask_.astype(bool)
             # Only consider cells that were originally observed
@@ -317,12 +321,17 @@ class ImputeRefAllele:
             NotFittedError: If the model has not been fitted yet.
         """
         if not self.is_fit_:
-            raise NotFittedError("Model is not fitted. Call fit() before transform().")
-        assert self.X_train_df_ is not None
+            msg = "ImputeRefAllele instance is not fitted yet. Call 'fit()' before 'transform()'."
+            self.logger.error(msg)
+            raise NotFittedError(msg)
+        assert (
+            self.X_train_df_ is not None
+        ), f"[{self.model_name}] X_train_df_ is not set after fit()."
         # 1) Impute the evaluation-masked copy (compute metrics)
         imputed_eval_df = self._impute_ref(df_in=self.X_train_df_)
-        X_imputed_eval = imputed_eval_df.to_numpy(dtype=np.int16)
+        X_imputed_eval = imputed_eval_df.to_numpy(dtype=np.int8)
         self.X_imputed012_ = X_imputed_eval
         # Evaluate parity with DL models
@@ -330,23 +339,24 @@ class ImputeRefAllele:
         # 2) Impute the FULL dataset (only true missings)
         df_missingonly = pd.DataFrame(self.ground_truth012_, dtype=np.float32)
-        df_missingonly = df_missingonly.replace(self.missing, np.nan)
-        df_missingonly = df_missingonly.replace(-9, np.nan)  # Just in case
+        df_missingonly[df_missingonly < 0] = np.nan
         imputed_full_df = self._impute_ref(df_in=df_missingonly)
-        X_imputed_full_012 = imputed_full_df.to_numpy(dtype=np.int16)
+        X_imputed_full_012 = imputed_full_df.to_numpy(dtype=np.int8)
         # Plot distributions (like DL .transform())
         if self.ground_truth012_ is None:
-            msg = "ground_truth012_ is None; cannot plot distributions."
-            self.logger.error(msg)
+            msg = "ground_truth012_ is NoneType; cannot plot distributions."
+            self.logger.error(msg, exc_info=True)
+            raise NotFittedError(msg)
+        imp_decoded = self.decode_012(X_imputed_full_012)
-            raise NotFittedError("ground_truth012_ is None; cannot plot distributions.")
-        gt_decoded = self.encoder.decode_012(self.ground_truth012_)
-        imp_decoded = self.encoder.decode_012(X_imputed_full_012)
-        self.plotter_.plot_gt_distribution(gt_decoded, is_imputed=False)
-        self.plotter_.plot_gt_distribution(imp_decoded, is_imputed=True)
+        if self.show_plots:
+            gt_decoded = self.decode_012(self.ground_truth012_)
+            self.plotter_.plot_gt_distribution(gt_decoded, is_imputed=False)
+            self.plotter_.plot_gt_distribution(imp_decoded, is_imputed=True)
         # Return IUPAC strings
         return imp_decoded
@@ -365,7 +375,7 @@ class ImputeRefAllele:
         df = df_in.copy()
         # Fill all NaNs with 0 (homozygous REF) column-wise; constant so vectorized is fine
         df = df.fillna(0)
-        return df.astype(np.int16)
+        return df.astype(np.int8)
     def _evaluate_and_report(self) -> None:
         """Evaluate imputed vs. ground truth on masked test cells; produce reports and plots.
@@ -394,8 +404,8 @@ class ImputeRefAllele:
         X_pred_eval = self.ground_truth012_.copy()
         X_pred_eval[self.sim_mask_] = self.X_imputed012_[self.sim_mask_]
-        y_true_dec = self.encoder.decode_012(self.ground_truth012_)
-        y_pred_dec = self.encoder.decode_012(X_pred_eval)
+        y_true_dec = self.decode_012(self.ground_truth012_)
+        y_pred_dec = self.decode_012(X_pred_eval)
         encodings_dict = {
             "A": 0,
@@ -418,43 +428,37 @@ class ImputeRefAllele:
         )
         y_true_10 = y_true_int[self.sim_mask_]
         y_pred_10 = y_pred_int[self.sim_mask_]
+        m = (y_true_10 >= 0) & (y_pred_10 >= 0)
+        y_true_10, y_pred_10 = y_true_10[m], y_pred_10[m]
+        if y_true_10.size == 0:
+            self.logger.warning(
+                "No valid IUPAC test cells; skipping 10-class evaluation."
+            )
+            return
         self._evaluate_iupac10_and_plot(y_true_10, y_pred_10)
     def _evaluate_012_and_plot(self, y_true: np.ndarray, y_pred: np.ndarray) -> None:
         """0/1/2 zygosity report & confusion matrix.
-        This method generates a classification report and confusion matrix for genotypes encoded as 0 (REF), 1 (HET), and 2 (ALT). If the data is determined to be haploid (only 0 and 2 present), it folds the ALT genotype (2) into HET (1) for evaluation purposes. The method computes various performance metrics, logs the classification report, and creates visualizations of the results.
+        This method generates a classification report and confusion matrix for genotypes encoded as 0 (REF), 1 (HET), and 2 (ALT). If the data is haploid (only 0 and 2 present), it folds ALT (2) into the binary ALT/PRESENT class (1) for evaluation. The method computes metrics, logs the report, and creates visualizations of the results.
         Args:
             y_true (np.ndarray): True genotypes (0/1/2) for masked
-            y_pred (np.ndarray): Predicted genotypes (0/1/2) for
+            y_pred (np.ndarray): Predicted genotypes (0/1/2) for masked
         """
-        labels = [0, 1, 2]
-        report_names = ["REF", "HET", "ALT"]
+        labels: list[int] = [0, 1, 2]
+        report_names: list[str] = ["REF", "HET", "ALT"]
-        # Haploid parity: fold 2 -> 1
+        # Haploid parity: fold ALT (2) into ALT/Present (1)
         if self.is_haploid_:
-            y_true[y_true == 2] = 1
-            y_pred[y_pred == 2] = 1
-            labels = [0, 1]
-            report_names = ["REF", "ALT"]
-        metrics = {
-            "n_masked_test": int(y_true.size),
-            "accuracy": accuracy_score(y_true, y_pred),
-            "f1": f1_score(
-                y_true, y_pred, average="weighted", labels=labels, zero_division=0
-            ),
-            "precision": precision_score(
-                y_true, y_pred, average="weighted", labels=labels, zero_division=0
-            ),
-            "recall": recall_score(
-                y_true, y_pred, average="weighted", labels=labels, zero_division=0
-            ),
-        }
-        self.metrics_.update({f"zygosity_{k}": v for k, v in metrics.items()})
+            y_true = np.where(y_true == 2, 1, y_true)
+            y_pred = np.where(y_pred == 2, 1, y_pred)
+            labels: list[int] = [0, 1]
+            report_names: list[str] = ["REF", "ALT"]
-        report: str | dict = classification_report(
+        report: dict | str = classification_report(
             y_true,
             y_pred,
             labels=labels,
@@ -468,91 +472,69 @@ class ImputeRefAllele:
             self.logger.error(msg)
             raise TypeError(msg)
-        report_subset = {}
-        for k, v in report.items():
-            tmp = {}
-            if isinstance(v, dict) and "support" in v:
-                for k2, v2 in v.items():
-                    if k2 != "support":
-                        tmp[k2] = v2
-                if tmp:
-                    report_subset[k] = tmp
-        if report_subset:
-            pm = PrettyMetrics(
-                report_subset,
-                precision=3,
-                title=f"{self.model_name} Zygosity Report",
-            )
-            pm.render()
-        viz = ClassificationReportVisualizer(reset_kwargs=self.plotter_.param_dict)
+        if self.show_plots:
+            viz = ClassificationReportVisualizer(reset_kwargs=self.plotter_.param_dict)
-        if not isinstance(report, dict):
-            msg = "classification_report did not return a dict as expected."
-            self.logger.error(msg)
-            raise TypeError(msg)
+            plots = viz.plot_all(
+                report,
+                title_prefix=f"{self.model_name} Zygosity Report",
+                show=self.show_plots,
+                heatmap_classes_only=True,
+            )
-        plots = viz.plot_all(
-            report,
-            title_prefix=f"{self.model_name} Zygosity Report",
-            show=getattr(self, "show_plots", False),
-            heatmap_classes_only=True,
-        )
+            for name, fig in plots.items():
+                fout = self.plots_dir / f"zygosity_report_{name}.{self.plot_format}"
+                if hasattr(fig, "savefig") and isinstance(fig, Figure):
+                    fig.savefig(fout, dpi=300, facecolor="#111122")
+                    plt.close(fig)
+                elif isinstance(fig, PlotlyFigure):
+                    fig.write_html(file=fout.with_suffix(".html"))
-        # Reset the style from Optuna's plotting.
-        plt.rcParams.update(self.plotter_.param_dict)
+            viz._reset_mpl_style()
-        for name, fig in plots.items():
-            fout = self.plots_dir / f"zygosity_report_{name}.{self.plot_format}"
-            if hasattr(fig, "savefig") and isinstance(fig, Figure):
-                fig.savefig(fout, dpi=300, facecolor="#111122")
-                plt.close(fig)
-            elif isinstance(fig, PlotlyFigure):
-                fig.write_html(file=fout.with_suffix(".html"))
+            # Confusion matrix
+            self.plotter_.plot_confusion_matrix(
+                y_true, y_pred, label_names=report_names, prefix="zygosity"
+            )
-        viz._reset_mpl_style()
+        # ------ Additional metrics ------
+        report_full = self._additional_metrics(
+            y_true, y_pred, labels, report_names, report
+        )
-        self._save_report(report, suffix="zygosity")
+        if self.verbose or self.debug:
+            pm = PrettyMetrics(
+                report_full,
+                precision=2,
+                title=f"{self.model_name} Zygosity Report",
+            )
+            pm.render()
-        # Confusion matrix
-        self.plotter_.plot_confusion_matrix(
-            y_true, y_pred, label_names=report_names, prefix="zygosity"
-        )
+        # Save JSON
+        self._save_report(report_full, suffix="zygosity")
     def _evaluate_iupac10_and_plot(
         self, y_true: np.ndarray, y_pred: np.ndarray
     ) -> None:
         """10-class IUPAC report & confusion matrix.
-        This method generates a classification report and confusion matrix for genotypes encoded using the 10 IUPAC codes (0-9). The IUPAC codes represent various nucleotide combinations, including ambiguous bases.
+        This method generates a classification report and confusion matrix for genotypes encoded as 10-class IUPAC codes (0-9). It computes various performance metrics, logs the classification report, and creates visualizations of the results.
         Args:
-            y_true (np.ndarray): True genotypes (0-9) for masked test cells.
-            y_pred (np.ndarray): Predicted genotypes (0-9) for masked test cells.
+            y_true (np.ndarray): True genotypes (0-9) for masked
+            y_pred (np.ndarray): Predicted genotypes (0-9) for masked
         """
         labels_idx = list(range(10))
-        labels_names = ["A", "C", "G", "T", "W", "R", "M", "K", "Y", "S"]
-        metrics = {
-            "accuracy": accuracy_score(y_true, y_pred),
-            "f1": f1_score(
-                y_true, y_pred, average="weighted", labels=labels_idx, zero_division=0
-            ),
-            "precision": precision_score(
-                y_true, y_pred, average="weighted", labels=labels_idx, zero_division=0
-            ),
-            "recall": recall_score(
-                y_true, y_pred, average="weighted", labels=labels_idx, zero_division=0
-            ),
-        }
-        self.metrics_.update({f"iupac_{k}": v for k, v in metrics.items()})
+        report_names = ["A", "C", "G", "T", "W", "R", "M", "K", "Y", "S"]
-        report = classification_report(
+        # Create an identity matrix and use the targets array as indices
+        y_score = np.eye(len(report_names))[y_pred]
+        report: dict | str = classification_report(
             y_true,
             y_pred,
             labels=labels_idx,
-            target_names=labels_names,
+            target_names=report_names,
             zero_division=0,
             output_dict=True,
         )
@@ -562,30 +544,50 @@ class ImputeRefAllele:
             self.logger.error(msg)
             raise TypeError(msg)
-        report_subset = {}
-        for k, v in report.items():
-            tmp = {}
-            if isinstance(v, dict) and "support" in v:
-                for k2, v2 in v.items():
-                    if k2 != "support":
-                        tmp[k2] = v2
-                if tmp:
-                    report_subset[k] = tmp
-        if report_subset:
+        if self.show_plots:
+            viz = ClassificationReportVisualizer(reset_kwargs=self.plotter_.param_dict)
+            plots = viz.plot_all(
+                report,
+                title_prefix=f"{self.model_name} IUPAC Report",
+                show=self.show_plots,
+                heatmap_classes_only=True,
+            )
+            # Reset the style from Optuna's plotting.
+            plt.rcParams.update(self.plotter_.param_dict)
+            for name, fig in plots.items():
+                fout = self.plots_dir / f"iupac_report_{name}.{self.plot_format}"
+                if hasattr(fig, "savefig") and isinstance(fig, Figure):
+                    fig.savefig(fout, dpi=300, facecolor="#111122")
+                    plt.close(fig)
+                elif isinstance(fig, PlotlyFigure):
+                    fig.write_html(file=fout.with_suffix(".html"))
+            # Reset the style
+            viz._reset_mpl_style()
+            # Confusion matrix
+            self.plotter_.plot_confusion_matrix(
+                y_true, y_pred, label_names=report_names, prefix="iupac"
+            )
+        # ------ Additional metrics ------
+        report_full = self._additional_metrics(
+            y_true, y_pred, labels_idx, report_names, report
+        )
+        if self.verbose or self.debug:
             pm = PrettyMetrics(
-                report_subset,
-                precision=3,
+                report_full,
+                precision=2,
                 title=f"{self.model_name} IUPAC 10-Class Report",
             )
             pm.render()
-        self._save_report(report, suffix="iupac")
-        # Confusion matrix
-        self.plotter_.plot_confusion_matrix(
-            y_true, y_pred, label_names=labels_names, prefix="iupac"
-        )
+        # Save JSON
+        self._save_report(report_full, suffix="iupac")
     def _make_train_test_split(self) -> Tuple[np.ndarray, np.ndarray]:
         """Create train/test split indices.
@@ -623,25 +625,28 @@ class ImputeRefAllele:
         train_idx = np.setdiff1d(all_idx, test_idx, assume_unique=False)
         return train_idx, test_idx
-    def _save_report(self, report_dict: Dict[str, float], suffix: str) -> None:
+    def _save_report(self, report_dict: Dict[str, Any], suffix: str) -> None:
         """Save classification report dictionary as a JSON file.
-        This method saves the provided classification report dictionary to a JSON file in the metrics directory. The filename includes a suffix to distinguish between different types of reports (e.g., 'zygosity' or 'iupac').
+        This method saves the provided classification report dictionary to a JSON file in the metrics directory, appending the specified suffix to the filename.
         Args:
-            report_dict (Dict[str, float]): The classification report dictionary to save.
+            report_dict (Dict[str, Any]): The classification report dictionary to save.
             suffix (str): Suffix to append to the filename (e.g., 'zygosity' or 'iupac').
         Raises:
             NotFittedError: If fit() and transform() have not been called.
         """
         if not self.is_fit_ or self.X_imputed012_ is None:
-            raise NotFittedError("No report to save. Ensure fit() and transform() ran.")
+            msg = "No report to save. Ensure fit() and transform() have been called."
+            raise NotFittedError(msg)
         out_fp = self.metrics_dir / f"classification_report_{suffix}.json"
         with open(out_fp, "w") as f:
             json.dump(report_dict, f, indent=4)
-        self.logger.info(f"{self.model_name} {suffix} report saved to {out_fp}.")
+        msg = f"{self.model_name} {suffix} report saved to {out_fp}."
+        self.logger.info(msg)
     def _create_model_directories(self, prefix: str, outdirs: List[str]) -> None:
         """Creates the directory structure for storing model outputs.
@@ -667,3 +672,305 @@ class ImputeRefAllele:
                 msg = f"Failed to create directory {getattr(self, f'{d}_dir')}: {e}"
                 self.logger.error(msg)
                 raise Exception(msg)
+    def decode_012(
+        self, X: np.ndarray | pd.DataFrame | list[list[int]], is_nuc: bool = False
+    ) -> np.ndarray:
+        """Decode 012-encodings to IUPAC chars with metadata repair.
+        This method converts genotype calls encoded as integers (0, 1, 2, etc.) into their corresponding IUPAC nucleotide codes. It supports two modes of decoding:
+        1. Nucleotide mode (`is_nuc=True`): Decodes integer codes (0-9) directly to IUPAC nucleotide codes.
+        2. Metadata mode (`is_nuc=False`): Uses reference and alternate allele metadata to determine the appropriate IUPAC codes. If metadata is missing or inconsistent, the method attempts to repair the decoding by scanning the source SNP data for valid IUPAC codes.
+        Args:
+            X (np.ndarray | pd.DataFrame | list[list[int]]): Input genotype calls as integers. Can be a NumPy array, Pandas DataFrame, or nested list.
+            is_nuc (bool): If True, decode 0-9 nucleotide codes; else use ref/alt metadata. Defaults to False.
+        Returns:
+            np.ndarray: IUPAC strings as a 2D array of shape (n_samples, n_snps).
+        Notes:
+            - The method normalizes input values to handle various formats, including strings, lists, and arrays.
+            - It uses a predefined mapping of IUPAC codes to nucleotide bases and vice versa.
+            - Missing or invalid codes are represented as 'N' if they can't be resolved.
+            - The method includes repair logic to infer missing metadata from the source SNP data when necessary.
+        Raises:
+            ValueError: If input is not a DataFrame.
+        """
+        df = validate_input_type(X, return_type="df")
+        if not isinstance(df, pd.DataFrame):
+            msg = f"Expected a pandas.DataFrame in 'decode_012', but got: {type(df)}."
+            self.logger.error(msg)
+            raise ValueError(msg)
+        # IUPAC Definitions
+        iupac_to_bases: dict[str, set[str]] = {
+            "A": {"A"},
+            "C": {"C"},
+            "G": {"G"},
+            "T": {"T"},
+            "R": {"A", "G"},
+            "Y": {"C", "T"},
+            "S": {"G", "C"},
+            "W": {"A", "T"},
+            "K": {"G", "T"},
+            "M": {"A", "C"},
+            "B": {"C", "G", "T"},
+            "D": {"A", "G", "T"},
+            "H": {"A", "C", "T"},
+            "V": {"A", "C", "G"},
+            "N": set(),
+        }
+        bases_to_iupac = {
+            frozenset(v): k for k, v in iupac_to_bases.items() if k != "N"
+        }
+        missing_codes = {"", ".", "N", "NONE", "-", "?", "./.", ".|.", "NAN", "nan"}
+        def _normalize_iupac(value: object) -> str | None:
+            """Normalize an input into a single IUPAC code token or None."""
+            if value is None:
+                return None
+            # Bytes -> str (make type narrowing explicit)
+            if isinstance(value, (bytes, np.bytes_)):
+                value = bytes(value).decode("utf-8", errors="ignore")
+            # Handle list/tuple/array/Series: take first valid
+            if isinstance(value, (list, tuple, pd.Series, np.ndarray)):
+                # Convert Series to numpy array for consistent behavior
+                if isinstance(value, pd.Series):
+                    arr = value.to_numpy()
+                else:
+                    arr = value
+                # Scalar numpy array fast path
+                if isinstance(arr, np.ndarray) and arr.ndim == 0:
+                    return _normalize_iupac(arr.item())
+                # Empty sequence/array
+                if len(arr) == 0:
+                    return None
+                # First valid element wins
+                for item in arr:
+                    code = _normalize_iupac(item)
+                    if code is not None:
+                        return code
+                return None
+            s = str(value).upper().strip()
+            if not s or s in missing_codes:
+                return None
+            if "," in s:
+                for tok in (t.strip() for t in s.split(",")):
+                    if tok and tok not in missing_codes and tok in iupac_to_bases:
+                        return tok
+                return None
+            return s if s in iupac_to_bases else None
+        codes_df = df.apply(pd.to_numeric, errors="coerce")
+        codes = codes_df.fillna(-1).astype(np.int8).to_numpy()
+        n_rows, n_cols = codes.shape
+        if is_nuc:
+            iupac_list = np.array(
+                ["A", "C", "G", "T", "W", "R", "M", "K", "Y", "S"], dtype="<U1"
+            )
+            out = np.full((n_rows, n_cols), "N", dtype="<U1")
+            mask = (codes >= 0) & (codes <= 9)
+            out[mask] = iupac_list[codes[mask]]
+            return out
+        # Metadata fetch
+        ref_alleles = getattr(self.genotype_data, "ref", [])
+        alt_alleles = getattr(self.genotype_data, "alt", [])
+        if len(ref_alleles) != n_cols:
+            ref_alleles = getattr(self, "_ref", [None] * n_cols)
+        if len(alt_alleles) != n_cols:
+            alt_alleles = getattr(self, "_alt", [None] * n_cols)
+        # Ensure list length matches
+        if len(ref_alleles) != n_cols:
+            ref_alleles = [None] * n_cols
+        if len(alt_alleles) != n_cols:
+            alt_alleles = [None] * n_cols
+        out = np.full((n_rows, n_cols), "N", dtype="<U1")
+        source_snp_data = None
+        for j in range(n_cols):
+            ref = _normalize_iupac(ref_alleles[j])
+            alt = _normalize_iupac(alt_alleles[j])
+            # --- REPAIR LOGIC ---
+            # If metadata is missing, scan the source column.
+            if ref is None or alt is None:
+                if source_snp_data is None and self.genotype_data.snp_data is not None:
+                    try:
+                        source_snp_data = np.asarray(self.genotype_data.snp_data)
+                    except Exception:
+                        pass  # if lazy loading fails
+                if source_snp_data is not None:
+                    try:
+                        col_data = source_snp_data[:, j]
+                        uniques = set()
+                        # Optimization: check up to 200 non-empty values
+                        count = 0
+                        for val in col_data:
+                            norm = _normalize_iupac(val)
+                            if norm:
+                                uniques.add(norm)
+                                count += 1
+                            if len(uniques) >= 2 or count > 200:
+                                break
+                        sorted_u = sorted(list(uniques))
+                        if len(sorted_u) >= 1 and ref is None:
+                            ref = sorted_u[0]
+                        if len(sorted_u) >= 2 and alt is None:
+                            alt = sorted_u[1]
+                    except Exception:
+                        pass
+            # --- DEFAULTS FOR MISSING ---
+            # If still missing, we cannot decode.
+            if ref is None and alt is None:
+                ref = "N"
+                alt = "N"
+            elif ref is None:
+                ref = alt
+            elif alt is None:
+                alt = ref  # Monomorphic site: ALT becomes REF
+            # --- COMPUTE HET CODE ---
+            if ref == alt:
+                het_code = ref
+            else:
+                ref_set = iupac_to_bases.get(ref, set()) if ref is not None else set()
+                alt_set = iupac_to_bases.get(alt, set()) if alt is not None else set()
+                union_set = frozenset(ref_set | alt_set)
+                het_code = bases_to_iupac.get(union_set, "N")
+            # --- ASSIGNMENT WITH SAFETY FALLBACKS ---
+            col_codes = codes[:, j]
+            # Case 0: REF
+            if ref != "N":
+                out[col_codes == 0, j] = ref
+            # Case 1: HET
+            if het_code != "N":
+                out[col_codes == 1, j] = het_code
+            else:
+                # If HET code is invalid (e.g. ref='A', alt='N'),
+                # fallback to REF
+                # Fix for an issue where a HET prediction at a monomorphic site
+                # produced 'N'
+                if ref != "N":
+                    out[col_codes == 1, j] = ref
+            # Case 2: ALT
+            if alt != "N":
+                out[col_codes == 2, j] = alt
+            else:
+                # If ALT is invalid (e.g. ref='A', alt='N'), fallback to REF
+                # Fix for an issue where an ALT prediction on a monomorphic site
+                # produced 'N'
+                if ref != "N":
+                    out[col_codes == 2, j] = ref
+        return out
+    def _additional_metrics(
+        self,
+        y_true: np.ndarray,
+        y_pred: np.ndarray,
+        labels: list[int],
+        report_names: list[str],
+        report: dict[str, dict[str, float] | float],
+    ) -> dict[str, dict[str, float] | float]:
+        """Compute additional metrics and augment the report dictionary.
+        Args:
+            y_true (np.ndarray): True genotypes.
+            y_pred (np.ndarray): Predicted genotypes.
+            labels (list[int]): List of label indices.
+            report_names (list[str]): List of report names corresponding to labels.
+            report (dict[str, dict[str, float] | float]): Classification report dictionary to augment.
+        Returns:
+            dict[str, dict[str, float] | float]: Augmented report dictionary with additional metrics.
+        """
+        # Create an identity matrix and use the targets array as indices
+        y_score = np.eye(len(report_names))[y_pred]
+        # Per-class metrics
+        ap_pc = average_precision_score(y_true, y_score, average=None)
+        jaccard_pc = jaccard_score(
+            y_true, y_pred, average=None, labels=labels, zero_division=0
+        )
+        # Macro/weighted metrics
+        ap_macro = average_precision_score(y_true, y_score, average="macro")
+        ap_weighted = average_precision_score(y_true, y_score, average="weighted")
+        jaccard_macro = jaccard_score(y_true, y_pred, average="macro", zero_division=0)
+        jaccard_weighted = jaccard_score(
+            y_true, y_pred, average="weighted", zero_division=0
+        )
+        # Matthews correlation coefficient (MCC)
+        mcc = matthews_corrcoef(y_true, y_pred)
+        if not isinstance(ap_pc, np.ndarray):
+            msg = "average_precision_score or f1_score did not return np.ndarray as expected."
+            self.logger.error(msg)
+            raise TypeError(msg)
+        if not isinstance(jaccard_pc, np.ndarray):
+            msg = "jaccard_score did not return np.ndarray as expected."
+            self.logger.error(msg)
+            raise TypeError(msg)
+        # Add per-class metrics
+        report_full = {}
+        dd_subset = {
+            k: v for k, v in report.items() if k in report_names and isinstance(v, dict)
+        }
+        for i, class_name in enumerate(report_names):
+            class_report: dict[str, float] = {}
+            if class_name in dd_subset:
+                class_report = dd_subset[class_name]
+            if isinstance(class_report, float) or not class_report:
+                continue
+            report_full[class_name] = dict(class_report)
+            report_full[class_name]["average-precision"] = float(ap_pc[i])
+            report_full[class_name]["jaccard"] = float(jaccard_pc[i])
+        macro_avg = report.get("macro avg")
+        if isinstance(macro_avg, dict):
+            report_full["macro avg"] = dict(macro_avg)
+            report_full["macro avg"]["average-precision"] = float(ap_macro)
+            report_full["macro avg"]["jaccard"] = float(jaccard_macro)
+        weighted_avg = report.get("weighted avg")
+        if isinstance(weighted_avg, dict):
+            report_full["weighted avg"] = dict(weighted_avg)
+            report_full["weighted avg"]["average-precision"] = float(ap_weighted)
+            report_full["weighted avg"]["jaccard"] = float(jaccard_weighted)
+        # Add scalar summary metrics
+        report_full["mcc"] = float(mcc)
+        accuracy_val = report.get("accuracy")
+        if isinstance(accuracy_val, (int, float)):
+            report_full["accuracy"] = float(accuracy_val)
+        return report_full

pg-sui 1.6.16a3__py3-none-any.whl → 1.7.0__py3-none-any.whl

pg-sui 1.6.16a3__py3-none-any.whl → 1.7.0__py3-none-any.whl