pg-sui 1.6.16a3-py3-none-any.whl → 1.7.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/METADATA +26 -30
  2. {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/RECORD +29 -33
  3. pgsui/__init__.py +0 -8
  4. pgsui/_version.py +2 -2
  5. pgsui/cli.py +577 -125
  6. pgsui/data_processing/config.py +1 -2
  7. pgsui/data_processing/containers.py +203 -530
  8. pgsui/data_processing/transformers.py +44 -20
  9. pgsui/impute/deterministic/imputers/mode.py +475 -182
  10. pgsui/impute/deterministic/imputers/ref_allele.py +454 -147
  11. pgsui/impute/supervised/imputers/hist_gradient_boosting.py +4 -3
  12. pgsui/impute/supervised/imputers/random_forest.py +3 -2
  13. pgsui/impute/unsupervised/base.py +1269 -534
  14. pgsui/impute/unsupervised/callbacks.py +28 -33
  15. pgsui/impute/unsupervised/imputers/autoencoder.py +870 -841
  16. pgsui/impute/unsupervised/imputers/vae.py +931 -787
  17. pgsui/impute/unsupervised/loss_functions.py +156 -202
  18. pgsui/impute/unsupervised/models/autoencoder_model.py +7 -49
  19. pgsui/impute/unsupervised/models/vae_model.py +40 -221
  20. pgsui/impute/unsupervised/nn_scorers.py +53 -13
  21. pgsui/utils/classification_viz.py +240 -97
  22. pgsui/utils/misc.py +201 -3
  23. pgsui/utils/plotting.py +73 -58
  24. pgsui/utils/pretty_metrics.py +2 -6
  25. pgsui/utils/scorers.py +39 -0
  26. pgsui/impute/unsupervised/imputers/nlpca.py +0 -1666
  27. pgsui/impute/unsupervised/imputers/ubp.py +0 -1660
  28. pgsui/impute/unsupervised/models/nlpca_model.py +0 -206
  29. pgsui/impute/unsupervised/models/ubp_model.py +0 -200
  30. {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/WHEEL +0 -0
  31. {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/entry_points.txt +0 -0
  32. {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/licenses/LICENSE +0 -0
  33. {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/top_level.txt +0 -0
@@ -1,23 +1,25 @@
1
+ # -*- coding: utf-8 -*-
2
+ from __future__ import annotations
3
+
1
4
  import copy
2
- from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union
5
+ import traceback
6
+ from collections import defaultdict
7
+ from typing import TYPE_CHECKING, Any, Dict, Literal, Optional, Union, cast
3
8
 
4
9
  import matplotlib.pyplot as plt
5
10
  import numpy as np
6
11
  import optuna
7
12
  import torch
8
- import torch.nn.functional as F
9
13
  from sklearn.exceptions import NotFittedError
10
- from sklearn.model_selection import train_test_split
11
14
  from snpio.analysis.genotype_encoder import GenotypeEncoder
12
15
  from snpio.utils.logging import LoggerManager
13
16
  from torch.optim.lr_scheduler import CosineAnnealingLR
14
17
 
15
18
  from pgsui.data_processing.config import apply_dot_overrides, load_yaml_to_dataclass
16
19
  from pgsui.data_processing.containers import AutoencoderConfig
17
- from pgsui.data_processing.transformers import SimMissingTransformer
18
20
  from pgsui.impute.unsupervised.base import BaseNNImputer
19
21
  from pgsui.impute.unsupervised.callbacks import EarlyStopping
20
- from pgsui.impute.unsupervised.loss_functions import SafeFocalCELoss
22
+ from pgsui.impute.unsupervised.loss_functions import FocalCELoss
21
23
  from pgsui.impute.unsupervised.models.autoencoder_model import AutoencoderModel
22
24
  from pgsui.utils.logging_utils import configure_logger
23
25
  from pgsui.utils.pretty_metrics import PrettyMetrics
@@ -27,30 +29,72 @@ if TYPE_CHECKING:
27
29
  from snpio.read_input.genotype_data import GenotypeData
28
30
 
29
31
 
32
+ def _make_warmup_cosine_scheduler(
33
+ optimizer: torch.optim.Optimizer,
34
+ *,
35
+ max_epochs: int,
36
+ warmup_epochs: int,
37
+ start_factor: float = 0.1,
38
+ ) -> torch.optim.lr_scheduler.CosineAnnealingLR | torch.optim.lr_scheduler.SequentialLR:
39
+ """Create a warmup->cosine LR scheduler.
40
+
41
+ Args:
42
+ optimizer: Optimizer to schedule.
43
+ max_epochs: Total number of epochs.
44
+ warmup_epochs: Number of warmup epochs.
45
+ start_factor: Starting LR factor for warmup.
46
+
47
+ Returns:
48
+ torch.optim.lr_scheduler.CosineAnnealingLR | torch.optim.lr_scheduler.SequentialLR: LR scheduler (SequentialLR if warmup_epochs > 0 else CosineAnnealingLR).
49
+ """
50
+ warmup_epochs = int(max(0, warmup_epochs))
51
+
52
+ if warmup_epochs == 0:
53
+ return CosineAnnealingLR(optimizer, T_max=max_epochs)
54
+
55
+ warmup = torch.optim.lr_scheduler.LinearLR(
56
+ optimizer, start_factor=float(start_factor), total_iters=warmup_epochs
57
+ )
58
+ cosine = CosineAnnealingLR(optimizer, T_max=max(1, max_epochs - warmup_epochs))
59
+
60
+ return torch.optim.lr_scheduler.SequentialLR(
61
+ optimizer, schedulers=[warmup, cosine], milestones=[warmup_epochs]
62
+ )
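
For illustration, a minimal sketch of driving the warmup→cosine helper above once per epoch; the toy model, optimizer, and epoch counts are hypothetical, not taken from pg-sui:

    import torch

    model = torch.nn.Linear(8, 3)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    # 100 total epochs, the first 10 as linear warmup from 0.1 * lr up to lr.
    scheduler = _make_warmup_cosine_scheduler(
        optimizer, max_epochs=100, warmup_epochs=10, start_factor=0.1
    )

    for epoch in range(100):
        optimizer.step()      # placeholder for a real training epoch
        scheduler.step()      # advance the warmup -> cosine schedule once per epoch
        current_lr = scheduler.get_last_lr()[0]
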
63
+
64
+
30
65
  def ensure_autoencoder_config(
31
66
  config: AutoencoderConfig | dict | str | None,
32
67
  ) -> AutoencoderConfig:
33
68
  """Return a concrete AutoencoderConfig from dataclass, dict, YAML path, or None.
34
69
 
35
- This method normalizes the configuration input for the Autoencoder imputer. It accepts a structured configuration in various formats, including a dataclass instance, a nested dictionary, a YAML file path, or None. The method processes the input accordingly and returns a concrete instance of AutoencoderConfig with all necessary fields populated.
70
+ Notes:
71
+ - Supports top-level preset, or io.preset inside dict/YAML.
72
+ - Does not mutate user-provided dict (deep-copies before processing).
73
+ - Flattens nested dicts into dot-keys and applies them as overrides.
36
74
 
37
75
  Args:
38
- config (AutoencoderConfig | dict | str | None): Structured configuration as dataclass, nested dict, YAML path, or None.
76
+ config: AutoencoderConfig instance, dict, YAML path, or None.
39
77
 
40
78
  Returns:
41
- AutoencoderConfig: Concrete configuration instance.
79
+ Concrete AutoencoderConfig.
42
80
  """
43
81
  if config is None:
44
82
  return AutoencoderConfig()
45
83
  if isinstance(config, AutoencoderConfig):
46
84
  return config
47
85
  if isinstance(config, str):
48
- # YAML path — top-level `preset` key is supported
49
86
  return load_yaml_to_dataclass(config, AutoencoderConfig)
50
87
  if isinstance(config, dict):
51
- # Flatten dict into dot-keys then overlay onto a fresh instance
88
+ cfg_in = copy.deepcopy(config)
52
89
  base = AutoencoderConfig()
53
90
 
91
+ preset = cfg_in.pop("preset", None)
92
+ if "io" in cfg_in and isinstance(cfg_in["io"], dict):
93
+ preset = preset or cfg_in["io"].pop("preset", None)
94
+
95
+ if preset:
96
+ base = AutoencoderConfig.from_preset(preset)
97
+
54
98
  def _flatten(prefix: str, d: dict, out: dict) -> dict:
55
99
  for k, v in d.items():
56
100
  kk = f"{prefix}.{k}" if prefix else k
@@ -60,26 +104,24 @@ def ensure_autoencoder_config(
60
104
  out[kk] = v
61
105
  return out
62
106
 
63
- # Lift any present preset first
64
- preset_name = config.pop("preset", None)
65
- if "io" in config and isinstance(config["io"], dict):
66
- preset_name = preset_name or config["io"].pop("preset", None)
67
-
68
- if preset_name:
69
- base = AutoencoderConfig.from_preset(preset_name)
70
-
71
- flat = _flatten("", config, {})
107
+ flat = _flatten("", cfg_in, {})
72
108
  return apply_dot_overrides(base, flat)
73
109
 
74
110
  raise TypeError("config must be an AutoencoderConfig, dict, YAML path, or None.")
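
For illustration, the dict branch above flattens nested keys into dot-paths before applying them as overrides; a self-contained sketch of that idea (standalone names, not the package's own helpers):

    def flatten_to_dot_keys(d: dict, prefix: str = "") -> dict:
        # Flatten {"model": {"latent_dim": 32}} into {"model.latent_dim": 32}.
        out: dict = {}
        for key, value in d.items():
            dotted = f"{prefix}.{key}" if prefix else key
            if isinstance(value, dict):
                out.update(flatten_to_dot_keys(value, dotted))
            else:
                out[dotted] = value
        return out

    flat = flatten_to_dot_keys({"model": {"latent_dim": 32}, "train": {"max_epochs": 200}})
    # flat == {"model.latent_dim": 32, "train.max_epochs": 200}
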
75
111
 
76
112
 
77
113
  class ImputeAutoencoder(BaseNNImputer):
78
- """Impute missing genotypes with a standard Autoencoder on 0/1/2 encodings.
114
+ """Autoencoder imputer for 0/1/2 genotypes.
79
115
 
80
- This imputer uses a feedforward autoencoder architecture to learn compressed and reconstructive representations of genotype data encoded as 0 (homozygous reference), 1 (heterozygous), and 2 (homozygous alternate). Missing genotypes are represented as -1 during training and imputation.
116
+ Trains a feedforward autoencoder on a genotype matrix encoded as 0/1/2 with missing values represented by any negative integer. Missingness is simulated once on the full matrix, then train/val/test splits reuse those masks. It supports haploid and diploid data, focal-CE reconstruction loss (optional scheduling), and Optuna-based hyperparameter tuning. Output is returned as IUPAC strings via ``decode_012``.
81
117
 
82
- The model is trained to minimize a focal cross-entropy loss, which helps to address class imbalance by focusing more on hard-to-classify examples. The architecture includes configurable parameters such as the number of hidden layers, latent dimension size, dropout rate, and activation functions.
118
+ Notes:
119
+ - Simulates missingness once on the full 0/1/2 matrix, then splits indices on clean ground truth.
120
+ - Maintains clean targets and corrupted inputs per train/val/test, plus per-split masks.
121
+ - Haploid harmonization happens after the single simulation (no re-simulation).
122
+ - Training/validation loss is computed only where targets are known (~orig_mask_*).
123
+ - Evaluation is computed only on simulated-missing sites (sim_mask_*).
124
+ - ``transform()`` fills only originally missing sites and hard-errors if decoding yields "N".
83
125
  """
84
126
 
85
127
  def __init__(
@@ -88,8 +130,7 @@ class ImputeAutoencoder(BaseNNImputer):
88
130
  *,
89
131
  tree_parser: Optional["TreeParser"] = None,
90
132
  config: Optional[Union["AutoencoderConfig", dict, str]] = None,
91
- overrides: dict | None = None,
92
- simulate_missing: bool | None = None,
133
+ overrides: Optional[dict] = None,
93
134
  sim_strategy: (
94
135
  Literal[
95
136
  "random",
@@ -100,34 +141,29 @@ class ImputeAutoencoder(BaseNNImputer):
100
141
  ]
101
142
  | None
102
143
  ) = None,
103
- sim_prop: float | None = None,
104
- sim_kwargs: dict | None = None,
144
+ sim_prop: Optional[float] = None,
145
+ sim_kwargs: Optional[dict] = None,
105
146
  ) -> None:
106
147
  """Initialize the Autoencoder imputer with a unified config interface.
107
148
 
108
- This initializer sets up the Autoencoder imputer by processing the provided configuration, initializing logging, and preparing the model and data encoder. It supports configuration input as a dataclass, nested dictionary, YAML file path, or None, with optional dot-key overrides for fine-tuning specific parameters.
109
-
110
149
  Args:
111
- genotype_data ("GenotypeData"): Backing genotype data object.
112
- tree_parser (Optional["TreeParser"]): Optional SNPio phylogenetic tree parser for population-specific modes.
113
- config (Union["AutoencoderConfig", dict, str] | None): Structured configuration as dataclass, nested dict, YAML path, or None.
114
- overrides (dict | None): Optional dot-key overrides with highest precedence (e.g., {'model.latent_dim': 32}).
115
- simulate_missing (bool | None): Whether to simulate missing data during evaluation. If None, uses config default.
116
- sim_strategy (Literal["random", "random_weighted", "random_weighted_inv", "nonrandom", "nonrandom_weighted"] | None): Strategy for simulating missing data. If None, uses config default.
117
- sim_prop (float | None): Proportion of data to simulate as missing. If None, uses config default.
118
- sim_kwargs (dict | None): Additional keyword arguments for simulating missing data. If None, uses config default.
150
+ genotype_data (GenotypeData): Backing genotype data object.
151
+ tree_parser (Optional[TreeParser]): Optional SNPio tree parser for nonrandom simulated-missing modes.
152
+ config (Optional[Union[AutoencoderConfig, dict, str]]): AutoencoderConfig, nested dict, YAML path, or None.
153
+ overrides (Optional[dict]): Optional dot-key overrides with highest precedence.
154
+ sim_strategy (Literal["random", "random_weighted", "random_weighted_inv", "nonrandom", "nonrandom_weighted"] | None): Override sim strategy; if None, uses config default.
155
+ sim_prop (Optional[float]): Override simulated missing proportion; if None, uses config default.
156
+ sim_kwargs (Optional[dict]): Override/extend simulated missing kwargs; if None, uses config default.
119
157
  """
120
158
  self.model_name = "ImputeAutoencoder"
121
159
  self.genotype_data = genotype_data
122
160
  self.tree_parser = tree_parser
123
161
 
124
- # Normalize config then apply highest-precedence overrides
125
162
  cfg = ensure_autoencoder_config(config)
126
163
  if overrides:
127
164
  cfg = apply_dot_overrides(cfg, overrides)
128
165
  self.cfg = cfg
129
166
 
130
- # Logger consistent with NLPCA
131
167
  logman = LoggerManager(
132
168
  __name__,
133
169
  prefix=self.cfg.io.prefix,
@@ -139,8 +175,8 @@ class ImputeAutoencoder(BaseNNImputer):
139
175
  verbose=self.cfg.io.verbose,
140
176
  debug=self.cfg.io.debug,
141
177
  )
178
+ self.logger.propagate = False
142
179
 
143
- # BaseNNImputer bootstrapping (device/dirs/logging handled here)
144
180
  super().__init__(
145
181
  model_name=self.model_name,
146
182
  genotype_data=self.genotype_data,
@@ -151,11 +187,9 @@ class ImputeAutoencoder(BaseNNImputer):
151
187
  )
152
188
 
153
189
  self.Model = AutoencoderModel
154
-
155
- # Model hook & encoder
156
190
  self.pgenc = GenotypeEncoder(genotype_data)
157
191
 
158
- # IO / global
192
+ # I/O and global
159
193
  self.seed = self.cfg.io.seed
160
194
  self.n_jobs = self.cfg.io.n_jobs
161
195
  self.prefix = self.cfg.io.prefix
@@ -163,264 +197,347 @@ class ImputeAutoencoder(BaseNNImputer):
163
197
  self.verbose = self.cfg.io.verbose
164
198
  self.debug = self.cfg.io.debug
165
199
  self.rng = np.random.default_rng(self.seed)
166
- self.pos_weights_: torch.Tensor | None = None
167
200
 
168
- # Simulated-missing controls (config defaults with ctor overrides)
201
+ # Simulation controls (match VAE pattern)
169
202
  sim_cfg = getattr(self.cfg, "sim", None)
170
203
  sim_cfg_kwargs = copy.deepcopy(getattr(sim_cfg, "sim_kwargs", None) or {})
171
204
  if sim_kwargs:
172
205
  sim_cfg_kwargs.update(sim_kwargs)
173
- self.simulate_missing = (
174
- (
175
- sim_cfg.simulate_missing
176
- if simulate_missing is None
177
- else bool(simulate_missing)
178
- )
179
- if sim_cfg is not None
180
- else bool(simulate_missing)
181
- )
206
+
182
207
  if sim_cfg is None:
183
208
  default_strategy = "random"
184
- default_prop = 0.10
209
+ default_prop = 0.2
185
210
  else:
186
211
  default_strategy = sim_cfg.sim_strategy
187
212
  default_prop = sim_cfg.sim_prop
213
+
214
+ self.simulate_missing = True
188
215
  self.sim_strategy = sim_strategy or default_strategy
189
216
  self.sim_prop = float(sim_prop if sim_prop is not None else default_prop)
190
217
  self.sim_kwargs = sim_cfg_kwargs
191
218
 
192
219
  if self.tree_parser is None and self.sim_strategy.startswith("nonrandom"):
193
- msg = "tree_parser is required for nonrandom and nonrandom_weighted simulated missing strategies."
220
+ msg = "tree_parser is required for nonrandom sim strategies."
194
221
  self.logger.error(msg)
195
222
  raise ValueError(msg)
196
223
 
197
- # Model hyperparams
224
+ # Model architecture
198
225
  self.latent_dim = int(self.cfg.model.latent_dim)
199
226
  self.dropout_rate = float(self.cfg.model.dropout_rate)
200
227
  self.num_hidden_layers = int(self.cfg.model.num_hidden_layers)
201
228
  self.layer_scaling_factor = float(self.cfg.model.layer_scaling_factor)
202
- self.layer_schedule: str = str(self.cfg.model.layer_schedule)
203
- self.activation = str(self.cfg.model.hidden_activation)
204
- self.gamma = float(self.cfg.model.gamma)
229
+ self.layer_schedule = str(self.cfg.model.layer_schedule)
230
+ self.activation = str(self.cfg.model.activation)
231
+
232
+ # Training / loss controls (align with VAE fields where present)
233
+ self.power = float(getattr(self.cfg.train, "weights_power", 1.0))
234
+ self.max_ratio = getattr(self.cfg.train, "weights_max_ratio", None)
235
+ self.normalize = bool(getattr(self.cfg.train, "weights_normalize", True))
236
+ self.inverse = bool(getattr(self.cfg.train, "weights_inverse", False))
205
237
 
206
- # Train hyperparams
207
238
  self.batch_size = int(self.cfg.train.batch_size)
208
239
  self.learning_rate = float(self.cfg.train.learning_rate)
209
- self.l1_penalty: float = float(self.cfg.train.l1_penalty)
240
+ self.l1_penalty = float(self.cfg.train.l1_penalty)
210
241
  self.early_stop_gen = int(self.cfg.train.early_stop_gen)
211
242
  self.min_epochs = int(self.cfg.train.min_epochs)
212
243
  self.epochs = int(self.cfg.train.max_epochs)
213
244
  self.validation_split = float(self.cfg.train.validation_split)
214
- self.beta = float(self.cfg.train.weights_beta)
215
- self.max_ratio = float(self.cfg.train.weights_max_ratio)
216
245
 
217
- # Tuning
218
- self.tune = bool(self.cfg.tune.enabled)
219
- self.tune_fast = bool(self.cfg.tune.fast)
220
- self.tune_batch_size = int(self.cfg.tune.batch_size)
221
- self.tune_epochs = int(self.cfg.tune.epochs)
222
- self.tune_eval_interval = int(self.cfg.tune.eval_interval)
223
- self.tune_metric: str = self.cfg.tune.metric
224
-
225
- if self.tune_metric is not None:
226
- self.tune_metric_: (
227
- Literal[
228
- "pr_macro",
229
- "f1",
230
- "accuracy",
231
- "precision",
232
- "recall",
233
- "roc_auc",
234
- "average_precision",
235
- ]
236
- | None
237
- ) = self.cfg.tune.metric
246
+ # Gamma can live in cfg.model or cfg.train depending on your dataclasses
247
+ gamma_raw = getattr(
248
+ self.cfg.train, "gamma", getattr(self.cfg.model, "gamma", 0.0)
249
+ )
250
+ if not isinstance(gamma_raw, (float, int)):
251
+ msg = f"Gamma must be float|int; got {type(gamma_raw)}."
252
+ self.logger.error(msg)
253
+ raise TypeError(msg)
254
+ self.gamma = float(gamma_raw)
255
+ self.gamma_schedule = bool(getattr(self.cfg.train, "gamma_schedule", True))
238
256
 
257
+ # Hyperparameter tuning
258
+ self.tune = bool(self.cfg.tune.enabled)
259
+ self.tune_metric = cast(
260
+ Literal[
261
+ "pr_macro",
262
+ "f1",
263
+ "accuracy",
264
+ "precision",
265
+ "recall",
266
+ "roc_auc",
267
+ "average_precision",
268
+ "mcc",
269
+ "jaccard",
270
+ ],
271
+ self.cfg.tune.metric or "f1",
272
+ )
239
273
  self.n_trials = int(self.cfg.tune.n_trials)
240
274
  self.tune_save_db = bool(self.cfg.tune.save_db)
241
275
  self.tune_resume = bool(self.cfg.tune.resume)
242
- self.tune_max_samples = int(self.cfg.tune.max_samples)
243
- self.tune_max_loci = int(self.cfg.tune.max_loci)
244
- self.tune_infer_epochs = int(
245
- getattr(self.cfg.tune, "infer_epochs", 0)
246
- ) # AE unused
247
276
  self.tune_patience = int(self.cfg.tune.patience)
248
277
 
249
- # Evaluate
250
- # AE does not optimize latents, so these are unused / fixed
251
- self.eval_latent_steps: int = 0
252
- self.eval_latent_lr: float = 0.0
253
- self.eval_latent_weight_decay: float = 0.0
254
-
255
- # Plotting (parity with NLPCA PlotConfig)
256
- self.plot_format: Literal["pdf", "png", "jpg", "jpeg", "svg"] = (
257
- self.cfg.plot.fmt
258
- )
278
+ # Plotting
279
+ self.plot_format = self.cfg.plot.fmt
259
280
  self.plot_dpi = int(self.cfg.plot.dpi)
260
281
  self.plot_fontsize = int(self.cfg.plot.fontsize)
261
282
  self.title_fontsize = int(self.cfg.plot.fontsize)
262
283
  self.despine = bool(self.cfg.plot.despine)
263
284
  self.show_plots = bool(self.cfg.plot.show)
264
285
 
265
- # Core derived at fit-time
266
- self.is_haploid: bool = False
267
- self.num_classes_: int | None = None
286
+ # Fit-time attributes
287
+ self.is_haploid_: bool = False
288
+ self.num_classes_: int = 3
268
289
  self.model_params: Dict[str, Any] = {}
269
- self.sim_mask_global_: np.ndarray | None = None
270
- self.sim_mask_train_: np.ndarray | None = None
271
- self.sim_mask_test_: np.ndarray | None = None
272
290
 
273
- def fit(self) -> "ImputeAutoencoder":
274
- """Fit the autoencoder on 0/1/2 encoded genotypes (missing -> -1).
291
+ self.sim_mask_train_: np.ndarray
292
+ self.sim_mask_val_: np.ndarray
293
+ self.sim_mask_test_: np.ndarray
275
294
 
276
- This method trains the autoencoder model using the provided genotype data. It prepares the data by encoding genotypes as 0, 1, and 2, with missing values represented internally as -1. (When simulated-missing loci are generated via ``SimMissingTransformer`` they are first marked with -9 but are immediately re-encoded as -1 prior to training.) The method splits the data into training and validation sets, initializes the model and training parameters, and performs training with optional hyperparameter tuning. After training, it evaluates the model on the validation set and stores the fitted model and training history.
295
+ self.orig_mask_train_: np.ndarray
296
+ self.orig_mask_val_: np.ndarray
297
+ self.orig_mask_test_: np.ndarray
277
298
 
278
- Returns:
279
- ImputeAutoencoder: Fitted instance.
299
+ def fit(self) -> "ImputeAutoencoder":
300
+ """Fit the Autoencoder imputer model to the genotype data.
301
+
302
+ This method performs the following steps:
303
+ 1. Validates the presence of SNP data in the genotype data.
304
+ 2. Determines ploidy and sets up the number of classes accordingly.
305
+ 3. Cleans the ground truth genotype matrix and simulates missingness.
306
+ 4. Splits the data into training, validation, and test sets.
307
+ 5. Prepares one-hot encoded inputs for the model.
308
+ 6. Initializes plotting utilities and valid-class masks.
309
+ 7. Sets up data loaders for training and validation.
310
+ 8. Performs hyperparameter tuning if enabled, otherwise uses fixed hyperparameters.
311
+ 9. Builds and trains the Autoencoder model.
312
+ 10. Evaluates the trained model on the test set.
313
+ 11. Returns the fitted ImputeAutoencoder instance.
280
314
 
281
- Raises:
282
- NotFittedError: If training fails.
315
+ Returns:
316
+ ImputeAutoencoder: The fitted ImputeAutoencoder instance.
283
317
  """
284
318
  self.logger.info(f"Fitting {self.model_name} model...")
285
319
 
286
- # --- Data prep (mirror NLPCA) ---
287
- X012 = self._get_float_genotypes(copy=True)
288
- GT_full = np.nan_to_num(X012, nan=-1.0, copy=True)
289
- self.ground_truth_ = GT_full.astype(np.int64, copy=False)
290
-
291
- self.sim_mask_global_ = None
292
- cache_key = self._sim_mask_cache_key()
293
- if self.simulate_missing:
294
- cached_mask = (
295
- None if cache_key is None else self._sim_mask_cache.get(cache_key)
296
- )
297
- if cached_mask is not None:
298
- self.sim_mask_global_ = cached_mask.copy()
299
- else:
300
- tr = SimMissingTransformer(
301
- genotype_data=self.genotype_data,
302
- tree_parser=self.tree_parser,
303
- prop_missing=self.sim_prop,
304
- strategy=self.sim_strategy,
305
- missing_val=-9,
306
- mask_missing=True,
307
- verbose=self.verbose,
308
- **self.sim_kwargs,
309
- )
310
- tr.fit(X012.copy())
311
- self.sim_mask_global_ = tr.sim_missing_mask_.astype(bool)
312
- if cache_key is not None:
313
- self._sim_mask_cache[cache_key] = self.sim_mask_global_.copy()
314
-
315
- X_for_model = self.ground_truth_.copy()
316
- X_for_model[self.sim_mask_global_] = -1
317
- else:
318
- X_for_model = self.ground_truth_.copy()
319
-
320
320
  if self.genotype_data.snp_data is None:
321
- msg = "SNP data is required for Autoencoder imputer."
321
+ msg = f"SNP data is required for {self.model_name}."
322
322
  self.logger.error(msg)
323
- raise TypeError(msg)
323
+ raise AttributeError(msg)
324
324
 
325
- # Ploidy & classes
326
- self.is_haploid = bool(
327
- np.all(
328
- np.isin(
329
- self.genotype_data.snp_data,
330
- ["A", "C", "G", "T", "N", "-", ".", "?"],
331
- )
325
+ self.ploidy = self.cfg.io.ploidy
326
+ self.is_haploid_ = self.ploidy == 1
327
+
328
+ if self.ploidy > 2:
329
+ msg = (
330
+ f"{self.model_name} currently supports only haploid (1) or diploid (2) "
331
+ f"data; got ploidy={self.ploidy}."
332
332
  )
333
- )
334
- self.ploidy = 1 if self.is_haploid else 2
335
- # Scoring still uses 3 labels for diploid (REF/HET/ALT); model head uses 2 logits
336
- self.num_classes_ = 2 if self.is_haploid else 3
337
- self.output_classes_ = 2
338
- self.logger.info(
339
- f"Data is {'haploid' if self.is_haploid else 'diploid'}; "
340
- f"using {self.num_classes_} classes for scoring and {self.output_classes_} output channels."
341
- )
333
+ self.logger.error(msg)
334
+ raise ValueError(msg)
342
335
 
343
- if self.is_haploid:
344
- self.ground_truth_[self.ground_truth_ == 2] = 1
345
- X_for_model[X_for_model == 2] = 1
336
+ self.num_classes_ = 2 if self.is_haploid_ else 3
346
337
 
347
- n_samples, self.num_features_ = X_for_model.shape
338
+ # Clean 0/1/2 ground truth (missing=-1)
339
+ gt_full = self.pgenc.genotypes_012.copy()
340
+ gt_full[gt_full < 0] = -1
341
+ gt_full = np.nan_to_num(gt_full, nan=-1.0)
342
+ self.ground_truth_ = gt_full.astype(np.int8)
343
+ self.num_features_ = int(self.ground_truth_.shape[1])
348
344
 
349
- # Model params (decoder outputs L * K logits)
350
345
  self.model_params = {
351
346
  "n_features": self.num_features_,
352
- "num_classes": self.output_classes_,
347
+ "num_classes": self.num_classes_,
353
348
  "latent_dim": self.latent_dim,
354
349
  "dropout_rate": self.dropout_rate,
355
350
  "activation": self.activation,
356
351
  }
357
352
 
358
- # Train/Val split
359
- indices = np.arange(n_samples)
360
- train_idx, val_idx = train_test_split(
361
- indices, test_size=self.validation_split, random_state=self.seed
353
+ # Simulate missingness ONCE on the full matrix
354
+ X_for_model_full, self.sim_mask_, self.orig_mask_ = self.sim_missing_transform(
355
+ self.ground_truth_
362
356
  )
363
- self.train_idx_, self.test_idx_ = train_idx, val_idx
364
- self.X_train_ = X_for_model[train_idx]
365
- self.X_val_ = X_for_model[val_idx]
366
- self.GT_train_full_ = self.ground_truth_[train_idx]
367
- self.GT_test_full_ = self.ground_truth_[val_idx]
368
-
369
- if self.sim_mask_global_ is not None:
370
- self.sim_mask_train_ = self.sim_mask_global_[train_idx]
371
- self.sim_mask_test_ = self.sim_mask_global_[val_idx]
372
- else:
373
- self.sim_mask_train_ = None
374
- self.sim_mask_test_ = None
375
357
 
376
- # Pos weights for diploid multilabel path (must exist before tuning)
377
- if not self.is_haploid:
378
- self.pos_weights_ = self._compute_pos_weights(self.X_train_)
379
- else:
380
- self.pos_weights_ = None
358
+ # Split indices based on clean ground truth
359
+ self.train_idx_, self.val_idx_, self.test_idx_ = self._train_val_test_split(
360
+ self.ground_truth_
361
+ )
362
+
363
+ # --- Clean targets per split ---
364
+ X_train_clean = self.ground_truth_[self.train_idx_].copy()
365
+ X_val_clean = self.ground_truth_[self.val_idx_].copy()
366
+ X_test_clean = self.ground_truth_[self.test_idx_].copy()
367
+
368
+ # --- Corrupted inputs per split (from the single simulation) ---
369
+ X_train_corrupted = X_for_model_full[self.train_idx_].copy()
370
+ X_val_corrupted = X_for_model_full[self.val_idx_].copy()
371
+ X_test_corrupted = X_for_model_full[self.test_idx_].copy()
372
+
373
+ # --- Masks per split ---
374
+ self.sim_mask_train_ = self.sim_mask_[self.train_idx_].copy()
375
+ self.sim_mask_val_ = self.sim_mask_[self.val_idx_].copy()
376
+ self.sim_mask_test_ = self.sim_mask_[self.test_idx_].copy()
377
+
378
+ self.orig_mask_train_ = self.orig_mask_[self.train_idx_].copy()
379
+ self.orig_mask_val_ = self.orig_mask_[self.val_idx_].copy()
380
+ self.orig_mask_test_ = self.orig_mask_[self.test_idx_].copy()
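
For illustration, a small NumPy sketch of the mask conventions used here (shapes and rates are made up): sim_mask marks cells hidden for evaluation, orig_mask marks genuinely missing cells, the loss only sees cells with a known target, and metrics only see simulated-missing cells whose truth is known.

    import numpy as np

    rng = np.random.default_rng(0)
    orig_mask = rng.random((4, 6)) < 0.15                 # originally missing genotypes
    sim_mask = (rng.random((4, 6)) < 0.20) & ~orig_mask   # cells hidden only for evaluation

    loss_mask = ~orig_mask             # train/val loss: every cell with a known target
    eval_mask = sim_mask & ~orig_mask  # metrics: simulated-missing cells with known truth

    ground_truth = rng.integers(0, 3, size=(4, 6)).astype(np.int8)
    corrupted = ground_truth.copy()
    corrupted[orig_mask | sim_mask] = -1                  # the model never sees these cells
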
381
+
382
+ # Persist per-split matrices
383
+ self.X_train_clean_ = X_train_clean
384
+ self.X_val_clean_ = X_val_clean
385
+ self.X_test_clean_ = X_test_clean
386
+
387
+ self.X_train_corrupted_ = X_train_corrupted
388
+ self.X_val_corrupted_ = X_val_corrupted
389
+ self.X_test_corrupted_ = X_test_corrupted
390
+
391
+ # Haploid harmonization (do NOT resimulate; just recode values)
392
+ if self.is_haploid_:
393
+
394
+ def _haploidize(arr: np.ndarray) -> np.ndarray:
395
+ out = arr.copy()
396
+ miss = out < 0
397
+ out = np.where(out > 0, 1, out).astype(np.int8, copy=False)
398
+ out[miss] = -1
399
+ return out
400
+
401
+ self.X_train_clean_ = _haploidize(self.X_train_clean_)
402
+ self.X_val_clean_ = _haploidize(self.X_val_clean_)
403
+ self.X_test_clean_ = _haploidize(self.X_test_clean_)
404
+
405
+ self.X_train_corrupted_ = _haploidize(self.X_train_corrupted_)
406
+ self.X_val_corrupted_ = _haploidize(self.X_val_corrupted_)
407
+ self.X_test_corrupted_ = _haploidize(self.X_test_corrupted_)
408
+
409
+ # Convention: X_* are corrupted inputs; y_* are clean targets
410
+ self.X_train_ = self.X_train_corrupted_
411
+ self.y_train_ = self.X_train_clean_
412
+
413
+ self.X_val_ = self.X_val_corrupted_
414
+ self.y_val_ = self.X_val_clean_
415
+
416
+ self.X_test_ = self.X_test_corrupted_
417
+ self.y_test_ = self.X_test_clean_
418
+
419
+ # One-hot for loaders/model input
420
+ X_train_ohe = self._one_hot_encode_012(
421
+ self.X_train_, num_classes=self.num_classes_
422
+ )
423
+ X_val_ohe = self._one_hot_encode_012(self.X_val_, num_classes=self.num_classes_)
381
424
 
382
- # Plotters/scorers (shared utilities)
425
+ # Plotters/scorers + valid-class mask repairs (copied from VAE flow)
383
426
  self.plotter_, self.scorers_ = self.initialize_plotting_and_scorers()
427
+ self.valid_class_mask_ = self._build_valid_class_mask()
428
+
429
+ loci = getattr(self, "valid_class_mask_conflict_loci_", None)
430
+ if loci is not None and loci.size:
431
+ self._repair_ref_alt_from_iupac(loci)
432
+ self.valid_class_mask_ = self._build_valid_class_mask()
433
+
434
+ train_loader = self._get_data_loaders(
435
+ X_train_ohe.detach().cpu().numpy(),
436
+ self.y_train_,
437
+ ~self.orig_mask_train_,
438
+ self.batch_size,
439
+ shuffle=True,
440
+ )
441
+ val_loader = self._get_data_loaders(
442
+ X_val_ohe.detach().cpu().numpy(),
443
+ self.y_val_,
444
+ ~self.orig_mask_val_,
445
+ self.batch_size,
446
+ shuffle=False,
447
+ )
448
+
449
+ self.train_loader_ = train_loader
450
+ self.val_loader_ = val_loader
384
451
 
385
- # Tuning (optional; AE never needs latent refinement)
452
+ # Hyperparameter tuning or fixed run
386
453
  if self.tune:
387
- self.tune_hyperparameters()
454
+ self.tuned_params_ = self.tune_hyperparameters()
455
+ self.model_tuned_ = True
456
+ else:
457
+ self.model_tuned_ = False
458
+ self.class_weights_ = self._class_weights_from_zygosity(
459
+ self.y_train_,
460
+ train_mask=self.sim_mask_train_ & ~self.orig_mask_train_,
461
+ inverse=self.inverse,
462
+ normalize=self.normalize,
463
+ max_ratio=self.max_ratio,
464
+ power=self.power,
465
+ )
466
+ self.tuned_params_ = {
467
+ "latent_dim": self.latent_dim,
468
+ "learning_rate": self.learning_rate,
469
+ "dropout_rate": self.dropout_rate,
470
+ "num_hidden_layers": self.num_hidden_layers,
471
+ "activation": self.activation,
472
+ "l1_penalty": self.l1_penalty,
473
+ "layer_scaling_factor": self.layer_scaling_factor,
474
+ "layer_schedule": self.layer_schedule,
475
+ "gamma": self.gamma,
476
+ "gamma_schedule": self.gamma_schedule,
477
+ "inverse": self.inverse,
478
+ "normalize": self.normalize,
479
+ "power": self.power,
480
+ }
481
+ self.tuned_params_["model_params"] = self.model_params
388
482
 
389
- # Best params (tuned or default)
390
- self.best_params_ = getattr(self, "best_params_", self._default_best_params())
483
+ if getattr(self, "class_weights_", None) is not None:
484
+ self.logger.info(
485
+ f"class_weights={self.class_weights_.detach().cpu().numpy().tolist()}"
486
+ )
391
487
 
392
- # Class weights (device-aware)
393
- self.class_weights_ = self._normalize_class_weights(
394
- self._class_weights_from_zygosity(self.X_train_)
395
- )
488
+ # Always start clean
489
+ self.best_params_ = copy.deepcopy(self.tuned_params_)
396
490
 
397
- # DataLoader
398
- train_loader = self._get_data_loaders(self.X_train_)
491
+ # Final model params (compute hidden sizes using n_inputs=L*K, mirroring VAE)
492
+ input_dim = int(self.num_features_ * self.num_classes_)
493
+ model_params_final = {
494
+ "n_features": int(self.num_features_),
495
+ "num_classes": int(self.num_classes_),
496
+ "latent_dim": int(self.best_params_["latent_dim"]),
497
+ "dropout_rate": float(self.best_params_["dropout_rate"]),
498
+ "activation": str(self.best_params_["activation"]),
499
+ }
500
+ model_params_final["hidden_layer_sizes"] = self._compute_hidden_layer_sizes(
501
+ n_inputs=input_dim,
502
+ n_outputs=int(self.num_classes_),
503
+ n_samples=len(self.train_idx_),
504
+ n_hidden=int(self.best_params_["num_hidden_layers"]),
505
+ latent_dim=int(self.best_params_["latent_dim"]),
506
+ alpha=float(self.best_params_["layer_scaling_factor"]),
507
+ schedule=str(self.best_params_["layer_schedule"]),
508
+ min_size=max(16, 2 * int(self.best_params_["latent_dim"])),
509
+ )
510
+ self.best_params_["model_params"] = model_params_final
399
511
 
400
- # Build & train
401
- model = self.build_model(self.Model, self.best_params_)
512
+ # Build and train
513
+ model = self.build_model(self.Model, self.best_params_["model_params"])
402
514
  model.apply(self.initialize_weights)
403
515
 
516
+ if self.verbose or self.debug:
517
+ self.logger.info("Using model hyperparameters:")
518
+ pm = PrettyMetrics(
519
+ self.best_params_, precision=3, title="Model Hyperparameters"
520
+ )
521
+ pm.render()
522
+
523
+ lr_final = float(self.best_params_["learning_rate"])
524
+ l1_final = float(self.best_params_["l1_penalty"])
525
+ gamma_schedule = bool(
526
+ self.best_params_.get("gamma_schedule", self.gamma_schedule)
527
+ )
528
+
404
529
  loss, trained_model, history = self._train_and_validate_model(
405
530
  model=model,
406
- loader=train_loader,
407
- lr=self.learning_rate,
408
- l1_penalty=self.l1_penalty,
409
- return_history=True,
410
- class_weights=self.class_weights_,
411
- X_val=self.X_val_,
531
+ lr=lr_final,
532
+ l1_penalty=l1_final,
412
533
  params=self.best_params_,
413
- prune_metric=self.tune_metric,
414
- prune_warmup_epochs=10,
415
- eval_interval=1,
416
- eval_requires_latents=False,
417
- eval_latent_steps=0,
418
- eval_latent_lr=0.0,
419
- eval_latent_weight_decay=0.0,
534
+ trial=None,
535
+ class_weights=getattr(self, "class_weights_", None),
536
+ gamma_schedule=gamma_schedule,
420
537
  )
421
538
 
422
539
  if trained_model is None:
423
- msg = "Autoencoder training failed; no model was returned."
540
+ msg = f"{self.model_name} training failed."
424
541
  self.logger.error(msg)
425
542
  raise RuntimeError(msg)
426
543
 
@@ -429,217 +546,194 @@ class ImputeAutoencoder(BaseNNImputer):
429
546
  self.models_dir / f"final_model_{self.model_name}.pt",
430
547
  )
431
548
 
432
- hist: Dict[str, List[float] | Dict[str, List[float]] | None] | None = {
433
- "Train": history
434
- }
435
- self.best_loss_, self.model_, self.history_ = (loss, trained_model, hist)
549
+ if history is None:
550
+ hist = {"Train": []}
551
+ elif isinstance(history, dict):
552
+ hist = dict(history)
553
+ else:
554
+ hist = {"Train": list(history["Train"]), "Val": list(history["Val"])}
555
+
556
+ self.best_loss_ = float(loss)
557
+ self.model_ = trained_model
558
+ self.history_ = hist
436
559
  self.is_fit_ = True
437
560
 
438
- # Evaluate on validation set (parity with NLPCA reporting)
439
- eval_mask = (
440
- self.sim_mask_test_
441
- if (self.simulate_missing and self.sim_mask_test_ is not None)
442
- else None
443
- )
561
+ # Evaluate on simulated-missing sites only
444
562
  self._evaluate_model(
445
- self.X_val_, self.model_, self.best_params_, eval_mask_override=eval_mask
563
+ self.model_,
564
+ X=self.X_test_,
565
+ y=self.y_test_,
566
+ eval_mask=self.sim_mask_test_ & ~self.orig_mask_test_,
567
+ objective_mode=False,
446
568
  )
447
- self.plotter_.plot_history(self.history_)
569
+
570
+ if self.show_plots:
571
+ self.plotter_.plot_history(self.history_)
572
+
448
573
  self._save_best_params(self.best_params_)
449
574
 
575
+ if self.model_tuned_:
576
+ title = f"{self.model_name} Optimized Parameters"
577
+
578
+ if self.verbose or self.debug:
579
+ pm = PrettyMetrics(self.best_params_, precision=2, title=title)
580
+ pm.render()
581
+
582
+ # Save best parameters to a JSON file.
583
+ self._save_best_params(self.best_params_, objective_mode=True)
584
+
450
585
  return self
451
586
 
452
587
  def transform(self) -> np.ndarray:
453
- """Impute missing genotypes (0/1/2) and return IUPAC strings.
588
+ """Impute missing genotypes and return IUPAC strings.
454
589
 
455
- This method imputes missing genotypes in the dataset using the trained autoencoder model. It predicts the most likely genotype (0, 1, or 2) for each missing entry and fills in these values. The imputed genotypes are then decoded back to IUPAC string format for easier interpretation.
590
+ This method performs the following steps:
591
+ 1. Validates that the model has been fitted.
592
+ 2. Uses the trained model to predict missing genotypes for the entire dataset.
593
+ 3. Fills in the missing genotypes in the original dataset with the predicted values from the model.
594
+ 4. Decodes the imputed genotype matrix from 0/1/2 encoding to IUPAC strings.
595
+ 5. Checks for any remaining missing values or decoding issues, raising errors if found.
596
+ 6. Optionally generates and displays plots comparing the original and imputed genotype distributions.
597
+ 7. Returns the imputed IUPAC genotype matrix.
456
598
 
457
599
  Returns:
458
- np.ndarray: IUPAC strings of shape (n_samples, n_loci).
600
+ np.ndarray: IUPAC genotype matrix of shape (n_samples, n_loci).
459
601
 
460
602
  Raises:
461
603
  NotFittedError: If called before fit().
604
+ RuntimeError: If any missing values remain or decoding yields "N".
605
+ RuntimeError: If loci contain 'N' after imputation due to missing REF/ALT metadata.
462
606
  """
463
607
  if not getattr(self, "is_fit_", False):
464
- raise NotFittedError("Model is not fitted. Call fit() before transform().")
608
+ msg = "Model is not fitted. Call fit() before transform()."
609
+ self.logger.error(msg)
610
+ raise NotFittedError(msg)
465
611
 
466
- self.logger.info(f"Imputing entire dataset with {self.model_name}...")
612
+ self.logger.info(f"Imputing entire dataset with {self.model_name} model...")
467
613
  X_to_impute = self.ground_truth_.copy()
468
614
 
469
- # Predict with masked inputs (no latent optimization)
470
- pred_labels, _ = self._predict(self.model_, X=X_to_impute, return_proba=True)
615
+ pred_labels, _ = self._predict(self.model_, X=X_to_impute)
471
616
 
472
- # Fill only missing
473
- missing_mask = X_to_impute == -1
617
+ missing_mask = X_to_impute < 0
474
618
  imputed_array = X_to_impute.copy()
475
619
  imputed_array[missing_mask] = pred_labels[missing_mask]
476
620
 
477
- # Decode to IUPAC & optionally plot
478
- imputed_genotypes = self.pgenc.decode_012(imputed_array)
621
+ if np.any(imputed_array < 0):
622
+ msg = f"[{self.model_name}] Some missing genotypes remain after imputation. This is unexpected."
623
+ self.logger.error(msg)
624
+ raise RuntimeError(msg)
625
+
626
+ decode_input = imputed_array
627
+ if self.is_haploid_:
628
+ decode_input = imputed_array.copy()
629
+ decode_input[decode_input == 1] = 2
630
+
631
+ imputed_genotypes = self.decode_012(decode_input)
632
+
633
+ bad_loci = np.where((imputed_genotypes == "N").any(axis=0))[0]
634
+ if bad_loci.size > 0:
635
+ msg = f"[{self.model_name}] {bad_loci.size} loci contain 'N' after imputation (e.g., first 10 indices: {bad_loci[:10].tolist()}). This occurs when REF/ALT metadata is missing and cannot be inferred from the source data (e.g., loci with 100 percent missing genotypes). Try filtering out these loci before imputation."
636
+ self.logger.error(msg)
637
+ self.logger.debug(
638
+ "All loci with 'N': " + ", ".join(map(str, bad_loci.tolist()))
639
+ )
640
+ raise RuntimeError(msg)
641
+
479
642
  if self.show_plots:
480
- original_genotypes = self.pgenc.decode_012(X_to_impute)
643
+ original_input = X_to_impute
644
+ if self.is_haploid_:
645
+ original_input = X_to_impute.copy()
646
+ original_input[original_input == 1] = 2
647
+
648
+ original_genotypes = self.decode_012(original_input)
649
+
481
650
  plt.rcParams.update(self.plotter_.param_dict)
482
651
  self.plotter_.plot_gt_distribution(original_genotypes, is_imputed=False)
483
652
  self.plotter_.plot_gt_distribution(imputed_genotypes, is_imputed=True)
484
653
 
485
654
  return imputed_genotypes
486
655
 
487
- def _get_data_loaders(self, y: np.ndarray) -> torch.utils.data.DataLoader:
488
- """Create DataLoader over indices + integer targets (-1 for missing).
489
-
490
- This method creates a PyTorch DataLoader that yields batches of indices and their corresponding genotype targets encoded as integers (0, 1, 2) with -1 indicating missing values. The DataLoader is shuffled to ensure random sampling during training.
491
-
492
- Args:
493
- y (np.ndarray): 0/1/2 matrix with -1 for missing.
494
-
495
- Returns:
496
- torch.utils.data.DataLoader: Shuffled DataLoader.
497
- """
498
- y_tensor = torch.from_numpy(y).long()
499
- indices = torch.arange(len(y), dtype=torch.long)
500
- dataset = torch.utils.data.TensorDataset(indices, y_tensor)
501
- pin_memory = self.device.type == "cuda"
502
- return torch.utils.data.DataLoader(
503
- dataset,
504
- batch_size=self.batch_size,
505
- shuffle=True,
506
- pin_memory=pin_memory,
507
- )
508
-
509
656
  def _train_and_validate_model(
510
657
  self,
511
658
  model: torch.nn.Module,
512
- loader: torch.utils.data.DataLoader,
659
+ *,
513
660
  lr: float,
514
661
  l1_penalty: float,
515
- trial: optuna.Trial | None = None,
516
- return_history: bool = False,
517
- class_weights: torch.Tensor | None = None,
518
- *,
519
- X_val: np.ndarray | None = None,
520
- params: dict | None = None,
521
- prune_metric: str = "f1", # "f1" | "accuracy" | "pr_macro"
522
- prune_warmup_epochs: int = 10,
523
- eval_interval: int = 1,
524
- # Evaluation parameters (AE ignores latent refinement knobs)
525
- eval_requires_latents: bool = False, # AE: always False
526
- eval_latent_steps: int = 0,
527
- eval_latent_lr: float = 0.0,
528
- eval_latent_weight_decay: float = 0.0,
529
- ) -> Tuple[float, torch.nn.Module | None, list | None]:
530
- """Wrap the AE training loop (no latent optimizer), with Optuna pruning.
531
-
532
- This method orchestrates the training of the autoencoder model using the provided DataLoader. It sets up the optimizer and learning rate scheduler, and executes the training loop with support for early stopping and Optuna pruning based on validation performance. The method returns the best validation loss, the best model state, and optionally the training history.
662
+ trial: Optional[optuna.Trial] = None,
663
+ params: Optional[dict[str, Any]] = None,
664
+ class_weights: Optional[torch.Tensor] = None,
665
+ gamma_schedule: bool = False,
666
+ ) -> tuple[float, torch.nn.Module, dict[str, list[float]]]:
667
+ """Train and validate the model.
668
+
669
+ This method sets up the optimizer and learning rate scheduler, then executes the training loop with early stopping and optional hyperparameter tuning via Optuna. It returns the best validation loss, the best model, and the training history.
533
670
 
534
671
  Args:
535
672
  model (torch.nn.Module): Autoencoder model.
536
- loader (torch.utils.data.DataLoader): Batches (indices, y_int) where y_int is 0/1/2; -1 for missing.
537
673
  lr (float): Learning rate.
538
- l1_penalty (float): L1 regularization coeff.
539
- trial (optuna.Trial | None): Optuna trial for pruning (optional).
540
- return_history (bool): If True, return train loss history.
541
- class_weights (torch.Tensor | None): Class weights tensor (on device).
542
- X_val (np.ndarray | None): Validation matrix (0/1/2 with -1 for missing).
543
- params (dict | None): Model params for evaluation.
544
- prune_metric (str): Metric for pruning reports.
545
- prune_warmup_epochs (int): Pruning warmup epochs.
546
- eval_interval (int): Eval frequency (epochs).
547
- eval_requires_latents (bool): Ignored for AE (no latent inference).
548
- eval_latent_steps (int): Unused for AE.
549
- eval_latent_lr (float): Unused for AE.
550
- eval_latent_weight_decay (float): Unused for AE.
674
+ l1_penalty (float): L1 regularization coefficient.
675
+ trial (Optional[optuna.Trial]): Optuna trial (optional).
676
+ params (Optional[dict[str, Any]]): Hyperparams dict (optional).
677
+ class_weights (Optional[torch.Tensor]): Class weights for focal CE (optional).
678
+ gamma_schedule (bool): Whether to schedule gamma.
551
679
 
552
680
  Returns:
553
- Tuple[float, torch.nn.Module | None, list | None]: (best_loss, best_model, history or None).
681
+ tuple[float, torch.nn.Module, dict[str, list[float]]]: Best validation loss, best model, history.
554
682
  """
555
- if class_weights is None:
556
- msg = "Must provide class_weights."
557
- self.logger.error(msg)
558
- raise TypeError(msg)
683
+ max_epochs = int(self.epochs)
684
+ optimizer = torch.optim.Adam(model.parameters(), lr=lr)
559
685
 
560
- # Epoch budget mirrors NLPCA config (tuning vs final)
561
- max_epochs = (
562
- self.tune_epochs if (trial is not None and self.tune_fast) else self.epochs
686
+ scheduler = _make_warmup_cosine_scheduler(
687
+ optimizer, max_epochs=max_epochs, warmup_epochs=int(0.1 * max_epochs)
563
688
  )
564
689
 
565
- optimizer = torch.optim.Adam(model.parameters(), lr=lr)
566
- scheduler = CosineAnnealingLR(optimizer, T_max=max_epochs)
567
-
568
690
  best_loss, best_model, hist = self._execute_training_loop(
569
- loader=loader,
570
691
  optimizer=optimizer,
571
692
  scheduler=scheduler,
572
693
  model=model,
573
694
  l1_penalty=l1_penalty,
574
695
  trial=trial,
575
- return_history=return_history,
576
- class_weights=class_weights,
577
- X_val=X_val,
578
696
  params=params,
579
- prune_metric=prune_metric,
580
- prune_warmup_epochs=prune_warmup_epochs,
581
- eval_interval=eval_interval,
582
- eval_requires_latents=False, # AE: no latent inference
583
- eval_latent_steps=0,
584
- eval_latent_lr=0.0,
585
- eval_latent_weight_decay=0.0,
697
+ class_weights=class_weights,
698
+ gamma_schedule=gamma_schedule,
586
699
  )
587
- if return_history:
588
- return best_loss, best_model, hist
589
-
590
- return best_loss, best_model, None
700
+ return best_loss, best_model, hist
591
701
 
592
702
  def _execute_training_loop(
593
703
  self,
594
- loader: torch.utils.data.DataLoader,
704
+ *,
595
705
  optimizer: torch.optim.Optimizer,
596
- scheduler: CosineAnnealingLR,
706
+ scheduler: (
707
+ torch.optim.lr_scheduler.CosineAnnealingLR
708
+ | torch.optim.lr_scheduler.SequentialLR
709
+ ),
597
710
  model: torch.nn.Module,
598
711
  l1_penalty: float,
599
- trial: optuna.Trial | None,
600
- return_history: bool,
601
- class_weights: torch.Tensor,
602
- *,
603
- X_val: np.ndarray | None = None,
604
- params: dict | None = None,
605
- prune_metric: str = "f1",
606
- prune_warmup_epochs: int = 10,
607
- eval_interval: int = 1,
608
- # Evaluation parameters (AE ignores latent refinement knobs)
609
- eval_requires_latents: bool = False, # AE: False
610
- eval_latent_steps: int = 0,
611
- eval_latent_lr: float = 0.0,
612
- eval_latent_weight_decay: float = 0.0,
613
- ) -> Tuple[float, torch.nn.Module, list]:
614
- """Train AE with focal CE (gamma warm/ramp) + early stopping & pruning.
615
-
616
- This method executes the training loop for the autoencoder model, performing one epoch at a time. It computes the focal cross-entropy loss while ignoring masked (missing) values and applies L1 regularization if specified. The method incorporates early stopping based on validation performance and supports Optuna pruning to terminate unpromising trials early. It returns the best validation loss, the best model state, and optionally the training history.
712
+ trial: Optional[optuna.Trial] = None,
713
+ params: Optional[dict[str, Any]] = None,
714
+ class_weights: Optional[torch.Tensor] = None,
715
+ gamma_schedule: bool = False,
716
+ ) -> tuple[float, torch.nn.Module, dict[str, list[float]]]:
717
+ """Train AE (masked focal CE) with EarlyStopping + Optuna pruning.
617
718
 
618
719
  Args:
619
- loader (torch.utils.data.DataLoader): Batches (indices, y_int) where y_int is 0/1/2; -1 for missing.
620
- optimizer (torch.optim.Optimizer): Optimizer.
621
- scheduler (torch.optim.lr_scheduler._LRScheduler): LR scheduler.
720
+ optimizer (torch.optim.Optimizer): Optimizer for training.
721
+ scheduler (torch.optim.lr_scheduler.CosineAnnealingLR | torch.optim.lr_scheduler.SequentialLR): LR scheduler.
622
722
  model (torch.nn.Module): Autoencoder model.
623
- l1_penalty (float): L1 regularization coeff.
624
- trial (optuna.Trial | None): Optuna trial for pruning (optional).
625
- return_history (bool): If True, return train loss history.
626
- class_weights (torch.Tensor): Class weights tensor (on device).
627
- X_val (np.ndarray | None): Validation matrix (0/1/2 with -1 for missing).
628
- params (dict | None): Model params for evaluation.
629
- prune_metric (str): Metric for pruning reports.
630
- prune_warmup_epochs (int): Pruning warmup epochs.
631
- eval_interval (int): Eval frequency (epochs).
632
- eval_requires_latents (bool): Ignored for AE (no latent inference).
633
- eval_latent_steps (int): Unused for AE.
634
- eval_latent_lr (float): Unused for AE.
635
- eval_latent_weight_decay (float): Unused for AE.
723
+ l1_penalty (float): L1 regularization coefficient.
724
+ trial (Optional[optuna.Trial]): Optuna trial (optional).
725
+ params (Optional[dict[str, Any]]): Hyperparams dict (optional).
726
+ class_weights (Optional[torch.Tensor]): Class weights for focal CE (optional).
727
+ gamma_schedule (bool): Whether to schedule gamma.
636
728
 
637
729
  Returns:
638
- Tuple[float, torch.nn.Module, list]: Best validation loss, best model, and training history.
730
+ tuple[float, torch.nn.Module, dict[str, list[float]]]: Best loss, best model, and training history.
731
+
732
+ Notes:
733
+ - Computes loss only where targets are known (~orig_mask_*).
734
+ - Evaluates metrics only on simulated-missing sites (sim_mask_*).
639
735
  """
640
- best_loss = float("inf")
641
- best_model = None
642
- history: list[float] = []
736
+ history: dict[str, list[float]] = defaultdict(list)
643
737
 
644
738
  early_stopping = EarlyStopping(
645
739
  patience=self.early_stop_gen,
@@ -649,157 +743,157 @@ class ImputeAutoencoder(BaseNNImputer):
649
743
  debug=self.debug,
650
744
  )
651
745
 
652
- gamma_val = self.gamma
653
- if isinstance(gamma_val, (list, tuple)):
654
- if len(gamma_val) == 0:
655
- raise ValueError("gamma list is empty.")
656
- gamma_val = gamma_val[0]
657
-
658
- gamma_final = float(gamma_val)
659
- gamma_warm, gamma_ramp = 50, 100
660
-
661
- # Optional LR warmup
662
- warmup_epochs = int(getattr(self, "lr_warmup_epochs", 5))
663
- base_lr = float(optimizer.param_groups[0]["lr"])
664
- min_lr = base_lr * 0.1
665
-
666
- max_epochs = int(getattr(scheduler, "T_max", getattr(self, "epochs", 100)))
667
-
668
- for epoch in range(max_epochs):
669
- # focal γ schedule (for stable training)
670
- if epoch < gamma_warm:
671
- model.gamma = 0.0 # type: ignore
672
- elif epoch < gamma_warm + gamma_ramp:
673
- model.gamma = gamma_final * ((epoch - gamma_warm) / gamma_ramp) # type: ignore
746
+ gamma_target, gamma_warm, gamma_ramp = self._anneal_config(
747
+ params, "gamma", default=self.gamma, max_epochs=self.epochs
748
+ )
749
+ gamma_target = float(gamma_target)
750
+
751
+ cw = class_weights
752
+ if cw is not None and cw.device != self.device:
753
+ cw = cw.to(self.device)
754
+
755
+ for epoch in range(int(self.epochs)):
756
+ if gamma_schedule:
757
+ gamma_current = self._update_anneal_schedule(
758
+ gamma_target,
759
+ warm=gamma_warm,
760
+ ramp=gamma_ramp,
761
+ epoch=epoch,
762
+ init_val=0.0,
763
+ )
764
+ gamma_val = float(gamma_current)
674
765
  else:
675
- model.gamma = gamma_final # type: ignore
766
+ gamma_val = gamma_target
676
767
 
677
- # LR warmup
678
- if epoch < warmup_epochs:
679
- scale = float(epoch + 1) / warmup_epochs
680
- for g in optimizer.param_groups:
681
- g["lr"] = min_lr + (base_lr - min_lr) * scale
768
+ ce_criterion = FocalCELoss(
769
+ alpha=cw, gamma=gamma_val, ignore_index=-1, reduction="mean"
770
+ )
682
771
 
683
772
  train_loss = self._train_step(
684
- loader=loader,
773
+ loader=self.train_loader_,
685
774
  optimizer=optimizer,
686
775
  model=model,
776
+ ce_criterion=ce_criterion,
687
777
  l1_penalty=l1_penalty,
688
- class_weights=class_weights,
689
778
  )
690
779
 
691
- # Abort or prune on non-finite epoch loss
692
780
  if not np.isfinite(train_loss):
693
781
  if trial is not None:
694
- raise optuna.exceptions.TrialPruned("Epoch loss non-finite.")
695
- # Soft reset suggestion: reduce LR and continue, or break
696
- self.logger.warning(
697
- "Non-finite epoch loss. Reducing LR by 10 percent and continuing."
698
- )
699
- for g in optimizer.param_groups:
700
- g["lr"] *= 0.9
701
- continue
782
+ msg = f"[{self.model_name}] Trial {trial.number} training loss non-finite."
783
+ self.logger.warning(msg)
784
+ raise optuna.exceptions.TrialPruned(msg)
785
+ msg = f"[{self.model_name}] Training loss is non-finite at epoch {epoch + 1}."
786
+ self.logger.error(msg)
787
+ raise RuntimeError(msg)
788
+
789
+ val_loss = self._val_step(
790
+ loader=self.val_loader_,
791
+ model=model,
792
+ ce_criterion=ce_criterion,
793
+ l1_penalty=l1_penalty,
794
+ )
702
795
 
703
796
  scheduler.step()
704
- if return_history:
705
- history.append(train_loss)
797
+ history["Train"].append(float(train_loss))
798
+ history["Val"].append(float(val_loss))
706
799
 
707
- early_stopping(train_loss, model)
800
+ early_stopping(val_loss, model)
708
801
  if early_stopping.early_stop:
709
- self.logger.info(f"Early stopping at epoch {epoch + 1}.")
802
+ self.logger.debug(
803
+ f"[{self.model_name}] Early stopping at epoch {epoch + 1}."
804
+ )
710
805
  break
711
806
 
712
- # Optuna report/prune on validation metric
713
- if (
714
- trial is not None
715
- and X_val is not None
716
- and ((epoch + 1) % eval_interval == 0)
717
- ):
718
- metric_key = prune_metric or getattr(self, "tune_metric", "f1")
719
- mask_override = None
720
- if (
721
- self.simulate_missing
722
- and getattr(self, "sim_mask_test_", None) is not None
723
- and getattr(self, "X_val_", None) is not None
724
- and X_val.shape == self.X_val_.shape
725
- ):
726
- mask_override = self.sim_mask_test_
727
- metric_val = self._eval_for_pruning(
807
+ if trial is not None:
808
+ metric_vals = self._evaluate_model(
728
809
  model=model,
729
- X_val=X_val,
730
- params=params or getattr(self, "best_params_", {}),
731
- metric=metric_key,
810
+ X=self.X_val_,
811
+ y=self.y_val_,
812
+ eval_mask=self.sim_mask_val_ & ~self.orig_mask_val_,
732
813
  objective_mode=True,
733
- do_latent_infer=False, # AE: False
734
- latent_steps=0,
735
- latent_lr=0.0,
736
- latent_weight_decay=0.0,
737
- latent_seed=self.seed, # type: ignore
738
- _latent_cache=None, # AE: not used
739
- _latent_cache_key=None,
740
- eval_mask_override=mask_override,
741
814
  )
742
- trial.report(metric_val, step=epoch + 1)
743
- if (epoch + 1) >= prune_warmup_epochs and trial.should_prune():
815
+ trial.report(metric_vals[self.tune_metric], step=epoch + 1)
816
+ if trial.should_prune():
744
817
  raise optuna.exceptions.TrialPruned(
745
- f"Pruned at epoch {epoch + 1}: {metric_key}={metric_val:.5f}"
818
+ f"[{self.model_name}] Trial {trial.number} pruned at epoch {epoch + 1}."
746
819
  )
747
820
 
748
- best_loss = early_stopping.best_score
749
- if early_stopping.best_model is not None:
750
- best_model = copy.deepcopy(early_stopping.best_model)
751
- else:
752
- best_model = copy.deepcopy(model)
753
- return best_loss, best_model, history
821
+ best_loss = float(early_stopping.best_score)
822
+ if early_stopping.best_state_dict is not None:
823
+ model.load_state_dict(early_stopping.best_state_dict)
824
+
825
+ return best_loss, model, dict(history)
754
826
 
755
827
  def _train_step(
756
828
  self,
757
829
  loader: torch.utils.data.DataLoader,
758
830
  optimizer: torch.optim.Optimizer,
759
831
  model: torch.nn.Module,
832
+ ce_criterion: torch.nn.Module,
833
+ *,
760
834
  l1_penalty: float,
761
- class_weights: torch.Tensor,
762
835
  ) -> float:
763
- """One epoch with stable focal CE and NaN/Inf guards."""
836
+ """Single epoch train step (masked focal CE + optional L1).
837
+
838
+ Args:
839
+ loader (torch.utils.data.DataLoader): Training data loader.
840
+ optimizer (torch.optim.Optimizer): Optimizer for training.
841
+ model (torch.nn.Module): Autoencoder model.
842
+ ce_criterion (torch.nn.Module): Cross-entropy loss function.
843
+ l1_penalty (float): L1 regularization coefficient.
844
+
845
+ Returns:
846
+ float: Average training loss over the epoch.
847
+
848
+ Notes:
849
+ Expects loader batches as (X_ohe, y_int, mask_bool) where:
850
+ - X_ohe: (B, L, C) float (or float-castable) one-hot inputs
851
+ - y_int: (B, L) int, with -1 for unknown targets
852
+ - mask_bool: (B, L) bool selecting which positions contribute to loss
853
+ """
764
854
  model.train()
765
855
  running = 0.0
766
856
  num_batches = 0
857
+
858
+ nF_model = int(getattr(model, "n_features", self.num_features_))
859
+ nC_model = int(getattr(model, "num_classes", self.num_classes_))
767
860
  l1_params = tuple(p for p in model.parameters() if p.requires_grad)
768
- if class_weights is not None and class_weights.device != self.device:
769
- class_weights = class_weights.to(self.device)
770
-
771
- # Use model.gamma if present, else self.gamma
772
- gamma = float(getattr(model, "gamma", getattr(self, "gamma", 0.0)))
773
- gamma = float(torch.tensor(gamma).clamp(min=0.0, max=10.0)) # sane bound
774
- ce_criterion = SafeFocalCELoss(
775
- gamma=gamma, weight=class_weights, ignore_index=-1
776
- )
777
861
 
778
- for _, y_batch in loader:
862
+ for X_batch, y_batch, m_batch in loader:
779
863
  optimizer.zero_grad(set_to_none=True)
780
- y_batch = y_batch.to(self.device, non_blocking=True)
781
-
782
- # Inputs: one-hot with zeros for missing; Targets: long ints with -1 for missing
783
- if self.is_haploid:
784
- x_in = self._one_hot_encode_012(y_batch) # (B, L, 2)
785
- logits = model(x_in).view(-1, self.num_features_, self.output_classes_)
786
- logits_flat = logits.view(-1, self.output_classes_)
787
- targets_flat = y_batch.view(-1).long()
788
- if not torch.isfinite(logits_flat).all():
789
- continue
790
- loss = ce_criterion(logits_flat, targets_flat)
791
- else:
792
- x_in = self._encode_multilabel_inputs(y_batch) # (B, L, 2)
793
- logits = model(x_in).view(-1, self.num_features_, self.output_classes_)
794
- if not torch.isfinite(logits).all():
795
- continue
796
- pos_w = getattr(self, "pos_weights_", None)
797
- targets = self._multi_hot_targets(y_batch) # float, same shape
798
- bce = F.binary_cross_entropy_with_logits(
799
- logits, targets, pos_weight=pos_w, reduction="none"
864
+ X_batch = X_batch.to(self.device, non_blocking=True).float()
865
+ y_batch = y_batch.to(self.device, non_blocking=True).long()
866
+ m_batch = m_batch.to(self.device, non_blocking=True).bool()
867
+
868
+ if (
869
+ X_batch.dim() != 3
870
+ or X_batch.shape[1] != nF_model
871
+ or X_batch.shape[2] != nC_model
872
+ ):
873
+ msg = (
874
+ f"Train batch X shape mismatch: expected (B,{nF_model},{nC_model}), "
875
+ f"got {tuple(X_batch.shape)}."
800
876
  )
801
- mask = (y_batch != -1).unsqueeze(-1).float()
802
- loss = (bce * mask).sum() / mask.sum().clamp_min(1e-8)
877
+ self.logger.error(msg)
878
+ raise ValueError(msg)
879
+
880
+ logits_flat = model(X_batch)
881
+ expected = (X_batch.shape[0], nF_model * nC_model)
882
+ if logits_flat.dim() != 2 or tuple(logits_flat.shape) != expected:
883
+ try:
884
+ logits_flat = logits_flat.view(*expected)
885
+ except Exception as e:
886
+ msg = f"Model logits expected shape {expected}, got {tuple(logits_flat.shape)}."
887
+ self.logger.error(msg)
888
+ raise ValueError(msg) from e
889
+
890
+ logits = logits_flat.view(-1, nF_model, nC_model)
891
+ logits_masked = logits.view(-1, nC_model)[m_batch.view(-1)]
892
+
893
+ targets_masked = y_batch.view(-1)
894
+ targets_masked = targets_masked[m_batch.view(-1)]
895
+
896
+ loss = ce_criterion(logits_masked, targets_masked)
803
897
 
804
898
  if l1_penalty > 0:
805
899
  l1 = torch.zeros((), device=self.device)
@@ -807,247 +901,234 @@ class ImputeAutoencoder(BaseNNImputer):
807
901
  l1 = l1 + p.abs().sum()
808
902
  loss = loss + l1_penalty * l1
809
903
 
810
- # Final guard
811
904
  if not torch.isfinite(loss):
812
905
  continue
813
906
 
814
907
  loss.backward()
815
-
816
- # Clip to prevent exploding grads
817
908
  torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
818
-
819
- # If grads blew up to non-finite, skip update
820
- if any(
821
- (not torch.isfinite(p.grad).all())
822
- for p in model.parameters()
823
- if p.grad is not None
824
- ):
825
- optimizer.zero_grad(set_to_none=True)
826
- continue
827
-
828
909
  optimizer.step()
829
910
 
830
911
  running += float(loss.detach().item())
831
912
  num_batches += 1
832
913
 
833
- if num_batches == 0:
834
- return float("inf") # signal upstream that epoch had no usable batches
835
- return running / num_batches
914
+ return float("inf") if num_batches == 0 else running / num_batches
915
+
916
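To make the batch layout described in the _train_step docstring above concrete, here is a small, self-contained sketch of building a loader in the (X_ohe, y_int, mask_bool) format. It assumes three genotype classes (REF/HET/ALT) and is illustrative only, not the package's own data pipeline.

    import numpy as np
    import torch
    from torch.utils.data import DataLoader, TensorDataset

    def make_loader(geno_012: np.ndarray, batch_size: int = 32) -> DataLoader:
        """geno_012: (N, L) integer matrix of 0/1/2 genotypes with -1 for missing."""
        y = torch.from_numpy(geno_012).long()               # (N, L) targets; -1 marks unknown sites
        mask = y >= 0                                        # (N, L) bool: positions counted in the loss
        x = torch.zeros(*y.shape, 3, dtype=torch.float32)    # (N, L, 3) one-hot input
        x.scatter_(-1, y.clamp(min=0).unsqueeze(-1), 1.0)    # set the observed class channel to 1
        x[~mask] = 0.0                                        # missing sites get all-zero channels
        return DataLoader(TensorDataset(x, y, mask), batch_size=batch_size, shuffle=True)

    loader = make_loader(np.array([[0, 1, -1], [2, -1, 0]]))  # toy example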
+ def _val_step(
917
+ self,
918
+ loader: torch.utils.data.DataLoader,
919
+ model: torch.nn.Module,
920
+ ce_criterion: torch.nn.Module,
921
+ *,
922
+ l1_penalty: float,
923
+ ) -> float:
924
+ """Validation step (masked focal CE + optional L1).
925
+
926
+ Args:
927
+ loader (torch.utils.data.DataLoader): Validation data loader.
928
+ model (torch.nn.Module): Autoencoder model.
929
+ ce_criterion (torch.nn.Module): Cross-entropy loss function.
930
+ l1_penalty (float): L1 regularization coefficient.
931
+
932
+ Returns:
933
+ float: Average validation loss over the epoch.
934
+ """
935
+ model.eval()
936
+ running = 0.0
937
+ num_batches = 0
938
+
939
+ nF_model = self.num_features_
940
+ nC_model = self.num_classes_
941
+ l1_params = tuple(p for p in model.parameters() if p.requires_grad)
942
+
943
+ with torch.no_grad():
944
+ for X_batch, y_batch, m_batch in loader:
945
+ X_batch = X_batch.to(self.device, non_blocking=True).float()
946
+ y_batch = y_batch.to(self.device, non_blocking=True).long()
947
+ m_batch = m_batch.to(self.device, non_blocking=True).bool()
948
+
949
+ logits_flat = model(X_batch)
950
+ expected = (X_batch.shape[0], nF_model * nC_model)
951
+
952
+ if logits_flat.dim() != 2 or tuple(logits_flat.shape) != expected:
953
+ try:
954
+ logits_flat = logits_flat.view(*expected)
955
+ except Exception as e:
956
+ msg = f"Model logits expected shape {expected}, got {tuple(logits_flat.shape)}."
957
+ self.logger.error(msg)
958
+ raise ValueError(msg) from e
959
+
960
+ logits = logits_flat.view(-1, nF_model, nC_model)
961
+ logits_masked = logits.view(-1, nC_model)[m_batch.view(-1)]
962
+ targets_masked = y_batch.view(-1)[m_batch.view(-1)]
963
+
964
+ if targets_masked.numel() == 0:
965
+ continue
966
+
967
+ loss = ce_criterion(logits_masked, targets_masked)
968
+
969
+ if l1_penalty > 0:
970
+ l1 = torch.zeros((), device=self.device)
971
+ for p in l1_params:
972
+ l1 = l1 + p.abs().sum()
973
+ loss = loss + l1_penalty * l1
974
+
975
+ if not torch.isfinite(loss):
976
+ continue
977
+
978
+ running += float(loss.item())
979
+ num_batches += 1
980
+
981
+ return float("inf") if num_batches == 0 else running / num_batches
836
982
 
837
983
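Both _train_step and _val_step above reduce to the same masked selection: flatten the per-site logits and targets, keep only positions flagged by the batch mask, and hand those to the criterion. A compact illustration, with torch.nn.CrossEntropyLoss standing in for the package's FocalCELoss and toy shapes:

    import torch

    B, L, C = 8, 100, 3                               # batch size, loci, genotype classes (toy sizes)
    logits_flat = torch.randn(B, L * C)               # flat model output, as returned by the network
    logits = logits_flat.view(-1, L, C)               # reshape to (B, L, C)
    targets = torch.randint(0, C, (B, L))             # 0/1/2 genotype targets
    mask = torch.rand(B, L) < 0.2                     # e.g. simulated-missing positions to score

    logits_masked = logits.view(-1, C)[mask.view(-1)]     # (n_masked, C)
    targets_masked = targets.view(-1)[mask.view(-1)]      # (n_masked,)

    criterion = torch.nn.CrossEntropyLoss()               # stand-in for FocalCELoss
    if targets_masked.numel() > 0:
        loss = criterion(logits_masked, targets_masked)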
  def _predict(
838
984
  self,
839
985
  model: torch.nn.Module,
840
986
  X: np.ndarray | torch.Tensor,
987
+ *,
841
988
  return_proba: bool = False,
842
- ) -> Tuple[np.ndarray, np.ndarray] | np.ndarray:
843
- """Predict 0/1/2 labels (and probabilities) from masked inputs.
844
-
845
- This method generates predictions from the trained autoencoder model for the provided input data. It processes the input data, performs a forward pass through the model, and computes the predicted genotype labels (0, 1, or 2) along with their associated probabilities if requested.
989
+ ) -> tuple[np.ndarray, np.ndarray | None]:
990
+ """Predict categorical genotype labels from logits.
846
991
 
847
992
  Args:
848
993
  model (torch.nn.Module): Trained model.
849
- X (np.ndarray | torch.Tensor): 0/1/2 matrix with -1
850
- for missing.
851
- return_proba (bool): If True, return probabilities.
994
+ X (np.ndarray | torch.Tensor): 2D 0/1/2 matrix with -1 for missing, or 3D one-hot (B, L, K).
995
+ return_proba (bool): If True, return probabilities (B, L, K).
852
996
 
853
997
  Returns:
854
- Tuple[np.ndarray, np.ndarray] | np.ndarray: Predicted labels,
855
- and probabilities if requested.
998
+ tuple[np.ndarray, np.ndarray | None]: Predicted labels and, if return_proba is True, class probabilities (otherwise None).
856
999
  """
857
1000
  if model is None:
858
1001
  msg = "Model is not trained. Call fit() before predict()."
859
1002
  self.logger.error(msg)
860
1003
  raise NotFittedError(msg)
861
1004
 
1005
+ nF = self.num_features_
1006
+ nC = self.num_classes_
1007
+
1008
+ if isinstance(X, torch.Tensor):
1009
+ X_tensor = X
1010
+ else:
1011
+ X_tensor = torch.from_numpy(X)
1012
+ X_tensor = X_tensor.float()
1013
+
1014
+ if X_tensor.device != self.device:
1015
+ X_tensor = X_tensor.to(self.device)
1016
+
1017
+ if X_tensor.dim() == 2:
1018
+ # 0/1/2 matrix -> one-hot for model input
1019
+ X_tensor = self._one_hot_encode_012(X_tensor, num_classes=nC)
1020
+ X_tensor = X_tensor.float()
1021
+ if X_tensor.device != self.device:
1022
+ X_tensor = X_tensor.to(self.device)
1023
+
1024
+ elif X_tensor.dim() != 3:
1025
+ msg = f"_predict expects 2D 0/1/2 inputs or 3D one-hot inputs; got shape {tuple(X_tensor.shape)}."
1026
+ self.logger.error(msg)
1027
+ raise ValueError(msg)
1028
+
1029
+ if X_tensor.shape[1] != nF or X_tensor.shape[2] != nC:
1030
+ msg = f"_predict input shape mismatch: expected (B, {nF}, {nC}), got {tuple(X_tensor.shape)}."
1031
+ self.logger.error(msg)
1032
+ raise ValueError(msg)
1033
+
1034
+ X_tensor = X_tensor.reshape(X_tensor.shape[0], nF * nC)
1035
+
862
1036
  model.eval()
863
1037
  with torch.no_grad():
864
- X_tensor = torch.from_numpy(X) if isinstance(X, np.ndarray) else X
865
- X_tensor = X_tensor.to(self.device).long()
866
- if self.is_haploid:
867
- x_ohe = self._one_hot_encode_012(X_tensor)
868
- logits = model(x_ohe).view(-1, self.num_features_, self.output_classes_)
869
- probas = torch.softmax(logits, dim=-1)
870
- labels = torch.argmax(probas, dim=-1)
871
- else:
872
- x_in = self._encode_multilabel_inputs(X_tensor)
873
- logits = model(x_in).view(-1, self.num_features_, self.output_classes_)
874
- probas_2 = torch.sigmoid(logits)
875
- p_ref = probas_2[..., 0]
876
- p_alt = probas_2[..., 1]
877
- p_het = p_ref * p_alt
878
- p_ref_only = p_ref * (1 - p_alt)
879
- p_alt_only = p_alt * (1 - p_ref)
880
- stacked = torch.stack([p_ref_only, p_het, p_alt_only], dim=-1)
881
- stacked = stacked / stacked.sum(dim=-1, keepdim=True).clamp_min(1e-8)
882
- probas = stacked
883
- labels = torch.argmax(stacked, dim=-1)
1038
+ logits_flat = model(X_tensor)
1039
+ logits = logits_flat.view(-1, nF, nC)
1040
+
1041
+ probas = torch.softmax(logits, dim=-1)
1042
+ labels = torch.argmax(probas, dim=-1)
884
1043
 
885
1044
  if return_proba:
886
1045
  return labels.cpu().numpy(), probas.cpu().numpy()
887
-
888
- return labels.cpu().numpy()
889
-
890
- def _encode_multilabel_inputs(self, y: torch.Tensor) -> torch.Tensor:
891
- """Two-channel multi-hot for diploid: REF-only, ALT-only; HET sets both."""
892
- if self.is_haploid:
893
- return self._one_hot_encode_012(y)
894
- y = y.to(self.device)
895
- shape = y.shape + (2,)
896
- out = torch.zeros(shape, device=self.device, dtype=torch.float32)
897
- valid = y != -1
898
- ref_mask = valid & (y != 2)
899
- alt_mask = valid & (y != 0)
900
- out[ref_mask, 0] = 1.0
901
- out[alt_mask, 1] = 1.0
902
- return out
903
-
904
- def _multi_hot_targets(self, y: torch.Tensor) -> torch.Tensor:
905
- """Targets aligned with _encode_multilabel_inputs for diploid training."""
906
- if self.is_haploid:
907
- # One-hot CE path expects integer targets; handled upstream.
908
- raise RuntimeError("_multi_hot_targets called for haploid data.")
909
- y = y.to(self.device)
910
- out = torch.zeros(y.shape + (2,), device=self.device, dtype=torch.float32)
911
- valid = y != -1
912
- ref_mask = valid & (y != 2)
913
- alt_mask = valid & (y != 0)
914
- out[ref_mask, 0] = 1.0
915
- out[alt_mask, 1] = 1.0
916
- return out
917
-
918
- def _compute_pos_weights(self, X: np.ndarray) -> torch.Tensor:
919
- """Balance REF/ALT channels for multilabel BCE."""
920
- ref_pos = np.count_nonzero((X == 0) | (X == 1))
921
- alt_pos = np.count_nonzero((X == 2) | (X == 1))
922
- total_valid = np.count_nonzero(X != -1)
923
- pos_counts = np.array([ref_pos, alt_pos], dtype=np.float32)
924
- neg_counts = np.maximum(total_valid - pos_counts, 1.0)
925
- pos_counts = np.maximum(pos_counts, 1.0)
926
- weights = neg_counts / pos_counts
927
- return torch.tensor(weights, device=self.device, dtype=torch.float32)
1046
+ return labels.cpu().numpy(), None
928
1047
 
929
1048
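As a usage illustration for the labels and probabilities returned above: a _predict-style helper yields a (N, L) label matrix and (N, L, K) probabilities, and only the originally missing genotypes need to be overwritten. The helper below is hypothetical glue code, not part of pg-sui.

    import numpy as np

    def impute_missing(X_012: np.ndarray, labels: np.ndarray, probas: np.ndarray):
        """X_012: (N, L) with -1 for missing; labels: (N, L); probas: (N, L, K)."""
        missing = X_012 == -1
        filled = X_012.copy()
        filled[missing] = labels[missing]          # keep observed calls untouched
        confidence = probas.max(axis=-1)           # per-site max class probability
        return filled, confidence[missing]         # imputed matrix + confidence at the filled sites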
  def _evaluate_model(
930
1049
  self,
931
- X_val: np.ndarray,
932
1050
  model: torch.nn.Module,
933
- params: dict,
934
- objective_mode: bool = False,
935
- latent_vectors_val: Optional[np.ndarray] = None,
1051
+ X: np.ndarray,
1052
+ y: np.ndarray,
1053
+ eval_mask: np.ndarray,
936
1054
  *,
937
- eval_mask_override: np.ndarray | None = None,
1055
+ objective_mode: bool = False,
938
1056
  ) -> Dict[str, float]:
939
1057
  """Evaluate on 0/1/2; then IUPAC decoding and 10-base integer reports.
940
1058
 
941
- This method evaluates the trained autoencoder model on a validation set, computing various classification metrics based on the predicted and true genotypes. It handles both haploid and diploid data appropriately and generates detailed classification reports for both genotype and IUPAC/10-base integer encodings.
942
-
943
1059
  Args:
944
- X_val (np.ndarray): Validation set 0/1/2 matrix with -1
945
- for missing.
946
1060
  model (torch.nn.Module): Trained model.
947
- params (dict): Model parameters.
948
- objective_mode (bool): If True, suppress logging and reports.
949
- latent_vectors_val (Optional[np.ndarray]): Unused for AE.
950
- eval_mask_override (np.ndarray | None): Optional mask to override default evaluation mask.
1061
+ X (np.ndarray): 2D 0/1/2 matrix with -1 for missing.
1062
+ y (np.ndarray): 2D 0/1/2 ground truth matrix with -1 for missing.
1063
+ eval_mask (np.ndarray): 2D boolean mask selecting sites to evaluate.
1064
+ objective_mode (bool): If True, suppress detailed reports and plots.
951
1065
 
952
1066
  Returns:
953
1067
  Dict[str, float]: Dictionary of evaluation metrics.
954
1068
  """
955
- pred_labels, pred_probas = self._predict(
956
- model=model, X=X_val, return_proba=True
957
- )
1069
+ if model is None:
1070
+ msg = "Model passed to _evaluate_model() is not fitted. Call fit() before evaluation."
1071
+ self.logger.error(msg)
1072
+ raise NotFittedError(msg)
958
1073
 
959
- finite_mask = np.all(np.isfinite(pred_probas), axis=-1) # (N, L)
960
-
961
- # FIX 1: Check ROWS (shape[0]) only. X_val might be a feature subset.
962
- if (
963
- hasattr(self, "X_val_")
964
- and getattr(self, "X_val_", None) is not None
965
- and X_val.shape[0] == self.X_val_.shape[0]
966
- ):
967
- GT_ref = getattr(self, "GT_test_full_", self.ground_truth_)
968
- elif (
969
- hasattr(self, "X_train_")
970
- and getattr(self, "X_train_", None) is not None
971
- and X_val.shape[0] == self.X_train_.shape[0]
972
- ):
973
- GT_ref = getattr(self, "GT_train_full_", self.ground_truth_)
974
- else:
975
- GT_ref = self.ground_truth_
976
-
977
- # FIX 2: Handle Feature Mismatch (e.g., tune_fast feature subsetting)
978
- # If the GT source has more columns than X_val, slice it to match.
979
- if GT_ref.shape[1] > X_val.shape[1]:
980
- GT_ref = GT_ref[:, : X_val.shape[1]]
981
-
982
- # Fallback if rows mismatch (unlikely after Fix 1, but safe to keep)
983
- if GT_ref.shape != X_val.shape:
984
- # If completely different, we can't use the ground truth object.
985
- # Fall back to X_val (this implies only observed values are scored)
986
- GT_ref = X_val
987
-
988
- if eval_mask_override is not None:
989
- # FIX 3: Allow override mask to be sliced if it's too wide
990
- if eval_mask_override.shape[0] != X_val.shape[0]:
991
- msg = (
992
- f"eval_mask_override rows {eval_mask_override.shape[0]} "
993
- f"does not match X_val rows {X_val.shape[0]}"
994
- )
995
- self.logger.error(msg)
996
- raise ValueError(msg)
1074
+ pred_labels, pred_probas = self._predict(model=model, X=X, return_proba=True)
997
1075
 
998
- if eval_mask_override.shape[1] > X_val.shape[1]:
999
- eval_mask = eval_mask_override[:, : X_val.shape[1]].astype(bool)
1000
- else:
1001
- eval_mask = eval_mask_override.astype(bool)
1002
- else:
1003
- eval_mask = X_val != -1
1076
+ if pred_probas is None:
1077
+ msg = "Predicted probabilities are None in _evaluate_model()."
1078
+ self.logger.error(msg)
1079
+ raise ValueError(msg)
1004
1080
 
1005
- # Combine masks
1006
- eval_mask = eval_mask & finite_mask & (GT_ref != -1)
1081
+ y_true_flat = y[eval_mask].astype(np.int8, copy=False)
1082
+ y_pred_flat = pred_labels[eval_mask].astype(np.int8, copy=False)
1083
+ y_proba_flat = pred_probas[eval_mask].astype(np.float32, copy=False)
1007
1084
 
1008
- y_true_flat = GT_ref[eval_mask].astype(np.int64, copy=False)
1009
- y_pred_flat = pred_labels[eval_mask].astype(np.int64, copy=False)
1010
- y_proba_flat = pred_probas[eval_mask].astype(np.float64, copy=False)
1085
+ valid = y_true_flat >= 0
1086
+ y_true_flat = y_true_flat[valid]
1087
+ y_pred_flat = y_pred_flat[valid]
1088
+ y_proba_flat = y_proba_flat[valid]
1011
1089
 
1012
1090
  if y_true_flat.size == 0:
1013
- self.tune_metric = "f1" if self.tune_metric is None else self.tune_metric
1014
1091
  return {self.tune_metric: 0.0}
1015
1092
 
1016
- # ensure valid probability simplex after masking (no NaNs/Infs, sums=1)
1093
+ if y_proba_flat.ndim != 2:
1094
+ msg = f"Expected y_proba_flat to be 2D (n_eval, n_classes); got {y_proba_flat.shape}."
1095
+ self.logger.error(msg)
1096
+ raise ValueError(msg)
1097
+
1098
+ K = int(y_proba_flat.shape[1])
1099
+ if self.is_haploid_:
1100
+ if K not in (2, 3):
1101
+ msg = f"Haploid evaluation expects 2 or 3 classes; got {K}."
1102
+ self.logger.error(msg)
1103
+ raise ValueError(msg)
1104
+ else:
1105
+ if K != 3:
1106
+ msg = f"Diploid evaluation expects 3 classes; got {K}."
1107
+ self.logger.error(msg)
1108
+ raise ValueError(msg)
1109
+
1110
+ if self.is_haploid_:
1111
+ y_true_flat = (y_true_flat > 0).astype(np.int8, copy=False)
1112
+ y_pred_flat = (y_pred_flat > 0).astype(np.int8, copy=False)
1113
+
1114
+ if K == 3:
1115
+ proba_2 = np.empty((y_proba_flat.shape[0], 2), dtype=y_proba_flat.dtype)
1116
+ proba_2[:, 0] = y_proba_flat[:, 0]
1117
+ proba_2[:, 1] = y_proba_flat[:, 1] + y_proba_flat[:, 2]
1118
+ y_proba_flat = proba_2
1119
+
1120
+ labels_for_scoring = [0, 1]
1121
+ target_names = ["REF", "ALT"]
1122
+ else:
1123
+ labels_for_scoring = [0, 1, 2]
1124
+ target_names = ["REF", "HET", "ALT"]
1125
+
1017
1126
  y_proba_flat = np.clip(y_proba_flat, 0.0, 1.0)
1018
1127
  row_sums = y_proba_flat.sum(axis=1, keepdims=True)
1019
- row_sums[row_sums == 0] = 1.0
1128
+ row_sums[row_sums == 0.0] = 1.0
1020
1129
  y_proba_flat = y_proba_flat / row_sums
1021
1130
 
1022
- labels_for_scoring = [0, 1] if self.is_haploid else [0, 1, 2]
1023
- target_names = ["REF", "ALT"] if self.is_haploid else ["REF", "HET", "ALT"]
1024
-
1025
- if self.is_haploid:
1026
- y_true_flat = y_true_flat.copy()
1027
- y_pred_flat = y_pred_flat.copy()
1028
- y_true_flat[y_true_flat == 2] = 1
1029
- y_pred_flat[y_pred_flat == 2] = 1
1030
- # collapse probs to 2-class
1031
- proba_2 = np.zeros((len(y_proba_flat), 2), dtype=y_proba_flat.dtype)
1032
- proba_2[:, 0] = y_proba_flat[:, 0]
1033
- proba_2[:, 1] = y_proba_flat[:, 2]
1034
- y_proba_flat = proba_2
1035
-
1036
- y_true_ohe = np.eye(len(labels_for_scoring))[y_true_flat]
1037
-
1038
- tune_metric_tmp: Literal[
1039
- "pr_macro",
1040
- "roc_auc",
1041
- "average_precision",
1042
- "accuracy",
1043
- "f1",
1044
- "precision",
1045
- "recall",
1046
- ]
1047
- if self.tune_metric_ is not None:
1048
- tune_metric_tmp = self.tune_metric_
1049
- else:
1050
- tune_metric_tmp = "f1" # Default if not tuning
1131
+ y_true_ohe = np.eye(len(labels_for_scoring), dtype=np.int8)[y_true_flat]
1051
1132
 
1052
1133
  metrics = self.scorers_.evaluate(
1053
1134
  y_true_flat,
@@ -1055,16 +1136,29 @@ class ImputeAutoencoder(BaseNNImputer):
1055
1136
  y_true_ohe,
1056
1137
  y_proba_flat,
1057
1138
  objective_mode,
1058
- tune_metric_tmp,
1139
+ cast(
1140
+ Literal[
1141
+ "pr_macro",
1142
+ "roc_auc",
1143
+ "accuracy",
1144
+ "f1",
1145
+ "average_precision",
1146
+ "precision",
1147
+ "recall",
1148
+ "mcc",
1149
+ "jaccard",
1150
+ ],
1151
+ self.tune_metric,
1152
+ ),
1059
1153
  )
1060
1154
 
1061
1155
  if not objective_mode:
1062
- pm = PrettyMetrics(
1063
- metrics, precision=3, title=f"{self.model_name} Validation Metrics"
1064
- )
1065
- pm.render() # prints a command-line table
1156
+ if self.verbose or self.debug:
1157
+ pm = PrettyMetrics(
1158
+ metrics, precision=2, title=f"{self.model_name} Validation Metrics"
1159
+ )
1160
+ pm.render()
1066
1161
 
1067
- # Primary report (REF/HET/ALT or REF/ALT)
1068
1162
  self._make_class_reports(
1069
1163
  y_true=y_true_flat,
1070
1164
  y_pred_proba=y_proba_flat,
@@ -1073,18 +1167,15 @@ class ImputeAutoencoder(BaseNNImputer):
1073
1167
  labels=target_names,
1074
1168
  )
1075
1169
 
1076
- # IUPAC decode & 10-base integer reports
1077
- # Now safe because GT_ref has been sliced to match X_val dimensions
1078
- y_true_dec = self.pgenc.decode_012(
1079
- GT_ref.reshape(X_val.shape[0], X_val.shape[1])
1080
- )
1081
- X_pred = X_val.copy()
1082
- X_pred[eval_mask] = y_pred_flat
1170
+ y_true_matrix = np.array(y, copy=True)
1171
+ y_pred_matrix = np.array(pred_labels, copy=True)
1083
1172
 
1084
- # Use X_val.shape[1] (current features) not self.num_features_ (original features)
1085
- y_pred_dec = self.pgenc.decode_012(
1086
- X_pred.reshape(X_val.shape[0], X_val.shape[1])
1087
- )
1173
+ if self.is_haploid_:
1174
+ y_true_matrix = np.where(y_true_matrix > 0, 2, y_true_matrix)
1175
+ y_pred_matrix = np.where(y_pred_matrix > 0, 2, y_pred_matrix)
1176
+
1177
+ y_true_dec = self.decode_012(y_true_matrix)
1178
+ y_pred_dec = self.decode_012(y_pred_matrix)
1088
1179
 
1089
1180
  encodings_dict = {
1090
1181
  "A": 0,
@@ -1123,239 +1214,177 @@ class ImputeAutoencoder(BaseNNImputer):
1123
1214
  return metrics
1124
1215
 
1125
1216
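Before the tuning objective below, a small numpy sketch of the evaluation-mask and haploid-collapse logic used in _evaluate_model above: only sites that were simulated missing but not missing in the original data are scored, and for haploid data the three-class probabilities are folded into REF vs. ALT. The toy arrays are assumptions for illustration.

    import numpy as np

    y_true = np.array([[0, 2, 1], [2, 0, -1]])
    y_pred = np.array([[0, 2, 2], [1, 0, 0]])
    probas = np.full((2, 3, 3), 1 / 3)
    sim_mask = np.array([[True, True, False], [True, False, True]])
    orig_mask = np.array([[False, False, False], [False, False, True]])

    eval_mask = sim_mask & ~orig_mask                  # simulated-missing sites with known truth
    t, p, pr = y_true[eval_mask], y_pred[eval_mask], probas[eval_mask]

    # Haploid collapse: anything non-REF counts as ALT; fold HET+ALT probability mass together.
    t_bin, p_bin = (t > 0).astype(np.int8), (p > 0).astype(np.int8)
    pr_bin = np.stack([pr[:, 0], pr[:, 1] + pr[:, 2]], axis=1)
    pr_bin /= pr_bin.sum(axis=1, keepdims=True)        # keep rows on the probability simplex
    accuracy = float((t_bin == p_bin).mean())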
  def _objective(self, trial: optuna.Trial) -> float:
1126
- """Optuna objective for AE; mirrors NLPCA study driver without latents.
1127
-
1128
- This method defines the objective function for hyperparameter tuning using Optuna. It samples hyperparameters, prepares the training and validation data, builds and trains the autoencoder model, and evaluates its performance on the validation set. The method returns the value of the tuning metric to be maximized.
1217
+ """Optuna objective for AE (mirrors VAE flow, excluding KL-specific parts).
1129
1218
 
1130
1219
  Args:
1131
- trial (optuna.Trial): Optuna trial.
1220
+ trial (optuna.Trial): Optuna trial object.
1132
1221
 
1133
1222
  Returns:
1134
- float: Value of the tuning metric (maximize).
1223
+ float: Value of the tuning metric to optimize.
1135
1224
  """
1136
1225
  try:
1137
- # Sample hyperparameters (existing helper; unchanged signature)
1138
1226
  params = self._sample_hyperparameters(trial)
1139
1227
 
1140
- # Optionally sub-sample for fast tuning (same keys used by NLPCA if you adopt them)
1141
- X_train = getattr(self, "X_train_", self.ground_truth_[self.train_idx_])
1142
- X_val = getattr(self, "X_val_", self.ground_truth_[self.test_idx_])
1143
-
1144
- class_weights = self._normalize_class_weights(
1145
- self._class_weights_from_zygosity(X_train)
1146
- )
1147
- train_loader = self._get_data_loaders(X_train)
1148
-
1149
1228
  model = self.build_model(self.Model, params["model_params"])
1150
1229
  model.apply(self.initialize_weights)
1151
1230
 
1152
- lr: float = float(params["lr"])
1153
- l1_penalty: float = float(params["l1_penalty"])
1231
+ lr = float(params["learning_rate"])
1232
+ l1_penalty = float(params["l1_penalty"])
1233
+
1234
+ class_weights = self._class_weights_from_zygosity(
1235
+ self.y_train_,
1236
+ train_mask=self.sim_mask_train_ & ~self.orig_mask_train_,
1237
+ inverse=params["inverse"],
1238
+ normalize=params["normalize"],
1239
+ max_ratio=self.max_ratio if self.max_ratio is not None else 5.0,
1240
+ power=params["power"],
1241
+ )
1154
1242
 
1155
- # Train + prune on metric
1156
- _, model, __ = self._train_and_validate_model(
1243
+ loss, model, _hist = self._train_and_validate_model(
1157
1244
  model=model,
1158
- loader=train_loader,
1159
1245
  lr=lr,
1160
1246
  l1_penalty=l1_penalty,
1247
+ params=params,
1161
1248
  trial=trial,
1162
- return_history=False,
1163
1249
  class_weights=class_weights,
1164
- X_val=X_val,
1165
- params=params,
1166
- prune_metric=self.tune_metric,
1167
- prune_warmup_epochs=10,
1168
- eval_interval=self.tune_eval_interval,
1169
- eval_requires_latents=False,
1170
- eval_latent_steps=0,
1171
- eval_latent_lr=0.0,
1172
- eval_latent_weight_decay=0.0,
1250
+ gamma_schedule=params["gamma_schedule"],
1173
1251
  )
1174
1252
 
1175
- eval_mask = (
1176
- self.sim_mask_test_
1177
- if (
1178
- self.simulate_missing
1179
- and getattr(self, "sim_mask_test_", None) is not None
1180
- )
1181
- else None
1182
- )
1253
+ if model is None or not np.isfinite(loss):
1254
+ msg = "Model training returned None or non-finite loss in tuning objective."
1255
+ self.logger.error(msg)
1256
+ raise RuntimeError(msg)
1183
1257
 
1184
- if model is not None:
1185
- metrics = self._evaluate_model(
1186
- X_val,
1187
- model,
1188
- params,
1189
- objective_mode=True,
1190
- eval_mask_override=eval_mask,
1191
- )
1192
- self._clear_resources(model, train_loader)
1193
- else:
1194
- raise TypeError("Model training failed; no model was returned.")
1258
+ metrics = self._evaluate_model(
1259
+ model=model,
1260
+ X=self.X_val_,
1261
+ y=self.y_val_,
1262
+ eval_mask=self.sim_mask_val_ & ~self.orig_mask_val_,
1263
+ objective_mode=True,
1264
+ )
1195
1265
 
1196
- return metrics[self.tune_metric]
1266
+ self._clear_resources(model)
1267
+ return float(metrics[self.tune_metric])
1197
1268
 
1198
1269
  except Exception as e:
1199
- # Keep sweeps moving if a trial fails
1200
- raise optuna.exceptions.TrialPruned(f"Trial failed with error: {e}")
1201
-
1202
- def _sample_hyperparameters(self, trial: optuna.Trial) -> Dict[str, Any]:
1203
- """Sample AE hyperparameters and compute hidden sizes for model params.
1270
+ err_type = type(e).__name__
1271
+ self.logger.warning(
1272
+ f"Trial {trial.number} failed due to exception {err_type}: {e}"
1273
+ )
1274
+ self.logger.debug(traceback.format_exc())
1275
+ raise optuna.exceptions.TrialPruned(
1276
+ f"Trial {trial.number} failed due to an exception. {err_type}: {e}. "
1277
+ "Enable debug logging for full traceback."
1278
+ ) from e
1204
1279
 
1205
- This method samples hyperparameters for the autoencoder model using Optuna's trial object. It computes the hidden layer sizes based on the sampled parameters and prepares the model parameters dictionary.
1280
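For context before the sampler below, this is how an objective like _objective above is typically driven. The snippet is standard Optuna usage with a stand-in objective, not a documented pg-sui entry point; the two search ranges simply echo parameters sampled below.

    import optuna

    def objective(trial: optuna.Trial) -> float:
        # Placeholder standing in for ImputeAutoencoder._objective: sample, train, score.
        lr = trial.suggest_float("learning_rate", 3e-6, 1e-3, log=True)
        latent_dim = trial.suggest_int("latent_dim", 2, 32)
        return 1.0 / (1.0 + lr * latent_dim)   # dummy score just to keep the sketch runnable

    pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
    study = optuna.create_study(direction="maximize", pruner=pruner)
    study.optimize(objective, n_trials=20)
    print(study.best_params)                    # e.g. {"learning_rate": ..., "latent_dim": ...}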
+ def _sample_hyperparameters(self, trial: optuna.Trial) -> dict:
1281
+ """Sample AE hyperparameters; hidden sizes mirror VAE helper (excluding KL).
1206
1282
 
1207
1283
  Args:
1208
1284
  trial (optuna.Trial): Optuna trial object.
1209
1285
 
1210
1286
  Returns:
1211
- Dict[str, int | float | str | bool]: Sampled hyperparameters and model_params.
1287
+ dict: Sampled hyperparameters.
1212
1288
  """
1213
1289
  params = {
1214
- "latent_dim": trial.suggest_int("latent_dim", 4, 16, step=2),
1215
- "lr": trial.suggest_float("learning_rate", 3e-4, 1e-3, log=True),
1216
- "dropout_rate": trial.suggest_float("dropout_rate", 0.0, 0.30, step=0.05),
1217
- "num_hidden_layers": trial.suggest_int("num_hidden_layers", 1, 6),
1290
+ "latent_dim": trial.suggest_int("latent_dim", 2, 32),
1291
+ "learning_rate": trial.suggest_float("learning_rate", 3e-6, 1e-3, log=True),
1292
+ "dropout_rate": trial.suggest_float("dropout_rate", 0.0, 0.5, step=0.025),
1293
+ "num_hidden_layers": trial.suggest_int("num_hidden_layers", 1, 20),
1218
1294
  "activation": trial.suggest_categorical(
1219
1295
  "activation", ["relu", "elu", "selu", "leaky_relu"]
1220
1296
  ),
1221
1297
  "l1_penalty": trial.suggest_float("l1_penalty", 1e-6, 1e-3, log=True),
1222
1298
  "layer_scaling_factor": trial.suggest_float(
1223
- "layer_scaling_factor", 2.0, 4.0, step=0.5
1299
+ "layer_scaling_factor", 2.0, 10.0, step=0.025
1224
1300
  ),
1225
1301
  "layer_schedule": trial.suggest_categorical(
1226
1302
  "layer_schedule", ["pyramid", "linear"]
1227
1303
  ),
1304
+ "power": trial.suggest_float("power", 0.1, 2.0, step=0.1),
1305
+ "normalize": trial.suggest_categorical("normalize", [True, False]),
1306
+ "inverse": trial.suggest_categorical("inverse", [True, False]),
1307
+ "gamma": trial.suggest_float("gamma", 0.0, 10.0, step=0.1),
1308
+ "gamma_schedule": trial.suggest_categorical(
1309
+ "gamma_schedule", [True, False]
1310
+ ),
1228
1311
  }
1229
1312
 
1230
- nF: int = self.num_features_
1231
- nC: int = int(getattr(self, "output_classes_", self.num_classes_ or 3))
1313
+ nF = int(self.num_features_)
1314
+ nC = int(self.num_classes_)
1232
1315
  input_dim = nF * nC
1316
+
1233
1317
  hidden_layer_sizes = self._compute_hidden_layer_sizes(
1234
1318
  n_inputs=input_dim,
1235
- n_outputs=input_dim,
1319
+ n_outputs=nC,
1236
1320
  n_samples=len(self.train_idx_),
1237
- n_hidden=params["num_hidden_layers"],
1238
- alpha=params["layer_scaling_factor"],
1239
- schedule=params["layer_schedule"],
1321
+ n_hidden=int(params["num_hidden_layers"]),
1322
+ latent_dim=int(params["latent_dim"]),
1323
+ alpha=float(params["layer_scaling_factor"]),
1324
+ schedule=str(params["layer_schedule"]),
1240
1325
  )
1241
1326
 
1242
- # Keep the latent_dim as the first element,
1243
- # then the interior hidden widths.
1244
- # If there are no interior widths (very small nets),
1245
- # this still leaves [latent_dim].
1246
- hidden_only: list[int] = [hidden_layer_sizes[0]] + hidden_layer_sizes[1:-1]
1247
-
1248
1327
  params["model_params"] = {
1249
- "n_features": int(self.num_features_),
1250
- "num_classes": int(
1251
- getattr(self, "output_classes_", self.num_classes_ or 3)
1252
- ),
1328
+ "n_features": nF,
1329
+ "num_classes": nC,
1253
1330
  "latent_dim": int(params["latent_dim"]),
1254
1331
  "dropout_rate": float(params["dropout_rate"]),
1255
- "hidden_layer_sizes": hidden_only,
1332
+ "hidden_layer_sizes": hidden_layer_sizes,
1256
1333
  "activation": str(params["activation"]),
1257
1334
  }
1258
1335
  return params
1259
1336
 
1260
- def _set_best_params(
1261
- self, best_params: Dict[str, int | float | str | List[int]]
1262
- ) -> Dict[str, int | float | str | List[int]]:
1263
- """Adopt best params (ImputeNLPCA parity) and return model_params.
1264
-
1265
- This method sets the best hyperparameters found during tuning and computes the hidden layer sizes for the autoencoder model. It prepares the final model parameters dictionary to be used for building the model.
1337
+ def _set_best_params(self, params: dict) -> dict:
1338
+ """Update instance fields from tuned params and return model_params dict.
1266
1339
 
1267
1340
  Args:
1268
- best_params (Dict[str, int | float | str | List[int]]): Best hyperparameters from tuning.
1341
+ params (dict): Best hyperparameters from tuning.
1269
1342
 
1270
1343
  Returns:
1271
- Dict[str, int | float | str | List[int]]: Model parameters for building the model.
1344
+ dict: Model parameters for building the final model.
1272
1345
  """
1273
- bp = {}
1274
- for k, v in best_params.items():
1275
- if not isinstance(v, list):
1276
- if k in {"latent_dim", "num_hidden_layers"}:
1277
- bp[k] = int(v)
1278
- elif k in {
1279
- "dropout_rate",
1280
- "learning_rate",
1281
- "l1_penalty",
1282
- "layer_scaling_factor",
1283
- }:
1284
- bp[k] = float(v)
1285
- elif k in {"activation", "layer_schedule"}:
1286
- if k == "layer_schedule":
1287
- if v not in {"pyramid", "constant", "linear"}:
1288
- raise ValueError(f"Invalid layer_schedule: {v}")
1289
- bp[k] = v
1290
- else:
1291
- bp[k] = str(v)
1292
- else:
1293
- bp[k] = v # keep lists as-is
1294
-
1295
- self.latent_dim: int = bp["latent_dim"]
1296
- self.dropout_rate: float = bp["dropout_rate"]
1297
- self.learning_rate: float = bp["learning_rate"]
1298
- self.l1_penalty: float = bp["l1_penalty"]
1299
- self.activation: str = bp["activation"]
1300
- self.layer_scaling_factor: float = bp["layer_scaling_factor"]
1301
- self.layer_schedule: str = bp["layer_schedule"]
1302
-
1303
- nF: int = self.num_features_
1304
- nC: int = int(getattr(self, "output_classes_", self.num_classes_ or 3))
1305
- hidden_layer_sizes = self._compute_hidden_layer_sizes(
1306
- n_inputs=nF * nC,
1307
- n_outputs=nF * nC,
1308
- n_samples=len(self.train_idx_),
1309
- n_hidden=bp["num_hidden_layers"],
1310
- alpha=bp["layer_scaling_factor"],
1311
- schedule=bp["layer_schedule"],
1346
+ self.latent_dim = int(params["latent_dim"])
1347
+ self.dropout_rate = float(params["dropout_rate"])
1348
+ self.learning_rate = float(params["learning_rate"])
1349
+ self.l1_penalty = float(params["l1_penalty"])
1350
+ self.activation = str(params["activation"])
1351
+ self.layer_scaling_factor = float(params["layer_scaling_factor"])
1352
+ self.layer_schedule = str(params["layer_schedule"])
1353
+
1354
+ self.power = float(params["power"])
1355
+ self.normalize = bool(params["normalize"])
1356
+ self.inverse = bool(params["inverse"])
1357
+ self.gamma = float(params["gamma"])
1358
+ self.gamma_schedule = bool(params["gamma_schedule"])
1359
+
1360
+ self.class_weights_ = self._class_weights_from_zygosity(
1361
+ self.y_train_,
1362
+ train_mask=self.sim_mask_train_ & ~self.orig_mask_train_,
1363
+ inverse=self.inverse,
1364
+ normalize=self.normalize,
1365
+ max_ratio=self.max_ratio if self.max_ratio is not None else 5.0,
1366
+ power=self.power,
1312
1367
  )
1313
1368
 
1314
- # Keep the latent_dim as the first element,
1315
- # then the interior hidden widths.
1316
- # If there are no interior widths (very small nets),
1317
- # this still leaves [latent_dim].
1318
- hidden_only = [hidden_layer_sizes[0]] + hidden_layer_sizes[1:-1]
1319
-
1320
- return {
1321
- "n_features": self.num_features_,
1322
- "latent_dim": self.latent_dim,
1323
- "hidden_layer_sizes": hidden_only,
1324
- "dropout_rate": self.dropout_rate,
1325
- "activation": self.activation,
1326
- "num_classes": nC,
1327
- }
1328
-
1329
- def _default_best_params(self) -> Dict[str, int | float | str | list]:
1330
- """Default model params when tuning is disabled.
1331
-
1332
- This method computes the default model parameters for the autoencoder when hyperparameter tuning is not performed. It calculates the hidden layer sizes based on the initial configuration.
1333
-
1334
- Returns:
1335
- Dict[str, int | float | str | list]: Default model parameters.
1336
- """
1337
- nF: int = self.num_features_
1338
- # Use the number of output channels passed to the model (2 for diploid multilabel)
1339
- # instead of the scoring classes (3) to keep layer shapes aligned.
1340
- nC: int = int(getattr(self, "output_classes_", self.num_classes_ or 3))
1341
- ls = self.layer_schedule
1342
-
1343
- if ls not in {"pyramid", "constant", "linear"}:
1344
- raise ValueError(f"Invalid layer_schedule: {ls}")
1369
+ nF = int(self.num_features_)
1370
+ nC = int(self.num_classes_)
1371
+ input_dim = nF * nC
1345
1372
 
1346
1373
  hidden_layer_sizes = self._compute_hidden_layer_sizes(
1347
- n_inputs=nF * nC,
1348
- n_outputs=nF * nC,
1349
- n_samples=len(self.ground_truth_),
1350
- n_hidden=self.num_hidden_layers,
1351
- alpha=self.layer_scaling_factor,
1352
- schedule=ls,
1374
+ n_inputs=input_dim,
1375
+ n_outputs=nC,
1376
+ n_samples=len(self.train_idx_),
1377
+ n_hidden=int(params["num_hidden_layers"]),
1378
+ latent_dim=int(params["latent_dim"]),
1379
+ alpha=float(params["layer_scaling_factor"]),
1380
+ schedule=str(params["layer_schedule"]),
1353
1381
  )
1382
+
1354
1383
  return {
1355
- "n_features": self.num_features_,
1384
+ "n_features": nF,
1385
+ "num_classes": nC,
1356
1386
  "latent_dim": self.latent_dim,
1357
1387
  "hidden_layer_sizes": hidden_layer_sizes,
1358
1388
  "dropout_rate": self.dropout_rate,
1359
1389
  "activation": self.activation,
1360
- "num_classes": nC,
1361
1390
  }
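The _class_weights_from_zygosity calls in _objective and _set_best_params above take inverse, normalize, power, and max_ratio arguments. A rough sketch of what inverse-frequency class weighting with a power exponent and a max-ratio cap could look like follows; the actual helper is not shown in this diff and may differ.

    import numpy as np
    import torch

    def zygosity_class_weights(y_012: np.ndarray, train_mask: np.ndarray, power: float = 1.0,
                               max_ratio: float = 5.0, normalize: bool = True) -> torch.Tensor:
        vals = y_012[train_mask]
        vals = vals[vals >= 0]
        counts = np.bincount(vals, minlength=3).astype(np.float64)   # REF / HET / ALT counts
        counts = np.maximum(counts, 1.0)                             # avoid division by zero
        w = (counts.sum() / counts) ** power                         # tempered inverse frequency
        w = np.minimum(w, w.min() * max_ratio)                       # cap the spread between classes
        if normalize:
            w = w / w.mean()
        return torch.tensor(w, dtype=torch.float32)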