PyPI - pg-sui - Versions diffs - 1.6.16a3__py3-none-any.whl → 1.7.0__py3-none-any.whl - Mend

pg-sui 1.6.16a3-py3-none-any.whl → 1.7.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33)

{pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/METADATA +26 -30
{pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/RECORD +29 -33
pgsui/__init__.py +0 -8
pgsui/_version.py +2 -2
pgsui/cli.py +577 -125
pgsui/data_processing/config.py +1 -2
pgsui/data_processing/containers.py +203 -530
pgsui/data_processing/transformers.py +44 -20
pgsui/impute/deterministic/imputers/mode.py +475 -182
pgsui/impute/deterministic/imputers/ref_allele.py +454 -147
pgsui/impute/supervised/imputers/hist_gradient_boosting.py +4 -3
pgsui/impute/supervised/imputers/random_forest.py +3 -2
pgsui/impute/unsupervised/base.py +1269 -534
pgsui/impute/unsupervised/callbacks.py +28 -33
pgsui/impute/unsupervised/imputers/autoencoder.py +870 -841
pgsui/impute/unsupervised/imputers/vae.py +931 -787
pgsui/impute/unsupervised/loss_functions.py +156 -202
pgsui/impute/unsupervised/models/autoencoder_model.py +7 -49
pgsui/impute/unsupervised/models/vae_model.py +40 -221
pgsui/impute/unsupervised/nn_scorers.py +53 -13
pgsui/utils/classification_viz.py +240 -97
pgsui/utils/misc.py +201 -3
pgsui/utils/plotting.py +73 -58
pgsui/utils/pretty_metrics.py +2 -6
pgsui/utils/scorers.py +39 -0
pgsui/impute/unsupervised/imputers/nlpca.py +0 -1666
pgsui/impute/unsupervised/imputers/ubp.py +0 -1660
pgsui/impute/unsupervised/models/nlpca_model.py +0 -206
pgsui/impute/unsupervised/models/ubp_model.py +0 -200
{pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/WHEEL +0 -0
{pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/entry_points.txt +0 -0
{pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/licenses/LICENSE +0 -0
{pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/top_level.txt +0 -0

pgsui/cli.py CHANGED Viewed

@@ -15,19 +15,21 @@ Notes
 Examples
 --------
-python cli.py --vcf data.vcf.gz --popmap pops.popmap --prefix run1
-python cli.py --vcf data.vcf.gz --popmap pops.popmap --prefix tuned --tune
-python cli.py --vcf data.vcf.gz --popmap pops.popmap --prefix demo \
-    --models ImputeUBP ImputeVAE ImputeMostFrequent --seed deterministic --verbose
-python cli.py --vcf data.vcf.gz --popmap pops.popmap --prefix subset \
-    --include-pops EA GU TT ON --device cpu
+pg-sui --vcf data.vcf.gz --popmap pops.popmap --prefix run1
+pg-sui --vcf data.vcf.gz --popmap pops.popmap --prefix tuned --tune
+pg-sui --vcf data.vcf.gz --popmap pops.popmap --prefix demo \
+    --models ImputeAutoencoder ImputeVAE ImputeMostFrequent --seed deterministic --verbose
+pg-sui --vcf data.vcf.gz --popmap pops.popmap --prefix subset \
+    --include-pops EA GU TT ON --device cpu --sim-prop 0.3 --sim-strategy nonrandom
 """
 from __future__ import annotations
 import argparse
 import ast
+import json
 import logging
+import os
 import sys
 import time
 from functools import wraps
@@ -46,20 +48,24 @@ from typing import (
     cast,
 )
-from snpio import GenePopReader, PhylipReader, SNPioMultiQC, VCFReader, TreeParser
+from snpio import (
+    GenePopReader,
+    NRemover2,
+    PhylipReader,
+    SNPioMultiQC,
+    StructureReader,
+    TreeParser,
+    VCFReader,
+)
 from pgsui import (
     AutoencoderConfig,
     ImputeAutoencoder,
     ImputeMostFrequent,
-    ImputeNLPCA,
     ImputeRefAllele,
-    ImputeUBP,
     ImputeVAE,
     MostFrequentConfig,
-    NLPCAConfig,
     RefAlleleConfig,
-    UBPConfig,
     VAEConfig,
 )
 from pgsui.data_processing.config import (
@@ -71,10 +77,8 @@ from pgsui.data_processing.config import (
 # Canonical model order used everywhere (default and subset ordering)
 MODEL_ORDER: Tuple[str, ...] = (
-    "ImputeUBP",
     "ImputeVAE",
     "ImputeAutoencoder",
-    "ImputeNLPCA",
     "ImputeMostFrequent",
     "ImputeRefAllele",
 )
@@ -93,6 +97,142 @@ R = TypeVar("R")
 # ----------------------------- CLI Utilities ----------------------------- #
+def _print_version() -> None:
+    """Print PG-SUI version and exit."""
+    from pgsui import __version__ as version
+    logging.info(f"Using PG-SUI version: {version}")
+def _model_family(model_name: str) -> str:
+    """Return output family folder name used by PG-SUI."""
+    if model_name in {"ImputeVAE", "ImputeAutoencoder"}:
+        return "Unsupervised"
+    if model_name in {"ImputeMostFrequent", "ImputeRefAllele"}:
+        return "Deterministic"
+    return "Unknown"
+def _flatten_dict(d: dict, parent: str = "") -> dict:
+    """Flatten a nested dict into dot keys."""
+    out: dict = {}
+    for k, v in (d or {}).items():
+        key = f"{parent}.{k}" if parent else str(k)
+        if isinstance(v, dict):
+            out.update(_flatten_dict(v, key))
+        else:
+            out[key] = v
+    return out
+def _force_tuning_off(cfg: Any, model_name: str) -> Any:
+    """Force tuning disabled on a config object (best-effort, but strict for tune-capable models)."""
+    # Prefer direct attribute mutation (avoids apply_dot_overrides edge-cases)
+    try:
+        if hasattr(cfg, "tune") and hasattr(cfg.tune, "enabled"):
+            cfg.tune.enabled = False
+            return cfg
+    except Exception:
+        pass
+    # Fallback to dot override
+    try:
+        return apply_dot_overrides(cfg, {"tune.enabled": False})
+    except Exception as e:
+        # Only strict for models that actually support tuning
+        if model_name in {"ImputeVAE", "ImputeAutoencoder"}:
+            raise RuntimeError(
+                f"Failed to force tuning off for {model_name}: {e}"
+            ) from e
+        return cfg
+def _find_best_params_json(prefix: str, model_name: str) -> Path | None:
+    """Locate best parameter JSON (tuned or final) for a model.
+    Args:
+        prefix (str): Output prefix used during the run.
+        model_name (str): Model name to look for.
+    Returns:
+        Path | None: Path to best_parameters.json / best_tuned_parameters.json if found; else None.
+    """
+    families = ("Unsupervised", "Deterministic")
+    model_dir_candidates = (model_name, model_name.lower())
+    for fam in families:
+        for mdir in model_dir_candidates:
+            base = Path(f"{prefix}_output") / fam
+            candidates = (
+                base / "optimize" / mdir / "parameters" / "best_tuned_parameters.json",
+                base / "parameters" / mdir / "best_parameters.json",
+            )
+            for p in candidates:
+                if p.exists():
+                    return p
+    return None
+def _load_best_params(best_params_path: Path) -> dict:
+    """Load best parameters JSON."""
+    with best_params_path.open("r", encoding="utf-8") as f:
+        data = json.load(f)
+    if not isinstance(data, dict):
+        raise ValueError(
+            f"best_parameters.json must be a JSON object, got {type(data)}"
+        )
+    return data
+def _apply_best_params_to_cfg(cfg: Any, best_params: dict, model_name: str) -> Any:
+    """Apply best params into cfg using dot-path keys or inferred dot-paths.
+    - If JSON is nested, flatten to dot keys.
+    - If key already contains '.', treat as dot-path and apply directly.
+    - If key has no '.', try common sections in order: model., train., sim., tune., io., plot.
+    - Unknown keys are ignored with a warning.
+    """
+    # Flatten if nested (but keep existing dot keys as-is too)
+    flat = {}
+    for k, v in best_params.items():
+        if isinstance(v, dict):
+            flat.update(_flatten_dict(v, str(k)))
+        else:
+            flat[str(k)] = v
+    candidate_prefixes = ("", "model.", "train.", "sim.", "tune.", "io.", "plot.")
+    # Apply one by one so we can try multiple candidate destinations for
+    # non-dot keys
+    for raw_k, v in flat.items():
+        if "." in raw_k:
+            try:
+                cfg = apply_dot_overrides(cfg, {raw_k: v})
+                continue
+            except Exception as e:
+                logging.warning(
+                    f"Could not apply best param '{raw_k}' to {model_name} (dot key). Skipping. Error: {e}"
+                )
+                continue
+        applied = False
+        for pref in candidate_prefixes:
+            k = f"{pref}{raw_k}" if pref else raw_k
+            try:
+                cfg = apply_dot_overrides(cfg, {k: v})
+                applied = True
+                break
+            except Exception:
+                continue
+        if not applied:
+            logging.warning(
+                f"Best param '{raw_k}' not recognized for {model_name}; leaving config unchanged for that key."
+            )
+    return cfg
 def _configure_logging(verbose: bool, log_file: Optional[str] = None) -> None:
     """Configure root logger.
@@ -166,6 +306,96 @@ def _parse_overrides(pairs: list[str]) -> dict:
     return out
+def _parse_allele_encoding(arg: str) -> dict:
+    """Parse STRUCTURE allele encoding dict from JSON or Python literal."""
+    try:
+        payload = json.loads(arg)
+    except Exception:
+        try:
+            payload = ast.literal_eval(arg)
+        except Exception as e:
+            raise argparse.ArgumentTypeError(
+                f"Invalid --structure-allele-encoding; must be a dict. Error: {e}"
+            ) from e
+    if not isinstance(payload, dict):
+        raise argparse.ArgumentTypeError(
+            "--structure-allele-encoding must be a dict-like mapping."
+        )
+    out: dict = {}
+    for k, v in payload.items():
+        key = k
+        if isinstance(k, str):
+            k_strip = k.strip()
+            if k_strip.lstrip("-").isdigit():
+                try:
+                    key = int(k_strip)
+                except Exception:
+                    key = k
+        out[key] = str(v)
+    return out
+def _normalize_input_format(fmt: str) -> str:
+    """Normalize format aliases into canonical reader names."""
+    fmt = fmt.lower()
+    if fmt in {"vcf", "vcf.gz"}:
+        return "vcf"
+    if fmt in {"phy", "phylip"}:
+        return "phylip"
+    if fmt in {"gen", "genepop"}:
+        return "genepop"
+    if fmt in {"str", "structure"}:
+        return "structure"
+    return fmt
+def _normalize_plot_format(fmt: str) -> Literal["pdf", "png", "jpg", "svg"]:
+    """Normalize plot format aliases to reader-supported values."""
+    fmt = fmt.lower()
+    if fmt == "jpeg":
+        return "jpg"
+    return cast(Literal["pdf", "png", "jpg", "svg"], fmt)
+def _expand_path(path: str | None) -> str | None:
+    """Expand ~ and env vars in a path-like string."""
+    if path is None:
+        return None
+    raw = str(path).strip()
+    if not raw:
+        return None
+    expanded = os.path.expandvars(raw)
+    return str(Path(expanded).expanduser())
+def _resolve_tree_paths(
+    args: argparse.Namespace,
+) -> tuple[str | None, str | None, str | None]:
+    """Resolve tree-related paths from CLI args."""
+    treefile = _expand_path(getattr(args, "treefile", None))
+    qmatrix = _expand_path(getattr(args, "qmatrix", None))
+    siterates = _expand_path(getattr(args, "siterates", None))
+    return treefile, qmatrix, siterates
+def _config_needs_tree(cfg: Any | None) -> bool:
+    """Return True if config requires a tree parser for simulated missingness."""
+    if cfg is None:
+        return False
+    sim_cfg = getattr(cfg, "sim", None)
+    if sim_cfg is None:
+        return False
+    strategy = getattr(sim_cfg, "sim_strategy", None)
+    simulate = bool(getattr(sim_cfg, "simulate_missing", False))
+    return (
+        simulate
+        and isinstance(strategy, str)
+        and strategy in {"nonrandom", "nonrandom_weighted"}
+    )
 def _args_to_cli_overrides(args: argparse.Namespace) -> dict:
     """Convert explicitly provided CLI flags into config dot-overrides."""
     overrides: dict = {}
@@ -174,10 +404,12 @@ def _args_to_cli_overrides(args: argparse.Namespace) -> dict:
     if hasattr(args, "prefix") and args.prefix is not None:
         overrides["io.prefix"] = args.prefix
     else:
-        # Note: we don't know input_path here; prefix default is handled later.
-        # This fallback is preserved to avoid changing semantics.
-        if hasattr(args, "vcf"):
-            overrides["io.prefix"] = str(Path(args.vcf).stem)
+        # Prefer --input stem; fallback to legacy --vcf stem
+        input_path = getattr(args, "input", None)
+        if input_path is None and hasattr(args, "vcf"):
+            input_path = getattr(args, "vcf", None)
+        if input_path:
+            overrides["io.prefix"] = str(Path(input_path).stem)
     if hasattr(args, "verbose"):
         overrides["io.verbose"] = bool(args.verbose)
@@ -200,20 +432,34 @@ def _args_to_cli_overrides(args: argparse.Namespace) -> dict:
     # Plot
     if hasattr(args, "plot_format"):
         overrides["plot.fmt"] = args.plot_format
+    if getattr(args, "disable_plotting", False):
+        logging.info(
+            "Disabling plotting for all models as per --disable-plotting flag."
+        )
+        overrides["plot.show"] = False
-    # Simulation overrides (shared across config-driven models)
+    # Simulation overrides
     if hasattr(args, "sim_strategy"):
         overrides["sim.sim_strategy"] = args.sim_strategy
     if hasattr(args, "sim_prop"):
         overrides["sim.sim_prop"] = float(args.sim_prop)
-    if hasattr(args, "simulate_missing"):
-        overrides["sim.simulate_missing"] = bool(args.simulate_missing)
     # Tuning
-    if hasattr(args, "tune"):
-        overrides["tune.enabled"] = bool(args.tune)
-    if hasattr(args, "tune_n_trials"):
-        overrides["tune.n_trials"] = int(args.tune_n_trials)
+    if getattr(args, "load_best_params", False):
+        # Never allow CLI flags to re-enable tuning when loading params
+        if hasattr(args, "tune") and bool(getattr(args, "tune", False)):
+            logging.warning(
+                "--tune was supplied, but --load-best-params is active; ignoring --tune."
+            )
+        if hasattr(args, "tune_n_trials"):
+            logging.warning(
+                "--tune-n-trials was supplied, but --load-best-params is active; ignoring it."
+            )
+    else:
+        if hasattr(args, "tune"):
+            overrides["tune.enabled"] = bool(args.tune)
+        if hasattr(args, "tune_n_trials"):
+            overrides["tune.n_trials"] = int(args.tune_n_trials)
     return overrides
@@ -257,35 +503,77 @@ def log_model_time(fn: Callable[P, R]) -> Callable[P, R]:
 # ------------------------------ Core Runner ------------------------------ #
 def build_genotype_data(
     input_path: str,
-    fmt: Literal["vcf", "vcf.gz", "phy", "phylip", "genepop", "gen"],
+    fmt: Literal[
+        "vcf",
+        "vcf.gz",
+        "phy",
+        "phylip",
+        "genepop",
+        "gen",
+        "structure",
+        "str",
+    ],
     popmap_path: str | None,
     treefile: str | None,
     qmatrix: str | None,
     siterates: str | None,
     force_popmap: bool,
-    verbose: bool,
+    debug: bool,
     include_pops: List[str] | None,
-    plot_format: Literal["pdf", "png", "jpg", "jpeg"],
+    plot_format: Literal["pdf", "png", "jpg", "jpeg", "svg"],
+    structure_has_popids: bool = False,
+    structure_has_marker_names: bool = False,
+    structure_allele_start_col: int | None = None,
+    structure_allele_encoding: dict | None = None,
 ):
-    """Load genotype data from heterogeneous inputs."""
-    logging.info(f"Loading {fmt.upper()} and popmap data...")
+    """Load genotype data from heterogeneous inputs.
+    Args:
+        input_path (str): Path to genotype data file.
+        fmt (Literal): Format of genotype data file.
+        popmap_path (str | None): Optional path to population map file.
+        treefile (str | None): Optional path to phylogenetic tree file.
+        qmatrix (str | None): Optional path to IQ-TREE Q matrix file.
+        siterates (str | None): Optional path to SNP site rates file.
+        force_popmap (bool): Whether to force use of popmap even if samples don't match exactly.
+        debug (bool): Whether to enable debug-level logging in SNPio readers.
+        include_pops (List[str] | None): Optional list of population IDs to include.
+        plot_format (Literal): Figure format for SNPio plots.
+        structure_has_popids (bool): STRUCTURE only; whether pop IDs are present.
+        structure_has_marker_names (bool): STRUCTURE only; whether the first line has marker names.
+        structure_allele_start_col (int | None): STRUCTURE only; zero-based allele start column.
+        structure_allele_encoding (dict | None): STRUCTURE only; allele encoding map.
+    """
+    fmt_norm = _normalize_input_format(fmt)
+    plot_format = _normalize_plot_format(cast(str, plot_format))
+    logging.info(f"Loading {fmt_norm.upper()} and popmap data...")
     kwargs = {
         "filename": input_path,
         "popmapfile": popmap_path,
         "force_popmap": force_popmap,
-        "verbose": verbose,
+        "verbose": debug,
         "include_pops": include_pops if include_pops else None,
         "prefix": f"snpio_{Path(input_path).stem}",
         "plot_format": plot_format,
     }
-    if fmt == "vcf":
+    if fmt_norm == "vcf":
         gd = VCFReader(**kwargs)
-    elif fmt == "phylip":
+    elif fmt_norm == "phylip":
         gd = PhylipReader(**kwargs)
-    elif fmt == "genepop":
+    elif fmt_norm == "genepop":
         gd = GenePopReader(**kwargs)
+    elif fmt_norm == "structure":
+        kwargs.update(
+            {
+                "has_popids": structure_has_popids,
+                "has_marker_names": structure_has_marker_names,
+                "allele_start_col": structure_allele_start_col,
+                "allele_encoding": structure_allele_encoding,
+            }
+        )
+        gd = StructureReader(**kwargs)
     else:
         raise ValueError(f"Unsupported genotype data format: {fmt}")
@@ -321,8 +609,6 @@ def run_model_safely(model_name: str, builder, *, warn_only: bool = True) -> Non
 # -------------------------- Model Registry ------------------------------- #
 # Add config-driven models here by listing the class and its config dataclass.
 MODEL_REGISTRY: Dict[str, Dict[str, Any]] = {
-    "ImputeUBP": {"cls": ImputeUBP, "config_cls": UBPConfig},
-    "ImputeNLPCA": {"cls": ImputeNLPCA, "config_cls": NLPCAConfig},
     "ImputeAutoencoder": {"cls": ImputeAutoencoder, "config_cls": AutoencoderConfig},
     "ImputeVAE": {"cls": ImputeVAE, "config_cls": VAEConfig},
     "ImputeMostFrequent": {"cls": ImputeMostFrequent, "config_cls": MostFrequentConfig},
@@ -372,24 +658,65 @@ def _build_effective_config_for_model(
             f"Loaded YAML config for {model_name} from {yaml_path} (ignored 'preset' in YAML if present)."
         )
-    # 3) Explicit CLI flags overlay YAML.
+    # 3) Optional: load best parameters from a previous run and force tuning OFF.
+    if getattr(args, "load_best_params", False):
+        # Determine which prefix to look under for *_output
+        src_prefix = getattr(args, "best_params_prefix", None)
+        if src_prefix is None:
+            # Use the resolved prefix if provided; otherwise fall back to input
+            # stem behavior
+            src_prefix = getattr(args, "prefix", None)
+            if src_prefix is None and hasattr(args, "vcf"):
+                src_prefix = str(Path(args.vcf).stem)
+            if src_prefix is None:
+                # As a last resort, use current effective io.prefix if it exists in cfg
+                src_prefix = getattr(getattr(cfg, "io", object()), "prefix", None)
+        if getattr(args, "tune", False):
+            logging.warning(
+                "--tune was supplied, but --load-best-params is active; forcing tuning OFF."
+            )
+        # Force tuning disabled in config (even if CLI/YAML enabled it)
+        cfg = _force_tuning_off(cfg, model_name)
+        best_path = _find_best_params_json(str(src_prefix), model_name)
+        if best_path is None:
+            # For tune-capable (unsupervised) models, treat as an error; deterministic models warn only.
+            fam = _model_family(model_name)
+            msg = (
+                "Requested --load-best-params, but could not find a best parameters JSON "
+                f"for {model_name}. Looked under '.../optimize/<model>/parameters/best_tuned_parameters.json' and '{src_prefix}_output/{fam}/parameters/{model_name}/best_parameters.json'"
+            )
+            if model_name in {"ImputeVAE", "ImputeAutoencoder"}:
+                logging.error(msg)
+                raise FileNotFoundError(msg)
+            logging.warning(msg)
+        else:
+            logging.info(f"Loading best parameters for {model_name} from: {best_path}")
+            best_params = _load_best_params(best_path)
+            cfg = _apply_best_params_to_cfg(cfg, best_params, model_name)
+            cfg = _force_tuning_off(cfg, model_name)
+    # 4) Explicit CLI flags overlay YAML/best-params layers.
     cli_overrides = _args_to_cli_overrides(args)
     if cli_overrides:
         cfg = apply_dot_overrides(cfg, cli_overrides)
-    # 4) --set has highest precedence.
+    # Keep tuning disabled if --load-best-params was requested, even if CLI flags tried to re-enable it.
+    if getattr(args, "load_best_params", False):
+        cfg = _force_tuning_off(cfg, model_name)
+    # 5) --set has highest precedence.
     user_overrides = _parse_overrides(getattr(args, "set", []))
     if user_overrides:
         try:
             cfg = apply_dot_overrides(cfg, user_overrides)
         except Exception as e:
-            if model_name in {
-                "ImputeUBP",
-                "ImputeNLPCA",
-                "ImputeAutoencoder",
-                "ImputeVAE",
-            }:
+            if model_name in {"ImputeAutoencoder", "ImputeVAE"}:
                 logging.error(
                     f"Error applying --set overrides to {model_name} config: {e}"
                 )
@@ -397,6 +724,18 @@ def _build_effective_config_for_model(
             else:
                 pass  # non-config-driven models ignore --set
+        # FINAL GUARANTEE:
+        # --load-best-params always wins over
+        # --set, YAML, preset, and CLI flags.
+        if getattr(args, "load_best_params", False):
+            # If user explicitly tried to set tune.* via --set, warn and override.
+            if any(str(k).startswith("tune.") for k in (user_overrides or {}).keys()):
+                logging.warning(
+                    f"{model_name}: '--set tune.*=...' was provided, but --load-best-params forces tuning OFF. "
+                    "Ignoring any tune.* overrides."
+                )
+            cfg = _force_tuning_off(cfg, model_name)
     return cfg
@@ -437,9 +776,19 @@ def _maybe_print_or_dump_configs(
 def main(argv: Optional[List[str]] = None) -> int:
+    """PG-SUI CLI main entry point.
+    The CLI supports running multiple imputation models on a single input file, with configuration handled via presets, YAML files, and CLI flags.
+    Args:
+        argv (Optional[List[str]]): List of CLI args (default: sys.argv[1:]).
+    Returns:
+        int: Exit code (0=success, 2=argparse error, 1=other error).
+    """
     parser = argparse.ArgumentParser(
         prog="pg-sui",
-        description="Run PG-SUI imputation models on an input file. Handle configuration via presets, YAML, and CLI flags. The default is to run all models.",
+        description="Run PG-SUI imputation models on an input file. Handle configuration via presets, YAML, and CLI flags. The default is to run all models. The input file can be in VCF, PHYLIP, or GENEPOP format. Outputs include imputed genotype files and performance summaries.",
         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
         usage="%(prog)s [options]",
     )
@@ -448,7 +797,7 @@ def main(argv: Optional[List[str]] = None) -> int:
     parser.add_argument(
         "--input",
         default=argparse.SUPPRESS,
-        help="Path to input file (VCF/PHYLIP/STRUCTURE/GENEPOP).",
+        help="Path to input file (VCF/PHYLIP/GENEPOP). VCF file can be bgzipped or uncompressed.",
     )
     parser.add_argument(
         "--format",
@@ -464,23 +813,23 @@ def main(argv: Optional[List[str]] = None) -> int:
             "gen",
         ),
         default=argparse.SUPPRESS,
-        help="Input format. If 'infer', deduced from file extension. The default is 'infer'.",
+        help="Input format. If 'infer', deduced from file extension. The default is 'infer'. Supported formats: VCF ('.vcf', '.vcf.gz'), PHYLIP ('.phy', '.phylip'), GENEPOP ('.genepop', '.gen').",
     )
     # Back-compat: --vcf retained; if both provided, --input wins.
     parser.add_argument(
         "--vcf",
         default=argparse.SUPPRESS,
-        help="Path to input VCF file. Can be bgzipped or uncompressed.",
+        help="Path to input VCF file. Can be bgzipped or uncompressed. (Deprecated; use --input instead.)",
     )
     parser.add_argument(
         "--popmap",
         default=argparse.SUPPRESS,
-        help="Path to population map file. This is a two-column tab-delimited file with sample IDs and population IDs.",
+        help="Path to population map file. This is a two-column tab-delimited file with sample IDs and population IDs. If not provided, no population info is used.",
     )
     parser.add_argument(
         "--treefile",
         default=argparse.SUPPRESS,
-        help="Path to phylogenetic tree file. Can be in Newick (recommended) or Nexus format.",
+        help="Path to phylogenetic tree file. Can be in Newick (recommended) or Nexus format. Used with --qmatrix and --siterates.",
     )
     parser.add_argument(
         "--qmatrix",
@@ -490,19 +839,19 @@ def main(argv: Optional[List[str]] = None) -> int:
     parser.add_argument(
         "--siterates",
         default=argparse.SUPPRESS,
-        help="Path to SNP site rates file (has .rate extension). Used with --treefile and --qmatrix.",
+        help="Path to SNP site rates file (has .rate extension and can be produced with IQ-TREE). Used with --treefile and --qmatrix.",
     )
     parser.add_argument(
         "--prefix",
         default=argparse.SUPPRESS,
-        help="Output file prefix.",
+        help="Output file prefix. If not provided, defaults to the input file stem.",
     )
     # ---------------------- Generic Config Inputs -------------------------- #
     parser.add_argument(
         "--config",
         default=argparse.SUPPRESS,
-        help="YAML config for config-driven models (NLPCA/UBP/Autoencoder/VAE).",
+        help="YAML config for config-driven models (Autoencoder, VAE). Overrides preset and defaults.",
     )
     parser.add_argument(
         "--preset",
@@ -514,7 +863,7 @@ def main(argv: Optional[List[str]] = None) -> int:
         "--set",
         action="append",
         default=argparse.SUPPRESS,
-        help="Dot-key overrides, e.g. --set model.latent_dim=4",
+        help="Dot-key overrides, e.g. --set model.latent_dim=4 --set train.epochs=100. Applies to all models.",
     )
     parser.add_argument(
         "--print-config",
@@ -532,7 +881,7 @@ def main(argv: Optional[List[str]] = None) -> int:
         "--tune",
         action="store_true",
         default=argparse.SUPPRESS,
-        help="Enable hyperparameter tuning (if supported).",
+        help="Enable hyperparameter tuning (if supported by model). Uses Optuna to optimize hyperparameters.",
     )
     parser.add_argument(
         "--tune-n-trials",
@@ -562,8 +911,31 @@ def main(argv: Optional[List[str]] = None) -> int:
         "--plot-format",
         choices=("png", "pdf", "svg", "jpg", "jpeg"),
         default=argparse.SUPPRESS,
-        help="Figure format for model plots.",
+        help="Figure format for model plots. Choices: png, pdf, svg, jpg, jpeg.",
+    )
+    parser.add_argument(
+        "--disable-plotting",
+        action="store_true",
+        default=False,
+        help="Disable plotting for all models. Overrides any config settings enabling plotting.",
+    )
+    parser.add_argument(
+        "--load-best-params",
+        action="store_true",
+        default=False,
+        help=(
+            "Load best hyperparameters from a previous run's best_parameters.json (or tuning best_tuned_parameters.json) for each selected model and apply them to the model configs. This forces tuning OFF."
+        ),
     )
+    parser.add_argument(
+        "--best-params-prefix",
+        default=argparse.SUPPRESS,
+        help=(
+            "Prefix of the PREVIOUS run to load best parameters from. If omitted, uses the current --prefix (or input stem)."
+        ),
+    )
     # ------------------------- Simulation Controls ------------------------ #
     parser.add_argument(
         "--sim-strategy",
@@ -577,19 +949,15 @@ def main(argv: Optional[List[str]] = None) -> int:
         default=argparse.SUPPRESS,
         help="Override the proportion of observed entries to mask during simulation (0-1).",
     )
-    parser.add_argument(
-        "--simulate-missing",
-        action="store_false",
-        default=argparse.SUPPRESS,
-        help="Disable missing-data simulation regardless of preset/config (when provided).",
-    )
     # --------------------------- Seed & logging ---------------------------- #
     parser.add_argument(
         "--seed",
         default=argparse.SUPPRESS,
-        help="Random seed: 'random', 'deterministic', or an integer.",
+        help="Random seed: 'random', 'deterministic', or an integer. Default is 'random'.",
     )
+    # ----------------------------- Logging --------------------------------- #
     parser.add_argument("--verbose", action="store_true", help="Info-level logging.")
     parser.add_argument("--debug", action="store_true", help="Debug-level logging.")
     parser.add_argument(
@@ -607,7 +975,33 @@ def main(argv: Optional[List[str]] = None) -> int:
         "--force-popmap",
         action="store_true",
         default=False,
-        help="Require popmap (error if absent).",
+        help="Force use of provided popmap even if samples don't match exactly. This will drop samples not in the popmap and vice versa.",
+    )
+    # -------------------------- STRUCTURE options ------------------------- #
+    parser.add_argument(
+        "--structure-has-popids",
+        action="store_true",
+        default=False,
+        help="STRUCTURE only: second column contains population IDs.",
+    )
+    parser.add_argument(
+        "--structure-has-marker-names",
+        action="store_true",
+        default=False,
+        help="STRUCTURE only: first row contains marker names.",
+    )
+    parser.add_argument(
+        "--structure-allele-start-col",
+        type=int,
+        default=argparse.SUPPRESS,
+        help="STRUCTURE only: zero-based column index where alleles begin.",
+    )
+    parser.add_argument(
+        "--structure-allele-encoding",
+        type=_parse_allele_encoding,
+        default=argparse.SUPPRESS,
+        help="STRUCTURE only: allele encoding mapping as JSON or Python dict.",
     )
     # ---------------------------- Model selection -------------------------- #
@@ -616,54 +1010,66 @@ def main(argv: Optional[List[str]] = None) -> int:
         nargs="+",
         default=argparse.SUPPRESS,
         help=(
-            "Which models to run. Choices: ImputeUBP ImputeVAE ImputeAutoencoder ImputeNLPCA ImputeMostFrequent ImputeRefAllele. Default is all."
+            "Which models to run. Specify each model separated by a space. Choices: ImputeVAE ImputeAutoencoder ImputeMostFrequent ImputeRefAllele (Default is all models)."
         ),
     )
     # -------------------------- MultiQC integration ------------------------ #
     parser.add_argument(
-        "--multiqc",
+        "--disable-multiqc",
         action="store_true",
+        default=False,
         help=(
-            "Build a MultiQC HTML report at the end of the run, combining SNPio and PG-SUI plots (requires SNPio's MultiQC module)."
+            "Disable MultiQC report generation after imputation. By default, a MultiQC report is generated unless this flag is set."
         ),
     )
     parser.add_argument(
         "--multiqc-title",
         default=argparse.SUPPRESS,
-        help="Optional title for the MultiQC report (default: 'PG-SUI MultiQC Report - <prefix>').",
+        help="Optional title for the MultiQC report (default: 'PG-SUI MultiQC Report - <prefix>'). ",
     )
     parser.add_argument(
         "--multiqc-output-dir",
         default=argparse.SUPPRESS,
-        help="Optional output directory for the MultiQC report (default: '<prefix>_output/multiqc').",
+        help="Optional output directory for the MultiQC report (default: '<prefix>_output/multiqc'). This directory will be created if it does not exist.",
     )
     parser.add_argument(
         "--multiqc-overwrite",
         action="store_true",
         default=False,
-        help="Overwrite an existing MultiQC report if present.",
+        help="Overwrite an existing MultiQC report if present. If not set and a report exists, an integer suffix will be added to avoid overwriting. NOTE: if running multiple times with this flag, it may append multiple suffixes to avoid overwriting previous reports.",
     )
     # ------------------------------ Safety/UX ------------------------------ #
     parser.add_argument(
         "--dry-run",
         action="store_true",
-        help="Parse args and load data, but skip model training.",
+        help="Parse args and load data, but skip model training. Useful for testing I/O and configs.",
+    )
+    parser.add_argument(
+        "--version", action="store_true", help="Print PG-SUI version and exit."
     )
     args = parser.parse_args(argv)
+    if getattr(args, "version", False):
+        _print_version()
+        return 0
     # Logging (verbose default is False unless passed)
     _configure_logging(
         verbose=getattr(args, "verbose", False),
         log_file=getattr(args, "log_file", None),
     )
+    logging.info("Starting PG-SUI imputation...")
+    _print_version()
     # Models selection (default to all if not explicitly provided)
     try:
         selected_models = _parse_models(getattr(args, "models", ()))
     except argparse.ArgumentTypeError as e:
+        logging.error(str(e))
         parser.error(str(e))
         return 2
@@ -675,12 +1081,11 @@ def main(argv: Optional[List[str]] = None) -> int:
             setattr(args, "format", "vcf")
     if input_path is None:
+        logging.error("You must provide --input (or legacy --vcf).")
         parser.error("You must provide --input (or legacy --vcf).")
         return 2
-    fmt: Literal["infer", "vcf", "vcf.gz", "phy", "phylip", "genepop", "gen"] = getattr(
-        args, "format", "infer"
-    )
+    fmt = getattr(args, "format", "infer")
     if fmt == "infer":
         if input_path.endswith((".vcf", ".vcf.gz")):
@@ -689,28 +1094,59 @@ def main(argv: Optional[List[str]] = None) -> int:
             fmt_final = "phylip"
         elif input_path.endswith((".genepop", ".gen")):
             fmt_final = "genepop"
+        elif input_path.endswith((".str", ".stru", ".structure")):
+            fmt_final = "structure"
         else:
+            logging.error(
+                "Could not infer input format from file extension. Please provide --format."
+            )
             parser.error(
                 "Could not infer input format from file extension. Please provide --format."
             )
             return 2
     else:
-        fmt_final = fmt
+        fmt_final = cast(
+            Literal[
+                "vcf",
+                "vcf.gz",
+                "phy",
+                "phylip",
+                "genepop",
+                "gen",
+                "structure",
+                "str",
+            ],
+            fmt,
+        )
+    fmt_final = _normalize_input_format(fmt_final)
     popmap_path = getattr(args, "popmap", None)
     include_pops = getattr(args, "include_pops", None)
-    verbose_flag = getattr(args, "verbose", False)
     force_popmap = bool(getattr(args, "force_popmap", False))
+    structure_has_popids = bool(getattr(args, "structure_has_popids", False))
+    structure_has_marker_names = bool(
+        getattr(args, "structure_has_marker_names", False)
+    )
+    structure_allele_start_col = getattr(args, "structure_allele_start_col", None)
+    structure_allele_encoding = getattr(args, "structure_allele_encoding", None)
     # Canonical prefix for this run (used for outputs and MultiQC)
     prefix: str = getattr(args, "prefix", str(Path(input_path).stem))
+    # Ensure downstream config building sees the resolved prefix even if
+    # --prefix was not provided.
+    setattr(args, "prefix", prefix)
-    treefile = getattr(args, "treefile", None)
-    qmatrix = getattr(args, "qmatrix", None)
-    siterates = getattr(args, "siterates", None)
+    treefile, qmatrix, siterates = _resolve_tree_paths(args)
+    setattr(args, "treefile", treefile)
+    setattr(args, "qmatrix", qmatrix)
+    setattr(args, "siterates", siterates)
     if any(x is not None for x in (treefile, qmatrix, siterates)):
         if not all(x is not None for x in (treefile, qmatrix, siterates)):
+            logging.error(
+                "--treefile, --qmatrix, and --siterates must all be provided together or they should all be omitted."
+            )
             parser.error(
                 "--treefile, --qmatrix, and --siterates must all be provided together or they should all be omitted."
             )
@@ -719,15 +1155,31 @@ def main(argv: Optional[List[str]] = None) -> int:
     # Load genotype data
     gd, tp = build_genotype_data(
         input_path=input_path,
-        fmt=fmt_final,
+        fmt=cast(
+            Literal[
+                "vcf",
+                "vcf.gz",
+                "phy",
+                "phylip",
+                "genepop",
+                "gen",
+                "structure",
+                "str",
+            ],
+            fmt_final,
+        ),
         popmap_path=popmap_path,
         treefile=treefile,
         qmatrix=qmatrix,
         siterates=siterates,
         force_popmap=force_popmap,
-        verbose=verbose_flag,
         include_pops=include_pops,
+        debug=getattr(args, "debug", False),
         plot_format=getattr(args, "plot_format", "pdf"),
+        structure_has_popids=structure_has_popids,
+        structure_has_marker_names=structure_has_marker_names,
+        structure_allele_start_col=structure_allele_start_col,
+        structure_allele_encoding=structure_allele_encoding,
     )
     if getattr(args, "dry_run", False):
@@ -739,47 +1191,33 @@ def main(argv: Optional[List[str]] = None) -> int:
         m: _build_effective_config_for_model(m, args) for m in selected_models
     }
+    needs_tree = any(
+        _config_needs_tree(cfg) for cfg in cfgs_by_model.values() if cfg is not None
+    )
+    if needs_tree and not all(x is not None for x in (treefile, qmatrix, siterates)):
+        logging.error(
+            "Nonrandom simulated missingness requires --treefile, --qmatrix, and --siterates."
+        )
+        parser.error(
+            "Nonrandom simulated missingness requires --treefile, --qmatrix, and --siterates."
+        )
+        return 2
+    if needs_tree and tp is None:
+        logging.error(
+            "Tree parser was not initialized for nonrandom simulation. "
+            "Please verify --treefile, --qmatrix, and --siterates."
+        )
+        parser.error(
+            "Tree parser was not initialized for nonrandom simulation. "
+            "Please verify --treefile, --qmatrix, and --siterates."
+        )
+        return 2
     # Maybe print/dump configs and exit
     if _maybe_print_or_dump_configs(cfgs_by_model, args):
         return 0
     # ------------------------- Model Builders ------------------------------ #
-    def build_impute_ubp():
-        cfg = cfgs_by_model.get("ImputeUBP")
-        if cfg is None:
-            cfg = (
-                UBPConfig.from_preset(args.preset)
-                if hasattr(args, "preset")
-                else UBPConfig()
-            )
-        return ImputeUBP(
-            genotype_data=gd,
-            tree_parser=tp,
-            config=cfg,
-            simulate_missing=cfg.sim.simulate_missing,
-            sim_strategy=cfg.sim.sim_strategy,
-            sim_prop=cfg.sim.sim_prop,
-            sim_kwargs=cfg.sim.sim_kwargs,
-        )
-    def build_impute_nlpca():
-        cfg = cfgs_by_model.get("ImputeNLPCA")
-        if cfg is None:
-            cfg = (
-                NLPCAConfig.from_preset(args.preset)
-                if hasattr(args, "preset")
-                else NLPCAConfig()
-            )
-        return ImputeNLPCA(
-            genotype_data=gd,
-            tree_parser=tp,
-            config=cfg,
-            simulate_missing=cfg.sim.simulate_missing,
-            sim_strategy=cfg.sim.sim_strategy,
-            sim_prop=cfg.sim.sim_prop,
-            sim_kwargs=cfg.sim.sim_kwargs,
-        )
     def build_impute_vae():
         cfg = cfgs_by_model.get("ImputeVAE")
         if cfg is None:
@@ -792,7 +1230,6 @@ def main(argv: Optional[List[str]] = None) -> int:
             genotype_data=gd,
             tree_parser=tp,
             config=cfg,
-            simulate_missing=cfg.sim.simulate_missing,
             sim_strategy=cfg.sim.sim_strategy,
             sim_prop=cfg.sim.sim_prop,
             sim_kwargs=cfg.sim.sim_kwargs,
@@ -810,7 +1247,6 @@ def main(argv: Optional[List[str]] = None) -> int:
             genotype_data=gd,
             tree_parser=tp,
             config=cfg,
-            simulate_missing=cfg.sim.simulate_missing,
             sim_strategy=cfg.sim.sim_strategy,
             sim_prop=cfg.sim.sim_prop,
             sim_kwargs=cfg.sim.sim_kwargs,
@@ -828,7 +1264,7 @@ def main(argv: Optional[List[str]] = None) -> int:
             gd,
             tree_parser=tp,
             config=cfg,
-            simulate_missing=cfg.sim.simulate_missing,
+            simulate_missing=True,
             sim_strategy=cfg.sim.sim_strategy,
             sim_prop=cfg.sim.sim_prop,
             sim_kwargs=cfg.sim.sim_kwargs,
@@ -846,34 +1282,37 @@ def main(argv: Optional[List[str]] = None) -> int:
             gd,
             tree_parser=tp,
             config=cfg,
-            simulate_missing=cfg.sim.simulate_missing,
+            simulate_missing=True,
             sim_strategy=cfg.sim.sim_strategy,
             sim_prop=cfg.sim.sim_prop,
             sim_kwargs=cfg.sim.sim_kwargs,
         )
     model_builders = {
-        "ImputeUBP": build_impute_ubp,
         "ImputeVAE": build_impute_vae,
         "ImputeAutoencoder": build_impute_autoencoder,
-        "ImputeNLPCA": build_impute_nlpca,
         "ImputeMostFrequent": build_impute_mostfreq,
         "ImputeRefAllele": build_impute_refallele,
     }
     logging.info(f"Selected models: {', '.join(selected_models)}")
     for name in selected_models:
+        logging.info("")
+        logging.info("=" * 60)
+        logging.info("")
+        logging.info(f"Processing model: {name} ...")
         X_imputed = run_model_safely(name, model_builders[name], warn_only=False)
         gd_imp = gd.copy()
         gd_imp.snp_data = X_imputed
-        if name in {"ImputeUBP", "ImputeVAE", "ImputeAutoencoder", "ImputeNLPCA"}:
+        if name in {"ImputeVAE", "ImputeAutoencoder"}:
             family = "Unsupervised"
         elif name in {"ImputeMostFrequent", "ImputeRefAllele"}:
             family = "Deterministic"
         elif name in {"ImputeHistGradientBoosting", "ImputeRandomForest"}:
             family = "Supervised"
         else:
+            logging.error(f"Unknown model family for {name}")
             raise ValueError(f"Unknown model family for {name}")
         pth = Path(f"{prefix}_output/{family}/imputed/{name}")
@@ -892,7 +1331,19 @@ def main(argv: Optional[List[str]] = None) -> int:
                 f"Output format {fmt_final} not supported for imputed data export."
             )
-    logging.info("All requested models processed.")
+        logging.info("")
+        logging.info(f"Successfully finished imputation for model: {name}!")
+        logging.info("")
+        logging.info("=" * 60)
+    logging.info(f"All requested models processed for input: {input_path}")
+    disable_mqc = bool(getattr(args, "disable_multiqc", False))
+    if disable_mqc:
+        logging.info("MultiQC report generation disabled via --disable-multiqc.")
+        logging.info("PG-SUI imputation run complete!")
+        return 0
     # -------------------------- MultiQC builder ---------------------------- #
@@ -912,9 +1363,10 @@ def main(argv: Optional[List[str]] = None) -> int:
             overwrite=overwrite,
         )
         logging.info("MultiQC report successfully built.")
-    except Exception as exc2:  # pragma: no cover
+    except Exception as exc2:
         logging.error(f"Failed to build MultiQC report: {exc2}", exc_info=True)
+    logging.info("PG-SUI imputation run complete!")
     return 0

pg-sui 1.6.16a3__py3-none-any.whl → 1.7.0__py3-none-any.whl

pg-sui 1.6.16a3__py3-none-any.whl → 1.7.0__py3-none-any.whl