PyPI - pg-sui - Version diff - pg_sui-1.6.16a3-py3-none-any.whl → pg_sui-1.7.0-py3-none-any.whl - Mend

pg-sui: pg_sui-1.6.16a3-py3-none-any.whl → pg_sui-1.7.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33)

{pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/METADATA +26 -30
{pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/RECORD +29 -33
pgsui/__init__.py +0 -8
pgsui/_version.py +2 -2
pgsui/cli.py +577 -125
pgsui/data_processing/config.py +1 -2
pgsui/data_processing/containers.py +203 -530
pgsui/data_processing/transformers.py +44 -20
pgsui/impute/deterministic/imputers/mode.py +475 -182
pgsui/impute/deterministic/imputers/ref_allele.py +454 -147
pgsui/impute/supervised/imputers/hist_gradient_boosting.py +4 -3
pgsui/impute/supervised/imputers/random_forest.py +3 -2
pgsui/impute/unsupervised/base.py +1269 -534
pgsui/impute/unsupervised/callbacks.py +28 -33
pgsui/impute/unsupervised/imputers/autoencoder.py +870 -841
pgsui/impute/unsupervised/imputers/vae.py +931 -787
pgsui/impute/unsupervised/loss_functions.py +156 -202
pgsui/impute/unsupervised/models/autoencoder_model.py +7 -49
pgsui/impute/unsupervised/models/vae_model.py +40 -221
pgsui/impute/unsupervised/nn_scorers.py +53 -13
pgsui/utils/classification_viz.py +240 -97
pgsui/utils/misc.py +201 -3
pgsui/utils/plotting.py +73 -58
pgsui/utils/pretty_metrics.py +2 -6
pgsui/utils/scorers.py +39 -0
pgsui/impute/unsupervised/imputers/nlpca.py +0 -1666
pgsui/impute/unsupervised/imputers/ubp.py +0 -1660
pgsui/impute/unsupervised/models/nlpca_model.py +0 -206
pgsui/impute/unsupervised/models/ubp_model.py +0 -200
{pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/WHEEL +0 -0
{pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/entry_points.txt +0 -0
{pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/licenses/LICENSE +0 -0
{pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/top_level.txt +0 -0

pgsui/utils/misc.py CHANGED

@@ -66,8 +66,206 @@ def validate_input_type(
         if isinstance(X, torch.Tensor):
             return X
         elif isinstance(X, np.ndarray):
-            return torch.tensor(X, dtype=torch.float32)
+            return torch.tensor(X, dtype=torch.long)
         elif isinstance(X, pd.DataFrame):
-            return torch.tensor(X.to_numpy(), dtype=torch.float32)
+            return torch.tensor(X.to_numpy(), dtype=torch.long)
         elif isinstance(X, list):
-            return torch.tensor(X, dtype=torch.float32)
+            return torch.tensor(X, dtype=torch.long)
+def detect_computing_device(
+    *, force_cpu: bool = False, verbose: bool = False
+) -> torch.device:
+    """Detects and returns the best available PyTorch compute device.
+    Prioritizes CUDA (NVIDIA) > MPS (Apple Silicon) > CPU.
+    Args:
+        force_cpu (bool): If True, forces the device to CPU regardless of available hardware. Defaults to False.
+        verbose (bool): If True, prints the selected device to stdout. Defaults to False.
+    Returns:
+        torch.device: The selected computing device.
+    """
+    if force_cpu:
+        device = torch.device("cpu")  # Forced to CPU
+    elif torch.cuda.is_available():
+        device = torch.device("cuda")
+    elif torch.backends.mps.is_available():
+        device = torch.device("mps")
+    else:
+        device = torch.device("cpu")  # Fallback to CPU
+    if verbose:
+        print(f"Selected compute device: {device}")
+    return device
+def get_missing_mask(
+    X: pd.DataFrame | pd.Series | np.ndarray | list | torch.Tensor,
+) -> pd.DataFrame | pd.Series | np.ndarray | torch.Tensor:
+    """Returns a boolean mask indicating missing values (NaN, None).
+    Notes:
+    Lists are converted to numpy arrays to compute the mask.
+    Args:
+        X: Input data.
+    Returns:
+        pd.DataFrame | pd.Series | np.ndarray | torch.Tensor: Boolean mask of the same shape as X (returned as DF, Array, or Tensor).
+    Raises:
+        TypeError: If input type is not supported.
+    """
+    if isinstance(X, pd.DataFrame):
+        return X.isna()
+    elif isinstance(X, pd.Series):
+        return pd.isna(X)
+    elif isinstance(X, np.ndarray):
+        # np.isnan fails on object arrays (e.g. strings)
+        # so we check generically first
+        if X.dtype.kind in {"U", "S", "O"}:  # String/Object
+            return pd.isnull(X)
+        return np.isnan(X)
+    elif isinstance(X, torch.Tensor):
+        return torch.isnan(X)
+    elif isinstance(X, list):
+        arr = np.array(X)
+        # Handle mixed types in lists
+        if arr.dtype.kind in {"U", "S", "O"}:
+            return pd.isnull(arr)
+        return np.isnan(arr)
+    else:
+        raise TypeError(
+            f"Unsupported type for missing value detection. Expected pandas.DataFrame, pandas.Series, numpy.ndarray, list, or torch.Tensor but got {type(X)}"
+        )
+def ensure_2d(
+    X: pd.DataFrame | pd.Series | np.ndarray | list | torch.Tensor,
+) -> pd.DataFrame | np.ndarray | list | torch.Tensor:
+    """Ensures the input is at least 2-dimensional.
+    If input is 1D (e.g., shape (N,)), it is reshaped to (N, 1). Already 2D+ inputs are returned unchanged.
+    Args:
+        X (pd.DataFrame | pd.Series | np.ndarray | list | torch.Tensor): Input data.
+    Returns:
+        pd.DataFrame | np.ndarray | list | torch.Tensor: Input data transformed to be at least 2D.
+    Raises:
+        TypeError: If input type is not supported.
+    """
+    if isinstance(X, pd.DataFrame):
+        return X  # DataFrames are always 2D
+    elif isinstance(X, pd.Series):
+        return X.to_frame()  # Convert Series to DataFrame (2D)
+    elif isinstance(X, np.ndarray):
+        if X.ndim == 1:
+            return X.reshape(-1, 1)
+        return X
+    elif isinstance(X, torch.Tensor):
+        if X.dim() == 1:
+            return X.unsqueeze(1)
+        return X
+    elif isinstance(X, list):
+        # Check depth of list
+        if not X:
+            return X
+        if not isinstance(X[0], list):
+            return [[x] for x in X]
+        return X
+    else:
+        msg = f"X must be of type pandas.DataFrame, pd.Series, numpy.ndarray, list, or torch.Tensor, but got {type(X)}"
+        raise TypeError(msg)
+def flatten_1d(
+    y: pd.DataFrame | pd.Series | np.ndarray | list | torch.Tensor,
+) -> pd.Series | np.ndarray | list | torch.Tensor:
+    """
+    Flattens input to a 1D structure.
+    Args:
+        y (pd.DataFrame | pd.Series | np.ndarray | list | torch.Tensor): Input data.
+    Returns:
+        pd.Series | np.ndarray | list | torch.Tensor: 1D representation of the input.
+    Notes:
+        Inputs with multiple columns (e.g., DataFrame with >1 column) are flattened into a single 1D structure.
+    Raises:
+        TypeError: If input type is not supported.
+    """
+    if isinstance(y, pd.DataFrame):
+        if y.shape[1] == 1:
+            return y.iloc[:, 0]
+        else:
+            return pd.Series(y.to_numpy().flatten())
+    elif isinstance(y, np.ndarray):
+        return y.flatten()
+    elif isinstance(y, torch.Tensor):
+        return y.view(-1)
+    elif isinstance(y, list):
+        # Recursively flatten list if needed, or simple comprehension if just 2D
+        if not y:
+            return y
+        if isinstance(y[0], list):
+            return [item for sublist in y for item in sublist]
+        return y
+    else:
+        msg = f"Input must be of type pandas.DataFrame, pandas.Series, numpy.ndarray, list, or torch.Tensor, but got {type(y)}"
+        raise TypeError(msg)
+def safe_shape(
+    X: pd.DataFrame | pd.Series | np.ndarray | list | torch.Tensor,
+) -> tuple[int, ...]:
+    """Returns the shape of the input container as a tuple.
+    Args:
+        X (pd.DataFrame | pd.Series | np.ndarray | list | torch.Tensor): Input data.
+    Returns:
+        tuple[int, ...]: Dimensions of the data (rows, cols, etc.).
+    """
+    if isinstance(X, (pd.DataFrame, np.ndarray)):
+        return X.shape
+    elif isinstance(X, pd.Series):
+        return (X.shape[0],)
+    elif isinstance(X, torch.Tensor):
+        return tuple(X.shape)
+    elif isinstance(X, list):
+        if not X:
+            return (0,)
+        rows = len(X)
+        # Check if 2D list
+        if isinstance(X[0], list):
+            return (rows, len(X[0]))
+        return (rows,)
+    else:
+        msg = f"X must be of type pandas.DataFrame, pd.Series, numpy.ndarray, list, or torch.Tensor, but got {type(X)}"
+        raise TypeError(msg)

pgsui/utils/plotting.py CHANGED

@@ -1,7 +1,7 @@
 import logging
 import warnings
 from pathlib import Path
-from typing import Dict, List, Literal, Optional, Sequence, cast
+from typing import Dict, List, Literal, Optional, Sequence, Mapping, cast
 import matplotlib as mpl
@@ -294,6 +294,10 @@ class Plotting:
             ValueError: If model_name is not recognized (legacy guard).
         """
         num_classes = y_pred_proba.shape[1]
+        if num_classes < 2:
+            msg = "plot_metrics: num_classes must be >= 2 for ROC/PR curves."
+            self.logger.error(msg)
+            raise ValueError(msg)
         # Validate/normalize label names
         if label_names is not None and len(label_names) != num_classes:
@@ -391,7 +395,7 @@ class Plotting:
             ncol=2,
         )
-        # PR
+        # Precision-recall
         axes[1].plot(
             recall["micro"],
             precision["micro"],
@@ -433,7 +437,9 @@ class Plotting:
         )
         fig.savefig(self.output_dir / out_name, bbox_inches="tight")
         if self.show_plots:
-            plt.show()
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore", UserWarning)
+                plt.show()
         plt.close(fig)
         # ---- MultiQC: metrics table + per-class AUC/AP heatmap ------------
@@ -465,73 +471,70 @@ class Plotting:
             except Exception as exc:  # pragma: no cover - defensive
                 self.logger.warning(f"Failed to queue MultiQC ROC/PR curves: {exc}")
-    def plot_history(
-        self,
-        history: Dict[str, List[float] | Dict[str, List[float]] | None] | None,
-    ) -> None:
+    def _series_from_history(self, vals: list[float]) -> pd.Series:
+        """Convert to float series and coerce non-finite to NaN."""
+        s = pd.Series(vals, dtype="float64")
+        s[~np.isfinite(s.to_numpy())] = np.nan
+        return s
+    def _interp_sparse(self, s: pd.Series) -> pd.Series:
+        """Interpolate internal gaps; keep leading/trailing NaNs."""
+        # Only interpolate if we have enough points to make it meaningful
+        if s.notna().sum() < 2:
+            return s
+        return s.interpolate(method="linear", limit_area="inside")
+    def plot_history(self, history: dict[str, list[float]]) -> None:
         """Plot model history traces. Will be saved to file.
         This method plots the deep learning model history traces. The plot is saved to disk as a ``<plot_format>`` file.
         Args:
-            history (Dict[str, List[float]]): Dictionary with lists of history objects. Keys should be "Train" and "Validation".
+            history (dict[str, list[float]]): Dictionary with lists of history objects. Keys should be "Train" and "Validation".
         Raises:
-            ValueError: nn_method must be either 'ImputeNLPCA', 'ImputeUBP', 'ImputeAutoencoder', 'ImputeVAE'.
+            ValueError: self.model_name must be either 'ImputeAutoencoder' or 'ImputeVAE'.
         """
-        if self.model_name not in {
-            "ImputeNLPCA",
-            "ImputeVAE",
-            "ImputeAutoencoder",
-            "ImputeUBP",
-        }:
-            msg = "nn_method must be either 'ImputeNLPCA', 'ImputeVAE', 'ImputeAutoencoder', 'ImputeUBP'."
+        if self.model_name not in {"ImputeVAE", "ImputeAutoencoder"}:
+            msg = f"model_name must be 'ImputeVAE' or 'ImputeAutoencoder', but got: {self.model_name}."
             self.logger.error(msg)
             raise ValueError(msg)
-        if self.model_name != "ImputeUBP":
-            fig, ax = plt.subplots(1, 1, figsize=(12, 8))
-            df = pd.DataFrame(history)
-            df = df.iloc[1:]
-            # Plot train accuracy
-            ax.plot(df["Train"], c="blue", lw=3)
+        if not history:
+            msg = "history object passed to plot_history is empty."
+            self.logger.error(msg)
+            raise ValueError(msg)
-            ax.set_title(f"{self.model_name} Loss per Epoch")
-            ax.set_ylabel("Loss")
-            ax.set_xlabel("Epoch")
-            ax.legend(["Train"], loc="best", shadow=True, fancybox=True)
+        if (
+            not isinstance(history, dict)
+            or "Train" not in history
+            or "Val" not in history
+        ):
+            msg = "history must be of type dict and contain 'Train' and 'Val' keys."
+            self.logger.error(msg)
+            raise TypeError(msg)
-        else:
-            fig, ax = plt.subplots(3, 1, figsize=(12, 8))
+        fig, ax = plt.subplots(1, 1, figsize=(12, 8))
-            # Ensure history is the nested dictionary type for ImputeUBP
-            if not (
-                isinstance(history, dict)
-                and "Train" in history
-                and isinstance(history["Train"], dict)
-            ):
-                msg = "For ImputeUBP, history must be a nested dictionary with phases."
-                self.logger.error(msg)
-                raise TypeError(msg)
+        train = self._series_from_history(history["Train"]).iloc[1:]
+        val = self._series_from_history(history["Val"]).iloc[1:]
-            for i, phase in enumerate(range(1, 4)):
-                train = pd.Series(history["Train"][f"Phase {phase}"])
-                train = train.iloc[1:]  # ignore first epoch
+        ax.plot(train.index, train.to_numpy(), c="blue", lw=3, linestyle="-")
+        ax.plot(val.index, val.to_numpy(), c="orange", lw=3, linestyle="-")
-                # Plot train accuracy
-                ax[i].plot(train, c="blue", lw=3)
-                ax[i].set_title(f"{self.model_name}: Phase {phase} Loss per Epoch")
-                ax[i].set_ylabel("Loss")
-                ax[i].set_xlabel("Epoch")
-                ax[i].legend([f"Phase {phase}"], loc="best", shadow=True, fancybox=True)
+        ax.set_title(f"{self.model_name} Loss per Epoch")
+        ax.set_ylabel("Loss")
+        ax.set_xlabel("Epoch")
+        ax.legend(["Train", "Validation"], loc="best", shadow=True, fancybox=True)
         fn = f"{self.model_name.lower()}_history_plot.{self.plot_format}"
         fn = self.output_dir / fn
         fig.savefig(fn)
         if self.show_plots:
-            plt.show()
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore", UserWarning)
+                plt.show()
         plt.close(fig)
         # ---- MultiQC: training-loss vs epoch linegraphs -------------------
@@ -606,7 +609,7 @@ class Plotting:
         panel_suffix = f"{prefix}_" if prefix else ""
         panel_id = f"{self.model_name.lower()}_{panel_suffix}confusion_matrix"
-        if prefix != "":
+        if prefix != "" and not prefix.endswith("_"):
             prefix = f"{prefix}_"
         out_name = (
@@ -614,7 +617,9 @@ class Plotting:
         )
         fig.savefig(self.output_dir / out_name, bbox_inches="tight")
         if self.show_plots:
-            plt.show()
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore", UserWarning)
+                plt.show()
         plt.close(fig)
         # ---- MultiQC: confusion-matrix heatmap ----------------------------
@@ -715,7 +720,9 @@ class Plotting:
         fig.savefig(fn, dpi=300)
         if self.show_plots:
-            plt.show()
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore", UserWarning)
+                plt.show()
         plt.close(fig)
         # ---- MultiQC: genotype-distribution barplot -----------------------
@@ -763,19 +770,19 @@ class Plotting:
         if df_trials.empty or "value" not in df_trials:
             return
-        data: Dict[str, Dict[int, int]] = {
+        history_data: Dict[str, Dict[int, float]] = {
             model_name: {
-                row["number"]: row["value"]
+                int(row["number"]): float(row["value"])
                 for _, row in df_trials.iterrows()
                 if row["value"] is not None
             }
         }
-        if not data[model_name]:
+        if not history_data[model_name]:
             return
         SNPioMultiQC.queue_linegraph(
-            data=data,
+            data=cast(Dict[str, Dict[int, int]], history_data),
             panel_id=f"{self.model_name}_optuna_history",
             section=self.multiqc_section,
             title=f"{self.model_name} Optuna Optimization History",
@@ -792,8 +799,16 @@ class Plotting:
             return
         if best_params:
-            series = pd.Series(best_params, name="Best Value")
-            series["objective"] = best_value
+            # Build a single dict so static type checkers don't infer a
+            # mismatched dtype for the Series and complain about assigning
+            # a float value after creation.
+            best_param_data: Dict[str, float | int | str] = {
+                **{str(k): cast(float | int | str, v) for k, v in best_params.items()},
+                "objective": float(best_value),
+            }
+            series = pd.Series(best_param_data, name="Best Value")
             SNPioMultiQC.queue_table(
                 df=series,
                 panel_id=f"{self.model_name}_optuna_best_params",
@@ -992,7 +1007,7 @@ class Plotting:
     def _queue_multiqc_history(
         self,
         *,
-        history: Dict[str, List[float] | Dict[str, List[float]] | None] | None,
+        history: Mapping[str, List[float] | Dict[str, List[float]] | None] | None,
     ) -> None:
         """Queue training history (loss vs epoch) for MultiQC.

pgsui/utils/pretty_metrics.py CHANGED

@@ -1,12 +1,10 @@
 from __future__ import annotations
+import json
 import math
 from typing import Any, Iterable, List, Mapping, Optional, Sequence, Tuple
-try:
-    import numpy as np
-except Exception:
-    np = None  # type: ignore
+import numpy as np
 # Optional Rich console; falls back to ASCII if not installed.
 try:
@@ -152,8 +150,6 @@ class PrettyMetrics:
         Returns:
             str: Compact JSON representation, suitable for logging artifacts.
         """
-        import json
         return json.dumps(self.metrics, separators=(",", ":"), ensure_ascii=False)
     # ----------------------- Internal helpers -----------------------------

pgsui/utils/scorers.py CHANGED

@@ -5,6 +5,8 @@ from sklearn.metrics import (
     accuracy_score,
     average_precision_score,
     f1_score,
+    jaccard_score,
+    matthews_corrcoef,
     precision_score,
     recall_score,
     roc_auc_score,
@@ -164,6 +166,8 @@ class Scorer:
             "f1",
             "precision",
             "recall",
+            "mcc",
+            "jaccard",
         ] = "pr_macro",
     ) -> Dict[str, float] | None:
         """Evaluate the model using various metrics.
@@ -228,6 +232,10 @@ class Scorer:
                 metrics = {"precision": self.precision(y_true, y_pred)}
             elif tune_metric == "recall":
                 metrics = {"recall": self.recall(y_true, y_pred)}
+            elif tune_metric == "jaccard":
+                metrics = {"jaccard": self.jaccard(y_true, y_pred)}
+            elif tune_metric == "mcc":
+                metrics = {"mcc": self.mcc(y_true, y_pred)}
             else:
                 msg = f"Invalid tune_metric provided: '{tune_metric}'."
                 self.logger.error(msg)
@@ -241,10 +249,41 @@ class Scorer:
                 "roc_auc": self.roc_auc(y_true, y_pred_proba),
                 "average_precision": self.average_precision(y_true, y_pred_proba),
                 "pr_macro": self.pr_macro(y_true_ohe, y_pred_proba),
+                "jaccard": self.jaccard(np.asarray(y_true), np.asarray(y_pred)),
+                "mcc": self.mcc(np.asarray(y_true), np.asarray(y_pred)),
             }
         return {k: float(v) for k, v in metrics.items()}
+    def jaccard(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
+        """Compute the Jaccard similarity coefficient.
+        The Jaccard similarity coefficient, also known as Intersection over Union (IoU), measures the similarity between two sets. It is defined as the size of the intersection divided by the size of the union of the sample sets.
+        Args:
+            y_true (np.ndarray): Ground truth (correct) target values.
+            y_pred (np.ndarray): Predicted target values.
+        Returns:
+            float: Jaccard similarity coefficient.
+        """
+        avg: str = self.average
+        return float(jaccard_score(y_true, y_pred, average=avg, zero_division=0))
+    def mcc(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
+        """Compute the Matthews correlation coefficient (MCC).
+        MCC is a balanced measure that can be used even if the classes are of very different sizes. It returns a value between -1 and +1, where +1 indicates a perfect prediction, 0 indicates no better than random prediction, and -1 indicates total disagreement between prediction and observation.
+        Args:
+            y_true (np.ndarray): Ground truth (correct) target values.
+            y_pred (np.ndarray): Predicted target values.
+        Returns:
+            float: Matthews correlation coefficient.
+        """
+        return float(matthews_corrcoef(y_true, y_pred))
     def average_precision(self, y_true: np.ndarray, y_pred_proba: np.ndarray) -> float:
         """Average precision with safe multiclass handling.

pg-sui: pg_sui-1.6.16a3-py3-none-any.whl → pg_sui-1.7.0-py3-none-any.whl

pg-sui: pg_sui-1.6.16a3-py3-none-any.whl → pg_sui-1.7.0-py3-none-any.whl