PyPI - pg-sui - Versions diffs - 0.2.0__py3-none-any.whl → 1.6.14.dev9__py3-none-any.whl - Mend

pg-sui 0.2.0py3-none-any.whl → 1.6.14.dev9py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (127) hide show

{pg_sui-0.2.0.dist-info → pg_sui-1.6.14.dev9.dist-info}/METADATA +101 -79
pg_sui-1.6.14.dev9.dist-info/RECORD +81 -0
{pg_sui-0.2.0.dist-info → pg_sui-1.6.14.dev9.dist-info}/WHEEL +1 -1
pg_sui-1.6.14.dev9.dist-info/entry_points.txt +4 -0
{pg_sui-0.2.0.dist-info → pg_sui-1.6.14.dev9.dist-info/licenses}/LICENSE +0 -0
pg_sui-1.6.14.dev9.dist-info/top_level.txt +1 -0
pgsui/__init__.py +35 -54
pgsui/_version.py +34 -0
pgsui/cli.py +909 -0
pgsui/data_processing/__init__.py +0 -0
pgsui/data_processing/config.py +565 -0
pgsui/data_processing/containers.py +1424 -0
pgsui/data_processing/transformers.py +557 -907
pgsui/{example_data/trees → electron/app}/__init__.py +0 -0
pgsui/electron/app/__main__.py +5 -0
pgsui/electron/app/extra-resources/.gitkeep +1 -0
pgsui/electron/app/icons/icons/1024x1024.png +0 -0
pgsui/electron/app/icons/icons/128x128.png +0 -0
pgsui/electron/app/icons/icons/16x16.png +0 -0
pgsui/electron/app/icons/icons/24x24.png +0 -0
pgsui/electron/app/icons/icons/256x256.png +0 -0
pgsui/electron/app/icons/icons/32x32.png +0 -0
pgsui/electron/app/icons/icons/48x48.png +0 -0
pgsui/electron/app/icons/icons/512x512.png +0 -0
pgsui/electron/app/icons/icons/64x64.png +0 -0
pgsui/electron/app/icons/icons/icon.icns +0 -0
pgsui/electron/app/icons/icons/icon.ico +0 -0
pgsui/electron/app/main.js +227 -0
pgsui/electron/app/package-lock.json +6894 -0
pgsui/electron/app/package.json +51 -0
pgsui/electron/app/preload.js +15 -0
pgsui/electron/app/server.py +157 -0
pgsui/electron/app/ui/logo.png +0 -0
pgsui/electron/app/ui/renderer.js +131 -0
pgsui/electron/app/ui/styles.css +59 -0
pgsui/electron/app/ui/ui_shim.js +72 -0
pgsui/electron/bootstrap.py +43 -0
pgsui/electron/launch.py +57 -0
pgsui/electron/package.json +14 -0
pgsui/example_data/__init__.py +0 -0
pgsui/example_data/phylip_files/__init__.py +0 -0
pgsui/example_data/phylip_files/test.phy +0 -0
pgsui/example_data/popmaps/__init__.py +0 -0
pgsui/example_data/popmaps/{test.popmap → phylogen_nomx.popmap} +185 -99
pgsui/example_data/structure_files/__init__.py +0 -0
pgsui/example_data/structure_files/test.pops.2row.allsites.str +0 -0
pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz +0 -0
pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz.tbi +0 -0
pgsui/impute/__init__.py +0 -0
pgsui/impute/deterministic/imputers/allele_freq.py +725 -0
pgsui/impute/deterministic/imputers/mode.py +844 -0
pgsui/impute/deterministic/imputers/nmf.py +221 -0
pgsui/impute/deterministic/imputers/phylo.py +973 -0
pgsui/impute/deterministic/imputers/ref_allele.py +669 -0
pgsui/impute/supervised/__init__.py +0 -0
pgsui/impute/supervised/base.py +343 -0
pgsui/impute/{unsupervised/models/in_development → supervised/imputers}/__init__.py +0 -0
pgsui/impute/supervised/imputers/hist_gradient_boosting.py +317 -0
pgsui/impute/supervised/imputers/random_forest.py +291 -0
pgsui/impute/unsupervised/__init__.py +0 -0
pgsui/impute/unsupervised/base.py +1118 -0
pgsui/impute/unsupervised/callbacks.py +92 -262
{simulation → pgsui/impute/unsupervised/imputers}/__init__.py +0 -0
pgsui/impute/unsupervised/imputers/autoencoder.py +1285 -0
pgsui/impute/unsupervised/imputers/nlpca.py +1554 -0
pgsui/impute/unsupervised/imputers/ubp.py +1575 -0
pgsui/impute/unsupervised/imputers/vae.py +1228 -0
pgsui/impute/unsupervised/loss_functions.py +261 -0
pgsui/impute/unsupervised/models/__init__.py +0 -0
pgsui/impute/unsupervised/models/autoencoder_model.py +215 -567
pgsui/impute/unsupervised/models/nlpca_model.py +155 -394
pgsui/impute/unsupervised/models/ubp_model.py +180 -1106
pgsui/impute/unsupervised/models/vae_model.py +269 -630
pgsui/impute/unsupervised/nn_scorers.py +255 -0
pgsui/utils/__init__.py +0 -0
pgsui/utils/classification_viz.py +608 -0
pgsui/utils/logging_utils.py +22 -0
pgsui/utils/misc.py +35 -480
pgsui/utils/plotting.py +996 -829
pgsui/utils/pretty_metrics.py +290 -0
pgsui/utils/scorers.py +213 -666
pg_sui-0.2.0.dist-info/RECORD +0 -75
pg_sui-0.2.0.dist-info/top_level.txt +0 -3
pgsui/example_data/phylip_files/test_n10.phy +0 -118
pgsui/example_data/phylip_files/test_n100.phy +0 -118
pgsui/example_data/phylip_files/test_n2.phy +0 -118
pgsui/example_data/phylip_files/test_n500.phy +0 -118
pgsui/example_data/structure_files/test.nopops.1row.10sites.str +0 -117
pgsui/example_data/structure_files/test.nopops.2row.100sites.str +0 -234
pgsui/example_data/structure_files/test.nopops.2row.10sites.str +0 -234
pgsui/example_data/structure_files/test.nopops.2row.30sites.str +0 -234
pgsui/example_data/structure_files/test.nopops.2row.allsites.str +0 -234
pgsui/example_data/structure_files/test.pops.1row.10sites.str +0 -117
pgsui/example_data/structure_files/test.pops.2row.10sites.str +0 -234
pgsui/example_data/trees/test.iqtree +0 -376
pgsui/example_data/trees/test.qmat +0 -5
pgsui/example_data/trees/test.rate +0 -2033
pgsui/example_data/trees/test.tre +0 -1
pgsui/example_data/trees/test_n10.rate +0 -19
pgsui/example_data/trees/test_n100.rate +0 -109
pgsui/example_data/trees/test_n500.rate +0 -509
pgsui/example_data/trees/test_siterates.txt +0 -2024
pgsui/example_data/trees/test_siterates_n10.txt +0 -10
pgsui/example_data/trees/test_siterates_n100.txt +0 -100
pgsui/example_data/trees/test_siterates_n500.txt +0 -500
pgsui/example_data/vcf_files/test.vcf +0 -244
pgsui/example_data/vcf_files/test.vcf.gz +0 -0
pgsui/example_data/vcf_files/test.vcf.gz.tbi +0 -0
pgsui/impute/estimators.py +0 -1268
pgsui/impute/impute.py +0 -1463
pgsui/impute/simple_imputers.py +0 -1431
pgsui/impute/supervised/iterative_imputer_fixedparams.py +0 -782
pgsui/impute/supervised/iterative_imputer_gridsearch.py +0 -1024
pgsui/impute/unsupervised/keras_classifiers.py +0 -697
pgsui/impute/unsupervised/models/in_development/cnn_model.py +0 -486
pgsui/impute/unsupervised/neural_network_imputers.py +0 -1440
pgsui/impute/unsupervised/neural_network_methods.py +0 -1395
pgsui/pg_sui.py +0 -261
pgsui/utils/sequence_tools.py +0 -407
simulation/sim_benchmarks.py +0 -333
simulation/sim_treeparams.py +0 -475
test/__init__.py +0 -0
test/pg_sui_simtest.py +0 -215
test/pg_sui_testing.py +0 -523
test/test.py +0 -151
test/test_pgsui.py +0 -374
test/test_tkc.py +0 -185

pgsui/data_processing/transformers.py CHANGED Viewed

@@ -1,694 +1,223 @@
+# Standard library imports
 import copy
-import os
 import logging
-import sys
-import warnings
-import numpy as np
-import pandas as pd
+from pathlib import Path
+from typing import TYPE_CHECKING, Literal, Optional, Tuple
 # Third-party imports
 import numpy as np
-import pandas as pd
 from sklearn.base import BaseEstimator, TransformerMixin
-from sklearn.impute import SimpleImputer
-from sklearn.metrics import (
-    roc_auc_score,
-    precision_recall_fscore_support,
-    average_precision_score,
-)
-from sklearn.preprocessing import label_binarize
-# Import tensorflow with reduced warnings.
-os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
-logging.getLogger("tensorflow").disabled = True
-warnings.filterwarnings("ignore", category=UserWarning)
-# noinspection PyPackageRequirements
-import tensorflow as tf
-# Disable can't find cuda .dll errors. Also turns of GPU support.
-tf.config.set_visible_devices([], "GPU")
-from tensorflow.python.util import deprecation
-# Disable warnings and info logs.
-tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
-tf.get_logger().setLevel(logging.ERROR)
-# Monkey patching deprecation utils to supress warnings.
-# noinspection PyUnusedLocal
-def deprecated(
-    date, instructions, warn_once=True
-):  # pylint: disable=unused-argument
-    def deprecated_wrapper(func):
-        return func
-    return deprecated_wrapper
-deprecation.deprecated = deprecated
-# Custom Modules
-try:
-    from ..utils import misc
-except (ModuleNotFoundError, ValueError, ImportError):
-    from pgsui.utils import misc
-# Pandas on pip gives a performance warning when doing the below code.
-# Apparently it's a bug that exists in the pandas version I used here.
-# It can be safely ignored.
-warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
-def encode_onehot(X):
-    """Convert 012-encoded data to one-hot encodings.
-    Args:
-        X (numpy.ndarray): Input array with 012-encoded data and -9 as the missing data value.
-    Returns:
-        pandas.DataFrame: One-hot encoded data, ignoring missing values (np.nan).
-    """
-    Xt = np.zeros(shape=(X.shape[0], X.shape[1], 3))
-    mappings = {
-        0: np.array([1, 0, 0]),
-        1: np.array([0, 1, 0]),
-        2: np.array([0, 0, 1]),
-        -9: np.array([np.nan, np.nan, np.nan]),
-    }
-    for row in np.arange(X.shape[0]):
-        Xt[row] = [mappings[enc] for enc in X[row]]
-    return Xt
+from pgsui.utils.misc import validate_input_type
-def mle(row):
-    """Get the Maximum Likelihood Estimation for the best prediction. Basically, it sets the index of the maxiumum value in a vector (row) to 1.0, since it is one-hot encoded.
+if TYPE_CHECKING:
+    from snpio import TreeParser
-    Args:
-        row (numpy.ndarray(float)): Row vector with predicted values as floating points.
-    Returns:
-        numpy.ndarray(float): Row vector with the highest prediction set to 1.0 and the others set to 0.0.
-    """
-    res = np.zeros(row.shape[0])
-    res[np.argmax(row)] = 1
-    return res
+class SimGenotypeDataTransformer:
+    """Simulates missing genotypes at the locus level on a 2D integer matrix.
-class UBPInputTransformer(BaseEstimator, TransformerMixin):
-    """Transform input X prior to estimator fitting.
+    This transformer masks a proportion of known genotypes in the input matrix X, setting them to a specified missing value. The masking can be done randomly or based on inverse genotype frequencies, with an option to boost the likelihood of masking heterozygous genotypes.
     Args:
-        n_components (int): Number of principal components currently being used in V.
-        V (numpy.ndarray or Dict[str, Any]): If doing grid search, should be a dictionary with current_component: numpy.ndarray. If not doing grid search, then it should be a numpy.ndarray.
+        prop_missing (float): Proportion of *known* loci to mask (0..1).
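+        strategy (Literal["random", "random_inv_genotype"]): Masking strategy. "random" masks known loci uniformly at random; "random_inv_genotype" weights masking by inverse genotype frequency.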
+        missing_val (int): Missing code value (default: -1).
+        seed (int | None): RNG seed.
+        logger (logging.Logger | None): Logger for messages.
+        het_boost (float): Multiplier for heterozygotes in inv-genotype mode.
     """
-    def __init__(self, n_components, V):
-        self.n_components = n_components
-        self.V = V
-    def fit(self, X):
-        """Fit transformer to input data X.
-        Args:
-            X (numpy.ndarray): Input data to fit. If numpy.ndarray, then should be of shape (n_samples, n_components). If dictionary, then should be component: numpy.ndarray.
-        Returns:
-            self: Class instance.
-        """
-        self.n_features_in_ = self.n_components
-        return self
-    def transform(self, X):
-        """Transform input data X to the needed format.
-        Args:
-            X (numpy.ndarray): Input data to fit. If numpy.ndarray, then should be of shape (n_samples, n_components). If dictionary, then should be component: numpy.ndarray.
-        Returns:
-            numpy.ndarray: Formatted input data with correct component.
-        Raises:
-            TypeError: V must be a dictionary if phase is None or phase == 1.
-            TypeError: V must be a numpy array if phase is 2 or 3.
-        """
-        if not isinstance(self.V, dict):
-            raise TypeError(f"V must be a dictionary, but got {type(self.V)}")
-        return self.V[self.n_components]
-class AutoEncoderFeatureTransformer(BaseEstimator, TransformerMixin):
-    """Transformer to format autoencoder features and targets before model fitting.
-    The input data, X, is encoded to one-hot format, and then missing values are filled to [-1] * num_classes.
-    Missing and observed boolean masks are also generated.
-    Args:
-        num_classes (int, optional): The number of classes in the last axis dimention of the input array. Defaults to 3.
-        return_int (bool, optional): Whether to return an integer-encoded array (If True) or a one-hot or multi-label encoded array (If False.). Defaults to False.
-        activate (str or None, optional): If not None, then does the appropriate activation. Multilabel learning uses sigmoid activation, and multiclass uses softmax. If set to None, then the function assumes that the input has already been activated. Possible values include: {None, 'sigmoid', 'softmax'}. Defaults to None.
-    """
-    def __init__(self, num_classes=3, return_int=False, activate=None):
-        self.num_classes = num_classes
-        self.return_int = return_int
-        self.activate = activate
-    def fit(self, X, y=None):
-        """set attributes used to transform X (input features).
-        Args:
-            X (numpy.ndarray): Input integer-encoded numpy array.
-            y (None): Just for compatibility with sklearn API.
-        """
-        X = misc.validate_input_type(X, return_type="array")
-        self.X_decoded = X
-        # VAE uses 4 classes ([A,T,G,C]), SAE uses 3 ([0,1,2]).
-        if self.num_classes == 3:
-            enc_func = self.encode_012
-        elif self.num_classes == 4:
-            enc_func = self.encode_multilab
-        elif self.num_classes == 10:
-            enc_func = self.encode_multiclass
-        else:
-            raise ValueError(
-                f"Invalid value passed to num_classes in "
-                f"AutoEncoderFeatureTransformer. Only 3 or 4 are supported, "
-                f"but got {self.num_classes}."
-            )
-        # Encode the data.
-        self.X_train = enc_func(X)
-        self.classes_ = np.arange(self.num_classes)
-        self.n_classes_ = self.num_classes
-        # Get missing and observed data boolean masks.
-        self.missing_mask_, self.observed_mask_ = self._get_masks(self.X_train)
-        # To accomodate multiclass-multioutput.
-        self.n_outputs_expected_ = 1
-        self.n_outputs_ = self.X_train.shape[1]
-        return self
-    def transform(self, X):
-        """Transform X to one-hot encoded format.
-        Accomodates multiclass targets with a 3D shape.
-        Args:
-            X (numpy.ndarray): One-hot encoded target data of shape (n_samples, n_features, num_classes).
-        Returns:
-            numpy.ndarray: Transformed target data in one-hot format of shape (n_samples, n_features, num_classes).
-        """
-        if self.return_int:
-            return X
-        else:
-            # X = misc.validate_input_type(X, return_type="array")
-            return self._fill(self.X_train, self.missing_mask_)
-    def inverse_transform(self, y, return_proba=False):
-        """Transform target to output format.
-        Args:
-            y (numpy.ndarray): Array to inverse transform.
-            return_proba (bool): Just for compatibility with scikeras API.
-        """
-        try:
-            if self.activate is None:
-                y = y.numpy()
-            elif self.activate == "softmax":
-                y = tf.nn.softmax(y).numpy()
-            elif self.activate == "sigmoid":
-                y = tf.nn.sigmoid(y).numpy()
-            else:
-                raise ValueError(
-                    f"Invalid value passed to keyword argument activate. Valid "
-                    f"options include: None, 'softmax', or 'sigmoid', but got "
-                    f"{self.activate}"
-                )
-        except AttributeError:
-            # If numpy array already.
-            if self.activate is None:
-                y = y.copy()
-            elif self.activate == "softmax":
-                y = tf.nn.softmax(tf.convert_to_tensor(y)).numpy()
-            elif self.activate == "sigmoid":
-                y = tf.nn.sigmoid(tf.convert_to_tensor(y)).numpy()
-            else:
-                raise ValueError(
-                    f"Invalid value passed to keyword argument activate. Valid "
-                    f"options include: None, 'softmax', or 'sigmoid', but got "
-                    f"{self.activate}"
-                )
-        return y
-    def encode_012(self, X):
-        """Convert 012-encoded data to one-hot encodings.
-        Args:
-            X (numpy.ndarray): Input array with 012-encoded data and -9 as the missing data value.
-        Returns:
-            pandas.DataFrame: One-hot encoded data, ignoring missing values (np.nan).
-        """
-        Xt = np.zeros(shape=(X.shape[0], X.shape[1], 3))
-        mappings = {
-            0: np.array([1, 0, 0]),
-            1: np.array([0, 1, 0]),
-            2: np.array([0, 0, 1]),
-            -9: np.array([np.nan, np.nan, np.nan]),
-        }
-        for row in np.arange(X.shape[0]):
-            Xt[row] = [mappings[enc] for enc in X[row]]
-        return Xt
-    def encode_multilab(self, X, multilab_value=1.0):
-        """Encode 0-9 integer data in multi-label one-hot format.
-        Args:
-            X (numpy.ndarray): Input array with 012-encoded data and -9 as the missing data value.
-            multilab_value (float): Value to use for multilabel target encodings. Defaults to 0.5.
-        Returns:
-            pandas.DataFrame: One-hot encoded data, ignoring missing values (np.nan). multi-label categories will be encoded as 0.5. Otherwise, it will be 1.0.
-        """
-        Xt = np.zeros(shape=(X.shape[0], X.shape[1], 4))
-        mappings = {
-            0: [1.0, 0.0, 0.0, 0.0],
-            1: [0.0, 1.0, 0.0, 0.0],
-            2: [0.0, 0.0, 1.0, 0.0],
-            3: [0.0, 0.0, 0.0, 1.0],
-            4: [multilab_value, multilab_value, 0.0, 0.0],
-            5: [multilab_value, 0.0, multilab_value, 0.0],
-            6: [multilab_value, 0.0, 0.0, multilab_value],
-            7: [0.0, multilab_value, multilab_value, 0.0],
-            8: [0.0, multilab_value, 0.0, multilab_value],
-            9: [0.0, 0.0, multilab_value, multilab_value],
-            -9: [np.nan, np.nan, np.nan, np.nan],
-        }
-        for row in np.arange(X.shape[0]):
-            Xt[row] = [mappings[enc] for enc in X[row]]
-        return Xt
-    def decode_multilab(self, X, multilab_value=1.0):
-        """Decode one-hot format data back to 0-9 integer data.
-        Args:
-            X (numpy.ndarray): Input array with one-hot-encoded data.
-            multilab_value (float): Value to use for multilabel target encodings. Defaults to 0.5.
-        Returns:
-            pandas.DataFrame: Decoded data, with multi-label categories decoded to their original integer representation.
-        """
-        Xt = np.zeros(shape=(X.shape[0], X.shape[1]))
-        mappings = {
-            tuple([1.0, 0.0, 0.0, 0.0]): 0,
-            tuple([0.0, 1.0, 0.0, 0.0]): 1,
-            tuple([0.0, 0.0, 1.0, 0.0]): 2,
-            tuple([0.0, 0.0, 0.0, 1.0]): 3,
-            tuple([multilab_value, multilab_value, 0.0, 0.0]): 4,
-            tuple([multilab_value, 0.0, multilab_value, 0.0]): 5,
-            tuple([multilab_value, 0.0, 0.0, multilab_value]): 6,
-            tuple([0.0, multilab_value, multilab_value, 0.0]): 7,
-            tuple([0.0, multilab_value, 0.0, multilab_value]): 8,
-            tuple([0.0, 0.0, multilab_value, multilab_value]): 9,
-            tuple([np.nan, np.nan, np.nan, np.nan]): -9,
-        }
-        for row in np.arange(X.shape[0]):
-            Xt[row] = [mappings[tuple(enc)] for enc in X[row]]
-        return Xt
-    def encode_multiclass(self, X, num_classes=10, missing_value=-9):
-        """Encode 0-9 integer data in multi-class one-hot format.
-        Missing values get encoded as ``[np.nan] * num_classes``
-        Args:
-            X (numpy.ndarray): Input array with 012-encoded data and ``missing_value`` as the missing data value.
-            num_classes (int, optional): Number of classes to use. Defaults to 10.
-            missing_value (int, optional): Missing data value to replace with ``[np.nan] * num_classes``\. Defaults to -9.
-        Returns:
-            pandas.DataFrame: Multi-class one-hot encoded data, ignoring missing values (np.nan).
-        """
-        int_cats, ohe_arr = np.arange(num_classes), np.eye(num_classes)
-        mappings = dict(zip(int_cats, ohe_arr))
-        mappings[missing_value] = np.array([np.nan] * num_classes)
-        Xt = np.zeros(shape=(X.shape[0], X.shape[1], num_classes))
-        for row in np.arange(X.shape[0]):
-            Xt[row] = [mappings[enc] for enc in X[row]]
-        return Xt
-    def _fill(self, data, missing_mask, missing_value=-1):
-        """Mask missing data as ``missing_value``\.
-        Args:
-            data (numpy.ndarray): Input with missing values of shape (n_samples, n_features, num_classes).
-            missing_mask (np.ndarray(bool)): Missing data mask with True corresponding to a missing value.
-            missing_value (int): Value to set missing data to. If a list is provided, then its length should equal the number of one-hot classes.
-        """
-        if self.num_classes > 1:
-            missing_value = [missing_value] * self.num_classes
-        data[missing_mask] = missing_value
-        return data
-    def _get_masks(self, X):
-        """Format the provided target data for use with UBP/NLPCA.
-        Args:
-            y (numpy.ndarray(float)): Input data that will be used as the target of shape (n_samples, n_features, num_classes).
-        Returns:
-            numpy.ndarray(float): Missing data mask, with missing values encoded as 1's and non-missing as 0's.
-            numpy.ndarray(float): Observed data mask, with non-missing values encoded as 1's and missing values as 0's.
-        """
-        missing_mask = self._create_missing_mask(X)
-        observed_mask = ~missing_mask
-        return missing_mask, observed_mask
-    def _create_missing_mask(self, data):
-        """Creates a missing data mask with boolean values.
-        Args:
-            data (numpy.ndarray): Data to generate missing mask from, of shape (n_samples, n_features, n_classes).
-        Returns:
-            numpy.ndarray(bool): Boolean mask of missing values of shape (n_samples, n_features), with True corresponding to a missing data point.
-        """
-        return np.isnan(data).all(axis=2)
-class MLPTargetTransformer(BaseEstimator, TransformerMixin):
-    """Transformer to format UBP / NLPCA target data both before and after model fitting."""
+    def __init__(
+        self,
+        *,
+        prop_missing: float = 0.1,
+        strategy: Literal["random", "random_inv_genotype"] = "random",
+        missing_val: int = -1,
+        seed: int | None = None,
+        logger: logging.Logger | None = None,
+        het_boost: float = 1.0,
+    ):
+        self.prop_missing = float(prop_missing)
+        self.strategy = strategy
+        self.missing_val = int(missing_val)
+        self.seed = seed
+        self.rng = np.random.default_rng(seed)
+        self.het_boost = float(het_boost)
+        self.logger = logger or logging.getLogger(__name__)
-    def fit(self, y):
-        """Fit 012-encoded target data.
+    def fit(self, X, y=None) -> "SimGenotypeDataTransformer":
+        """No-op fit; this transformer is stateless and returns itself.
         Args:
-            y (numpy.ndarray): Target data that is 012-encoded.
-        Returns:
-            self: Class instance.
+            X (np.ndarray): (n_samples, n_features), integer codes {0..9} or <0 as missing.
+            y: Ignored.
         """
-        y = misc.validate_input_type(y, return_type="array")
-        # Original 012-encoded y
-        self.y_decoded_ = y
-        y_train = encode_onehot(y)
-        # Get missing and observed data boolean masks.
-        self.missing_mask_, self.observed_mask_ = self._get_masks(y_train)
-        # To accomodate multiclass-multioutput.
-        self.n_outputs_expected_ = 1
         return self
-    def transform(self, y):
-        """Transform y_true to one-hot encoded.
-        Accomodates multiclass-multioutput targets.
-        Args:
-            y (numpy.ndarray): One-hot encoded target data.
-        Returns:
-            numpy.ndarray: y_true target data.
-        """
-        y = misc.validate_input_type(y, return_type="array")
-        y_train = encode_onehot(y)
-        return self._fill(y_train, self.missing_mask_)
-    def inverse_transform(self, y):
-        """Decode y_pred from one-hot to 012-based encoding.
-        This allows sklearn.metrics to be used.
+    def transform(self, X: np.ndarray) -> tuple[np.ndarray, dict]:
+        """Apply missing-data simulation on a 2D genotype matrix.
         Args:
-            y (numpy.ndarray): One-hot encoded predicted probabilities after model fitting.
+            X (np.ndarray): (n_samples, n_features), integer codes {0..9} or <0 as missing.
         Returns:
-            numpy.ndarray: y predictions in same format as y_true.
+            tuple[np.ndarray, dict]: (X_masked, masks), where masks holds three boolean 2D arrays keyed by 'original' (entries already missing in X), 'simulated' (loci masked by this call), and 'all' (the union of the two).
         """
-        # VAE has tuple output
-        if isinstance(y, tuple):
-            y = y[0]
+        if X.ndim != 2:
+            msg = f"X must be 2D, got shape {X.shape}"
+            self.logger.error(msg)
+            raise ValueError(msg)
-        # Return predictions.
-        return tf.nn.softmax(y).numpy()
+        X = np.asarray(X)
+        original_mask = X < 0
-    def _fill(self, data, missing_mask, missing_value=-1, num_classes=3):
-        """Mask missing data as ``missing_value``\.
+        sim_mask = self._simulate_missing_mask(X, original_mask)
+        sim_mask = sim_mask & (~original_mask)
+        sim_mask = self._validate_mask(sim_mask)
-        Args:
-            data (numpy.ndarray): Input with missing values of shape (n_samples, n_features, num_classes).
-            missing_mask (np.ndarray(bool)): Missing data mask with True corresponding to a missing value.
-            missing_value (int): Value to set missing data to. If a list is provided, then its length should equal the number of one-hot classes. Defaults to -1.
-            num_classes (int): Number of classes in dataset. Defaults to 3.
-        """
-        if num_classes > 1:
-            missing_value = [missing_value] * num_classes
-        data[missing_mask] = missing_value
-        return data
-    def _get_masks(self, X):
-        """Format the provided target data for use with UBP/NLPCA.
-        Args:
-            X (numpy.ndarray(float)): Input data that will be used as the target.
-        Returns:
-            numpy.ndarray(float): Missing data mask, with missing values encoded as 1's and non-missing as 0's.
-            numpy.ndarray(float): Observed data mask, with non-missing values encoded as 1's and missing values as 0's.
-        """
-        missing_mask = self._create_missing_mask(X)
-        observed_mask = ~missing_mask
-        return missing_mask, observed_mask
-    def _create_missing_mask(self, data):
-        """Creates a missing data mask with boolean values.
-        Args:
-            data (numpy.ndarray): Data to generate missing mask from, of shape (n_samples, n_features, n_classes).
-        Returns:
-            numpy.ndarray(bool): Boolean mask of missing values of shape (n_samples, n_features), with True corresponding to a missing data point.
-        """
-        return np.isnan(data).all(axis=2)
-    def _decode(self, y):
-        """Evaluate UBP / NLPCA predictions by calculating the highest predicted value.
-        Calucalates highest predicted value for each row vector and each class, setting the most likely class to 1.0.
-        Args:
-            y (numpy.ndarray): Input one-hot encoded data.
+        all_mask = original_mask | sim_mask
+        Xt = X.copy()
+        Xt[all_mask] = self.missing_val
-        Returns:
-            numpy.ndarray: Imputed one-hot encoded values.
-        """
-        Xprob = y
-        Xt = np.apply_along_axis(mle, axis=2, arr=Xprob)
-        Xpred = np.argmax(Xt, axis=2)
-        Xtrue = np.argmax(y, axis=2)
-        Xdecoded = np.zeros((Xpred.shape[0], Xpred.shape[1]))
-        for idx in np.arange(Xdecoded):
-            imputed_idx = np.where(self.observed_mask_[idx] == 0)
-            known_idx = np.nonzero(self.observed_mask_[idx])
-            Xdecoded[idx, imputed_idx] = Xpred[idx, imputed_idx]
-            Xdecoded[idx, known_idx] = Xtrue[idx, known_idx]
-        return Xdecoded.astype("int8")
-class UBPTargetTransformer(BaseEstimator, TransformerMixin):
-    """Transformer to format UBP / NLPCA target data both before model fitting.
-    Examples:
-        >>>ubp_tt = UBPTargetTransformer()
-        >>>y_train = ubp_tt.fit_transform(y)
-    """
+        masks = {"original": original_mask, "simulated": sim_mask, "all": all_mask}
+        return Xt, masks
-    def fit(self, y):
-        """Fit 012-encoded target data.
+    # ---- strategies ----
+    def _simulate_missing_mask(
+        self, X: np.ndarray, original_mask: np.ndarray
+    ) -> np.ndarray:
+        """Simulate missingness mask based on the chosen strategy.
         Args:
-            y (numpy.ndarray): Target data that is 012-encoded, of shape (n_samples, n_features).
+            X (np.ndarray): Input genotype matrix.
+            original_mask (np.ndarray): Boolean mask of original missing values.
         Returns:
-            self: Class instance.
+            np.ndarray: Simulated missing mask.
         """
-        y = misc.validate_input_type(y, return_type="array")
-        # Original 012-encoded y
-        self.y_decoded_ = y
-        # One-hot encode y.
-        y_train = encode_onehot(y)
-        # Get missing and observed data boolean masks.
-        self.missing_mask_, self.observed_mask_ = self._get_masks(y_train)
-        # To accomodate multiclass-multioutput.
-        self.n_outputs_expected_ = 1
-        return self
+        if self.strategy == "random":
+            return self._simulate_random(original_mask)
+        elif self.strategy == "random_inv_genotype":
+            return self._simulate_inv_genotype(X, original_mask)
-    def transform(self, y):
-        """Transform 012-encoded target to one-hot encoded format.
+        msg = "strategy must be one of {'random','random_inv_genotype'}"
+        self.logger.error(msg)
+        raise ValueError(msg)
-        Accomodates multiclass-multioutput targets.
+    def _simulate_random(self, original_mask: np.ndarray) -> np.ndarray:
+        rows, cols = np.where(~original_mask)
+        n_known = len(rows)
+        mask = np.zeros_like(original_mask, dtype=bool)
-        Args:
-            y (numpy.ndarray): One-hot encoded target data of shape (n_samples, n_features).
+        if n_known == 0:
+            return mask
-        Returns:
-            numpy.ndarray: y_true target data.
-        """
-        y = misc.validate_input_type(y, return_type="array")
-        y_train = encode_onehot(y)
-        return self._fill(y_train, self.missing_mask_)
+        n_to_mask = int(np.floor(self.prop_missing * n_known))
-    def inverse_transform(self, y):
-        """Decode y_predicted from one-hot to 012-integer encoding.
+        if n_to_mask <= 0:
+            return mask
-        Performs a softmax activation for multiclass classification.
+        idx = self.rng.choice(n_known, size=n_to_mask, replace=False)
+        mask[rows[idx], cols[idx]] = True
+        return mask
-        This allows sklearn.metrics to be used.
+    def _simulate_inv_genotype(
+        self, X: np.ndarray, original_mask: np.ndarray
+    ) -> np.ndarray:
+        """Simulate missingness mask inversely proportional to genotype frequencies.
         Args:
-            y (numpy.ndarray): One-hot encoded predicted probabilities after model fitting, of shape (n_samples, n_features, num_classes).
+            X (np.ndarray): Input genotype matrix.
+            original_mask (np.ndarray): Boolean mask of original missing values.
         Returns:
-            numpy.ndarray: y predictions in same format as y_true (n_samples, n_features).
+            np.ndarray: Simulated missing mask. 0..3: homozygous (0,1,2,3). 4..9: heterozygous (0/1,0/2,0/3,1/2,1/3,2/3).
         """
-        return tf.nn.softmax(y).numpy()
-    def _fill(self, data, missing_mask, missing_value=-1, num_classes=3):
-        """Mask missing data as ``missing_value``\.
+        rows, cols = np.where(~original_mask)
+        n_known = len(rows)
+        mask = np.zeros_like(original_mask, dtype=bool)
+        if n_known == 0:
+            return mask
-        Args:
-            data (numpy.ndarray): Input with missing values of shape (n_samples, n_features, num_classes).
+        # Global genotype frequencies (0..9) from all known
+        vals = X[~original_mask].astype(int)
+        vals = vals[(vals >= 0) & (vals < 10)]
-            missing_mask (np.ndarray(bool)): Missing data mask with True corresponding to a missing value, of shape (n_samples, n_features).
+        if vals.size == 0:
+            return self._simulate_random(original_mask)
-            missing_value (int, optional): Value to set missing data to. If a list is provided, then its length should equal the number of one-hot classes. Defaults to -1.
+        cnt = np.bincount(vals, minlength=10).astype(float)
+        freqs = cnt / (cnt.sum() + 1e-12)
-            num_classes (int, optional): Number of classes to use. Defaults to 3.
-        """
-        if num_classes > 1:
-            missing_value = [missing_value] * num_classes
-        data[missing_mask] = missing_value
-        return data
+        # Candidate weights
+        geno_known = X[rows, cols].astype(int)  # (n_known,)
+        inv = 1.0 / (freqs[geno_known] + 1e-12)
-    def _get_masks(self, y):
-        """Format the provided target data for use with UBP/NLPCA models.
+        # Optional het boost (heterozygous codes are 4..9)
+        if self.het_boost != 1.0:
+            is_het = (geno_known >= 4) & (geno_known <= 9)
+            inv = inv * np.where(is_het, self.het_boost, 1.0)
-        Args:
-            y (numpy.ndarray(float)): Input data that will be used as the target of shape (n_samples, n_features, num_classes).
+        n_to_mask = int(np.floor(self.prop_missing * n_known))
+        if n_to_mask <= 0:
+            return mask
-        Returns:
-            numpy.ndarray(float): Missing data mask, with missing values encoded as 1's and non-missing as 0's.
-            numpy.ndarray(float): Observed data mask, with non-missing values encoded as 1's and missing values as 0's.
-        """
-        missing_mask = self._create_missing_mask(y)
-        observed_mask = ~missing_mask
-        return missing_mask, observed_mask
-    def _create_missing_mask(self, data):
-        """Creates a missing data mask with boolean values.
-        Args:
-            data (numpy.ndarray): Data to generate missing mask from, of shape (n_samples, n_features, n_classes).
-        Returns:
-            numpy.ndarray(bool): Boolean mask of missing values of shape (n_samples, n_features), with True corresponding to a missing data point.
-        """
-        return np.isnan(data).all(axis=2)
-    def _decode(self, y):
-        """Evaluate UBP/NLPCA predictions by calculating the argmax.
+        probs = inv / (inv.sum() + 1e-12)
+        idx = self.rng.choice(n_known, size=n_to_mask, replace=False, p=probs)
+        mask[rows[idx], cols[idx]] = True
+        return mask
-        Calucalates highest predicted value for each row vector and each class, setting the most likely class to 1.0.
+    def _validate_mask(self, mask: np.ndarray) -> np.ndarray:
+        """Avoid fully-masked rows/columns.
         Args:
-            y (numpy.ndarray): Input one-hot encoded data of shape (n_samples, n_features, num_classes).
+            mask (np.ndarray): Input boolean mask.
         Returns:
-            numpy.ndarray: Imputed one-hot encoded values.
-        """
-        Xprob = y
-        Xt = np.apply_along_axis(mle, axis=2, arr=Xprob)
-        Xpred = np.argmax(Xt, axis=2)
-        Xtrue = np.argmax(y, axis=2)
-        Xdecoded = np.zeros((Xpred.shape[0], Xpred.shape[1]))
-        for idx in np.arange(Xdecoded):
-            imputed_idx = np.where(self.observed_mask_[idx] == 0)
-            known_idx = np.nonzero(self.observed_mask_[idx])
-            Xdecoded[idx, imputed_idx] = Xpred[idx, imputed_idx]
-            Xdecoded[idx, known_idx] = Xtrue[idx, known_idx]
-        return Xdecoded.astype("int8")
+            np.ndarray: Validated mask.
+        """
+        rng = self.rng
+        # columns
+        full_cols = np.where(mask.all(axis=0))[0]
+        for c in full_cols:
+            r = int(rng.integers(0, mask.shape[0]))
+            mask[r, c] = False
+        # rows
+        full_rows = np.where(mask.all(axis=1))[0]
+        for r in full_rows:
+            c = int(rng.integers(0, mask.shape[1]))
+            mask[r, c] = False
+        return mask
-class SimGenotypeDataTransformer(BaseEstimator, TransformerMixin):
-    """Simulate missing data on genotypes read/ encoded in a GenotypeData object.
+class SimMissingTransformer(BaseEstimator, TransformerMixin):
+    """Simulate missing data on genotypes encoded as 0/1/2 integers.
-    Copies metadata from a GenotypeData object and simulates user-specified proportion of missing data
+    This transformer is designed to work with genotype data that has been preprocessed into a suitable format. It simulates missing data according to various strategies, allowing for the testing and evaluation of imputation methods. The simulated missing data can be controlled in terms of proportion and distribution across samples and loci.
     Args:
         genotype_data (GenotypeData object): GenotypeData instance.
-        prop_missing (float, optional): Proportion of missing data desired in output. Defaults to 0.1
-        strategy (str, optional): Strategy for simulating missing data. May be one of: "nonrandom", "nonrandom_weighted", "random_weighted", "random_weighted_inv", or "random". When set to "nonrandom", branches from GenotypeData.guidetree will be randomly sampled to generate missing data on descendant nodes. For "nonrandom_weighted", missing data will be placed on nodes proportionally to their branch lengths (e.g., to generate data distributed as might be the case with mutation-disruption of RAD sites). Defaults to "random"
+        prop_missing (float, optional): Proportion of missing data desired in output. Must be in the interval [0, 1]. Defaults to 0.1
+        strategy (Literal["nonrandom", "nonrandom_weighted", "random_weighted", "random_weighted_inv", "random"]): Strategy for simulating missing data. "random": Uniformly masks genotypes at random among eligible entries until the target missing proportion is reached. "random_weighted": Masks genotypes at random with probabilities proportional to their observed genotype frequencies in each column (more common genotypes are more likely to be masked). "random_weighted_inv": Masks genotypes at random with probabilities inversely proportional to their observed genotype frequencies in each column (rarer genotypes are more likely to be masked). "nonrandom": Uses the supplied genotype tree to place missing data on clades that are sampled uniformly from internal and/or tip nodes, producing phylogenetically clustered missingness. "nonrandom_weighted": As in "nonrandom", but clades are sampled with probabilities proportional to their branch lengths, concentrating missingness on longer branches (e.g., mimicking locus dropout tied to evolutionary divergence). Defaults to "random".
         missing_val (int, optional): Value that represents missing data. Defaults to -9.
         mask_missing (bool, optional): True if you want to skip original missing values when simulating new missing data, False otherwise. Defaults to True.
         verbose (bool, optional): Verbosity level. Defaults to 0.
         tol (float): Tolerance to reach proportion specified in self.prop_missing. Defaults to 1/num_snps*num_inds
         max_tries (int): Maximum number of tries to reach targeted missing data proportion within specified tol. If None, num_inds will be used. Defaults to None.
     Attributes:
         original_missing_mask_ (numpy.ndarray): Array with boolean mask for original missing locations.
         simulated_missing_mask_ (numpy.ndarray): Array with boolean mask for simulated missing locations, excluding the original ones.
         all_missing_mask_ (numpy.ndarray): Array with boolean mask for all missing locations, including both simulated and original.
-    Properties:
-        missing_count (int): Number of genotypes masked by chosen missing data strategy
-        prop_missing_real (float): True proportion of missing data generated using chosen strategy
-        mask (numpy.ndarray): 2-dimensional array tracking the indices of sampled missing data sites (n_samples, n_sites)
     """
     def __init__(
         self,
         genotype_data,
         *,
+        tree_parser: Optional["TreeParser"] = None,
         prop_missing=0.1,
         strategy="random",
         missing_val=-9,
@@ -696,8 +225,10 @@ class SimGenotypeDataTransformer(BaseEstimator, TransformerMixin):
         verbose=0,
         tol=None,
         max_tries=None,
+        logger: logging.Logger | None = None,
     ) -> None:
         self.genotype_data = genotype_data
+        self.tree_parser = tree_parser
         self.prop_missing = prop_missing
         self.strategy = strategy
         self.missing_val = missing_val
@@ -705,396 +236,470 @@ class SimGenotypeDataTransformer(BaseEstimator, TransformerMixin):
         self.verbose = verbose
         self.tol = tol
         self.max_tries = max_tries
+        self.logger = logger or logging.getLogger(__name__)
-    def fit(self, X):
+    def fit(self, X: np.ndarray, y=None) -> "SimMissingTransformer":
         """Fit to input data X by simulating missing data.
         Missing data will be simulated in varying ways depending on the ``strategy`` setting.
         Args:
-            X (pandas.DataFrame, numpy.ndarray, or List[List[int]]): Data with which to simulate missing data. It should have already been imputed with one of the non-machine learning simple imputers, and there should be no missing data present in X.
+            X (np.ndarray): Data with which to simulate missing data. It should have already been imputed with one of the non-machine learning simple imputers, and there should be no missing data present in X.
         Raises:
-            TypeError: SimGenotypeData.tree must not be NoneType when using strategy="nonrandom" or "nonrandom_weighted".
+            TypeError: ``SimGenotypeDataTreeTransformer.tree`` must not be NoneType when using strategy="nonrandom" or "nonrandom_weighted".
             ValueError: Invalid ``strategy`` parameter provided.
         """
-        X = misc.validate_input_type(X, return_type="array").astype("float32")
+        X = np.asarray(validate_input_type(X, return_type="array")).astype("float32")
-        if self.verbose > 0:
-            print(
-                f"\nAdding {self.prop_missing} missing data per column "
-                f"using strategy: {self.strategy}"
-            )
+        self.logger.info(
+            f"Adding {self.prop_missing} missing data per column using strategy: {self.strategy}"
+        )
-        if np.all(np.isnan(np.array([self.missing_val])) == False):
+        if not np.isnan(self.missing_val):
+            X = X.copy()
             X[X == self.missing_val] = np.nan
         self.original_missing_mask_ = np.isnan(X)
         if self.strategy == "random":
-            if self.mask_missing:
-                # Get indexes where non-missing (Xobs) and missing (Xmiss).
-                Xobs = np.where(~self.original_missing_mask_.ravel())[0]
-                Xmiss = np.where(self.original_missing_mask_.ravel())[0]
-                # Generate mask of 0's (non-missing) and 1's (missing).
-                obs_mask = np.random.choice(
-                    [0, 1],
-                    size=Xobs.size,
-                    p=((1 - self.prop_missing), self.prop_missing),
-                ).astype(bool)
-                # Make missing data mask.
-                mask = np.zeros(X.size)
-                mask[Xobs] = obs_mask
-                mask[Xmiss] = 1
-                # Reshape from raveled to 2D.
-                # With strategy=="random", mask_ is equal to all_missing_.
-                self.mask_ = np.reshape(mask, X.shape)
+            present = ~self.original_missing_mask_
+            self.mask_ = np.zeros_like(X, dtype=bool)
+            # sample only over present sites
+            draws = np.random.random(X.shape)
+            self.mask_[present] = draws[present] < self.prop_missing
+            if self.mask_missing:
+                # keep original-missing as not simulated
+                pass
             else:
-                # Generate mask of 0's (non-missing) and 1's (missing).
-                self.mask_ = np.random.choice(
-                    [0, 1],
-                    size=X.shape,
-                    p=((1 - self.prop_missing), self.prop_missing),
-                ).astype(bool)
+                # optionally also include original-missing as masked (no-op in
+                # transform anyway)
+                self.mask_[~present] = True
-            # Make sure no entirely missing columns were simulated.
-            self._validate_mask()
+            self._validate_mask(use_non_original_only=True)
         elif self.strategy == "random_weighted":
-            self.mask_ = self.random_weighted_missing_data(X, inv=False)
+            self.mask_ = self.random_weighted_missing_data(
+                X, inv=False, target_rate=self.prop_missing
+            )
         elif self.strategy == "random_weighted_inv":
-            self.mask_ = self.random_weighted_missing_data(X, inv=True)
-        elif (
-            self.strategy == "nonrandom"
-            or self.strategy == "nonrandom_weighted"
-        ):
-            if self.genotype_data.tree is None:
-                raise TypeError(
-                    "SimGenotypeData.tree cannot be NoneType when "
-                    "strategy='nonrandom' or 'nonrandom_weighted'"
-                )
-            mask = np.full_like(X, 0.0, dtype=bool)
-            if self.strategy == "nonrandom_weighted":
-                weighted = True
-            else:
-                weighted = False
-            sample_map = dict()
-            for i, sample in enumerate(self.genotype_data.samples):
-                sample_map[sample] = i
-            # if no tolerance provided, set to 1 snp position
-            if self.tol is None:
-                self.tol = 1.0 / mask.size
+            self.mask_ = self.random_weighted_missing_data(
+                X, inv=True, target_rate=self.prop_missing
+            )
-            # if no max_tries provided, set to # inds
-            if self.max_tries is None:
-                self.max_tries = mask.shape[0]
+        elif self.strategy.startswith("nonrandom"):
+            if self.strategy not in {"nonrandom", "nonrandom_weighted"}:
+                msg = f"strategy must be one of {{'nonrandom','nonrandom_weighted'}}, got: {self.strategy}"
+                self.logger.error(msg)
+                raise ValueError(msg)
+            if self.tree_parser is None or not hasattr(self.tree_parser, "tree"):
+                msg = "SimMissingTransformer.tree cannot be NoneType when strategy='nonrandom' or strategy='nonrandom_weighted'"
+                self.logger.error(msg)
+                raise TypeError(msg)
+            rng = np.random.default_rng()
+            skip_root = True
+            weighted = self.strategy == "nonrandom_weighted"
+            # working mask
+            mask = np.zeros_like(X, dtype=bool)
+            # eligible cells
+            present = (
+                ~self.original_missing_mask_
+                if self.mask_missing
+                else np.ones_like(mask, dtype=bool)
+            )
-            filled = False
-            while not filled:
-                # Get list of samples from tree
-                samples = self._sample_tree(
-                    internal_only=False, skip_root=True, weighted=weighted
+            total_eligible = int(present.sum())
+            if total_eligible == 0:
+                self.mask_ = mask
+                self._validate_mask(use_non_original_only=self.mask_missing)
+                self.all_missing_mask_ = np.logical_or(
+                    self.mask_, self.original_missing_mask_
+                )
+                self.sim_missing_mask_ = np.logical_and(
+                    self.all_missing_mask_, ~self.original_missing_mask_
                 )
+                return self
+            target = int(round(self.prop_missing * total_eligible))
+            tol = int(
+                max(
+                    1,
+                    (self.tol if self.tol is not None else 1.0 / mask.size)
+                    * total_eligible,
+                )
+            )
-                # Convert to row indices
-                rows = [sample_map[i] for i in samples]
+            # map tip labels -> row indices
+            name_to_idx = {name: i for i, name in enumerate(self.genotype_data.samples)}
-                # Randomly sample a column
-                col_idx = np.random.randint(0, mask.shape[1])
-                sampled_col = copy.copy(mask[:, col_idx])
-                miss_mask = copy.copy(self.original_missing_mask_[:, col_idx])
+            max_outer = (
+                self.max_tries
+                if self.max_tries is not None
+                else max(10_000, mask.shape[0] * 10)
+            )
+            placed = int(mask.sum())
+            best_delta = abs(placed - target)
+            tries = 0
+            # simple per-locus quota to distribute hits
+            col_quota = np.full(
+                mask.shape[1],
+                max(1, int(np.ceil(target / max(1, mask.shape[1])))),
+                dtype=int,
+            )
-                # Mask column
-                sampled_col[rows] = True
+            while tries < max_outer and abs(placed - target) > tol:
+                tries += 1
+                # >>> Call _sample_tree here <<<
+                try:
+                    tips = self._sample_tree(
+                        internal_only=False,
+                        tips_only=False,
+                        skip_root=skip_root,
+                        weighted=weighted,
+                        rng=rng,
+                    )
+                except ValueError:
+                    # no eligible nodes or no tips intersect samples; try again
+                    continue
-                # If original was missing, set back to False.
-                if self.mask_missing:
-                    sampled_col[miss_mask] = False
+                # Convert to row indices; skip labels not in matrix
+                rows = [name_to_idx[t] for t in tips if t in name_to_idx]
+                if not rows:
+                    continue
+                # choose a column to edit
+                cols_left = np.flatnonzero(col_quota > 0)
+                if cols_left.size == 0:
+                    cols_left = np.arange(mask.shape[1])
+                j = int(rng.choice(cols_left))
-                # check that column is not 100% missing now
-                # if yes, sample again
-                if np.sum(sampled_col) == sampled_col.size:
+                # only edit eligible cells in this column
+                eligible_rows = np.fromiter(
+                    (r for r in rows if present[r, j]), dtype=int
+                )
+                if eligible_rows.size == 0:
                     continue
-                # if not, set values in mask matrix
-                else:
-                    mask[:, col_idx] = sampled_col
-                    # if this addition pushes missing % > self.prop_missing,
-                    # check previous prop_missing, remove masked samples from
-                    # this column until closest to target prop_missing
-                    current_prop = np.sum(mask) / mask.size
-                    if abs(current_prop - self.prop_missing) <= self.tol:
-                        filled = True
-                        break
-                    elif current_prop > self.prop_missing:
-                        tries = 0
-                        while (
-                            abs(current_prop - self.prop_missing) > self.tol
-                            and tries < self.max_tries
-                        ):
-                            r = np.random.randint(0, mask.shape[0])
-                            c = np.random.randint(0, mask.shape[1])
-                            mask[r, c] = False
-                            tries += 1
-                            current_prop = np.sum(mask) / mask.size
-                        filled = True
+                if placed < target:
+                    prev_col = mask[:, j].copy()
+                    mask[eligible_rows, j] = True
+                    # avoid fully missing column among observed
+                    col_after = mask[present[:, j], j]
+                    if col_after.all():
+                        idx_present = np.flatnonzero(present[:, j])
+                        k = int(rng.choice(idx_present))
+                        mask[k, j] = False
+                    new_placed = int(mask.sum())
+                    delta = abs(new_placed - target)
+                    if delta <= best_delta:
+                        best_delta = delta
+                        placed = new_placed
+                        col_quota[j] = max(0, col_quota[j] - 1)
                     else:
+                        mask[:, j] = prev_col
+                else:
+                    # remove within the same clade and column
+                    prev_col = mask[:, j].copy()
+                    col_idxs = eligible_rows[mask[eligible_rows, j]]
+                    if col_idxs.size == 0:
                         continue
+                    need = min(col_idxs.size, max(1, placed - target))
+                    to_clear = rng.choice(col_idxs, size=need, replace=False)
+                    mask[to_clear, j] = False
+                    new_placed = int(mask.sum())
+                    delta = abs(new_placed - target)
+                    if delta <= best_delta:
+                        best_delta = delta
+                        placed = new_placed
+                    else:
+                        mask[:, j] = prev_col
-            # With strategy=="nonrandom" or "nonrandom_weighted",
-            # mask_ is equal to sim_missing_mask_ if mask_missing is True.
-            # Otherwise it is equal to all_missing_.
             self.mask_ = mask
-            self._validate_mask()
+            self._validate_mask(use_non_original_only=self.mask_missing)
         else:
-            raise ValueError(
-                "Invalid SimGenotypeData.strategy value:", self.strategy
-            )
+            msg = f"Invalid SimMissingTransformer.strategy value: {self.strategy}"
+            self.logger.error(msg)
+            raise ValueError(msg)
         # Get all missing values.
-        self.all_missing_mask_ = np.logical_or(
-            self.mask_, self.original_missing_mask_
-        )
+        self.all_missing_mask_ = np.logical_or(self.mask_, self.original_missing_mask_)
         # Get values where original value was not missing and simulated.
         # data is missing.
         self.sim_missing_mask_ = np.logical_and(
             self.all_missing_mask_, self.original_missing_mask_ == False
         )
-        self._validate_mask(mask=self.mask_missing)
+        self._validate_mask(use_non_original_only=self.mask_missing)
         return self
-    def transform(self, X):
+    def transform(self, X: np.ndarray) -> np.ndarray:
         """Function to generate masked sites in a SimGenotypeData object
         Args:
-            X (pandas.DataFrame, numpy.ndarray, or List[List[int]]): Data to transform. No missing data should be present in X. It should have already been imputed with one of the non-machine learning simple imputers.
+            X (np.ndarray): Data to transform. No missing data should be present in X. It should have already been imputed with one of the non-machine learning simple imputers.
         Returns:
-            numpy.ndarray: Transformed data with missing data added.
+            np.ndarray: Transformed data with missing data added.
         """
-        X = misc.validate_input_type(X, return_type="array")
+        X = np.asarray(validate_input_type(X, return_type="array")).astype("float32")
         # mask 012-encoded and one-hot encoded genotypes.
         return self._mask_snps(X)
-    def accuracy(self, X_true, X_pred):
-        """Calculate imputation accuracy of the simulated genotypes.
-        Args:
-            X_true (np.ndarray): True values.
-            X_pred (np.ndarray): Imputed values.
-        Returns:
-            float: Accuracy score between X_true and X_pred.
-        '"""
-        masked_sites = np.sum(self.sim_missing_mask_)
-        num_correct = np.sum(
-            X_true[self.sim_missing_mask_] == X_pred[self.sim_missing_mask_]
-        )
-        return num_correct / masked_sites
-    def auc_roc_pr_ap(self, X_true, X_pred):
-        """Calcuate AUC-ROC, Precision-Recall, and Average Precision (AP).
+    def sqrt_transform(self, proportions: np.ndarray) -> np.ndarray:
+        """Apply the square root transformation to an array of proportions.
         Args:
-            X_true (np.ndarray): True values.
-            X_pred (np.ndarray): Imputed values.
+            proportions (np.ndarray): An array of proportions.
         Returns:
-            List[float]: List of AUC-ROC scores in order of: 0,1,2.
-            List[float]: List of precision scores in order of: 0,1,2.
-            List[float]: List of recall scores in order of: 0,1,2.
-            List[float]: List of average precision scores in order of 0,1,2.
+            np.ndarray: The transformed proportions.
         """
-        y_true = X_true[self.sim_missing_mask_]
-        y_pred = X_pred[self.sim_missing_mask_]
-        # Binarize the output
-        y_true_bin = label_binarize(y_true, classes=[0, 1, 2])
-        y_pred_bin = label_binarize(y_pred, classes=[0, 1, 2])
-        # Initialize lists to hold the scores for each class
-        auc_roc_scores = []
-        precision_scores = []
-        recall_scores = []
-        avg_precision_scores = []
-        for i in range(y_true_bin.shape[1]):
-            # AUC-ROC score
-            auc_roc = roc_auc_score(
-                y_true_bin[:, i], y_pred_bin[:, i], average="weighted"
-            )
-            auc_roc_scores.append(auc_roc)
-            # Precision-recall score
-            precision, recall, _, _ = precision_recall_fscore_support(
-                y_true_bin[:, i], y_pred_bin[:, i], average="weighted"
-            )
-            precision_scores.append(precision)
-            recall_scores.append(recall)
+        return np.sqrt(proportions)
-            # Average precision score
-            avg_precision = average_precision_score(
-                y_true_bin[:, i], y_pred_bin[:, i], average="weighted"
-            )
-            avg_precision_scores.append(avg_precision)
-        return (
-            auc_roc_scores,
-            precision_scores,
-            recall_scores,
-            avg_precision_scores,
-        )
+    def random_weighted_missing_data(
+        self,
+        X: np.ndarray,
+        transform_fn: Literal["sqrt", "exp"] = "sqrt",
+        power: float = 0.5,
+        inv: bool = False,
+        rng: np.random.Generator | None = None,
+        target_rate: float | None = None,  # if None, use realized draw
+    ) -> np.ndarray:
+        """Simulate missing data proportional or inversely proportional to genotype frequencies.
-    def random_weighted_missing_data(self, X, inv=False):
-        """Choose values for which to simulate missing data by biasing towards the minority or majority alleles, depending on whether inv is True or False.
+        This method simulates missing data in a genotype matrix based on genotype frequencies. It allows for different transformation functions to be applied to the base probabilities, and can optionally use inverse genotype frequencies.
         Args:
-            X (np.ndarray): True values.
-            inv (bool, optional): If True, then biases towards choosing majority alleles. If False, then biases towards choosing minority alleles. Defaults to False.
+            X (np.ndarray): Input genotype matrix.
+            transform_fn (Literal["sqrt", "exp"]): Transformation function to apply to base probabilities.
+            power (float): Exponent to raise transformed probabilities.
+            inv (bool): If True, use inverse genotype frequencies. If False, use direct frequencies to weight missingness.
+            rng (np.random.Generator | None): Optional NumPy Generator for reproducibility.
+            target_rate (float | None): If provided, scales the probabilities to achieve this target missing rate.
         Returns:
-            np.ndarray: X with simulated missing values.
-        """
-        # Get unique classes and their counts
-        classes, counts = np.unique(X, return_counts=True)
-        # Compute class weights
-        if inv:
-            class_weights = 1 / counts
-        else:
-            class_weights = counts
-        # Normalize class weights
-        class_weights = class_weights / sum(class_weights)
-        # Compute mask
-        if self.mask_missing:
-            # Get indexes where non-missing (Xobs) and missing (Xmiss)
-            Xobs = np.where(~self.original_missing_mask_.ravel())[0]
-            Xmiss = np.where(self.original_missing_mask_.ravel())[0]
-            # Generate mask of 0's (non-missing) and 1's (missing)
-            obs_mask = np.random.choice(
-                classes, size=Xobs.size, p=class_weights
+            np.ndarray: Simulated missing mask.
+        """
+        tf = transform_fn.lower()
+        if tf not in {"sqrt", "exp"}:
+            msg = f"transform_fn must be 'sqrt' or 'exp', got: {transform_fn}"
+            self.logger.error(msg)
+            raise ValueError(msg)
+        rng = np.random.default_rng() if rng is None else rng
+        eps = 1e-12
+        def _tf(arr: np.ndarray) -> np.ndarray:
+            arr = np.clip(arr, eps, None)
+            return np.sqrt(arr) if tf == "sqrt" else np.exp(-arr)
+        n_samples, n_snps = X.shape
+        out_mask = np.zeros((n_samples, n_snps), dtype=bool)
+        for j in range(n_snps):
+            col = X[:, j]
+            present = ~np.isnan(col)
+            if not np.any(present):
+                continue
+            vals = col[present]
+            classes, counts = np.unique(vals, return_counts=True)
+            if classes.size == 1:  # never wipe entire column
+                continue
+            p = counts.astype(float) / counts.sum()
+            base = 1.0 / np.clip(p, eps, None) if inv else p
+            w = _tf(base)
+            w = np.clip(w, 0.0, None) ** power
+            s = w.sum()
+            w = (
+                np.full_like(w, 1.0 / w.size, dtype=float)
+                if (s <= 0 or ~np.isfinite(s))
+                else (w / s)
             )
-            obs_mask = (obs_mask == classes[:, None]).argmax(axis=0)
-            # Make missing data mask
-            mask = np.zeros(X.size, dtype=bool)
-            mask[Xobs] = obs_mask
-            mask[Xmiss] = 1
+            probs = np.zeros(n_samples, dtype=float)
+            for c, pw in zip(classes, w):
+                probs[present & (col == c)] = pw
-            # Reshape from raveled to 2D
-            mask = mask.reshape(X.shape)
-        else:
-            # Generate mask of 0's (non-missing) and 1's (missing)
-            mask = np.random.choice(classes, size=X.size, p=class_weights)
-            mask = (mask == classes[:, None]).argmax(axis=0).reshape(X.shape)
+            if target_rate is not None:
+                probs *= float(target_rate)  # scale global intensity
-        # Assign mask to self before validation
-        self.mask_ = mask
+            draws = rng.random(n_samples)
+            out_mask[:, j] = draws < probs
+            out_mask[~present, j] = False  # never alter already-missing
-        self._validate_mask()
+            # guard against accidentally wiping this column (using only non-original-missing)
+            col_after = out_mask[present, j]
+            if col_after.sum() == col_after.size:
+                # clear a random observed index
+                k = rng.integers(0, col_after.size)
+                out_mask[np.flatnonzero(present)[k], j] = False
-        return mask
+        return out_mask
     def _sample_tree(
         self,
-        internal_only=False,
-        tips_only=False,
-        skip_root=True,
-        weighted=False,
-    ):
-        """Function for randomly sampling clades from SimGenotypeData.tree.
-        Args:
-            internal_only (bool): Only sample from NON-TIPS. Defaults to False.
-            tips_only (bool): Only sample from tips. Defaults to False.
+        internal_only: bool = False,
+        tips_only: bool = False,
+        skip_root: bool = True,
+        weighted: bool = False,
+        rng: np.random.Generator | None = None,
+    ) -> list[str]:
+        """Sample a node and return descendant tip labels.
-            skip_root (bool): Exclude sampling of root node. Defaults to True.
+        This method samples a node from the genotype tree and retrieves the tip labels of all descendant nodes. The sampling can be restricted to internal nodes, tip nodes, or can exclude the root node. Additionally, the sampling can be weighted by branch lengths.
-            weighted (bool): Weight sampling by branch length. Defaults to False.
+        Args:
+            internal_only: Sample only internal nodes.
+            tips_only: Sample only tip nodes.
+            skip_root: Exclude the root from sampling.
+            weighted: Weight node sampling by branch length.
+            rng: Optional NumPy Generator for reproducibility.
         Returns:
-            List[str]: List of descendant tips from the sampled node.
+            List[str]: Tip labels under the sampled node.
         Raises:
-            ValueError: ``tips_only`` and ``internal_only`` cannot both be True.
+            ValueError: If no eligible nodes exist or both tips_only and internal_only are True.
         """
         if tips_only and internal_only:
-            raise ValueError("internal_only and tips_only cannot both be true")
-        # to only sample internal nodes add  if not i.is_leaf()
-        node_dict = dict()
-        for node in self.genotype_data.tree.treenode.traverse("preorder"):
-            ## node.idx is node indexes.
-            ## node.dist is branch lengths.
-            if skip_root:
-                # If root node.
-                if node.idx == self.genotype_data.tree.nnodes - 1:
-                    continue
-            if tips_only and internal_only:
-                raise ValueError(
-                    "tips_only and internal_only cannot both be True"
+            msg = "tips_only and internal_only cannot both be True"
+            self.logger.error(msg)
+            raise ValueError(msg)
+        rng = np.random.default_rng() if rng is None else rng
+        node_dict: dict[int | object, float] = {}
+        if self.tree_parser is None or not hasattr(self.tree_parser, "tree"):
+            msg = "SimMissingTransformer.tree cannot be NoneType when strategy='nonrandom' or strategy='nonrandom_weighted'"
+            self.logger.error(msg)
+            raise TypeError(msg)
+        # Traverse using the tree backend you have; be tolerant of API differences.
+        for node in self.tree_parser.tree.treenode.traverse("preorder"):
+            # Robust root detection: prefer is_root(), then fall back to parent None, finally fall back to idx==nnodes-1 only if needed.
+            is_root = False
+            if hasattr(node, "is_root"):
+                is_root = bool(node.is_root())
+            elif getattr(node, "up", None) is None:
+                is_root = True
+            elif hasattr(self.tree_parser.tree, "nnodes") and hasattr(node, "idx"):
+                is_root = node.idx == self.tree_parser.tree.nnodes - 1
+            if skip_root and is_root:
+                continue
+            if tips_only and not node.is_leaf():
+                continue
+            if internal_only and node.is_leaf():
+                continue
+            # Branch length; coerce invalid to 0
+            dist = float(getattr(node, "dist", 0.0) or 0.0)
+            if not np.isfinite(dist):
+                dist = 0.0
+            # Use node.idx if stable, else the node object as key
+            key = getattr(node, "idx", node)
+            node_dict[key] = dist
+        if not node_dict:
+            msg = "No eligible nodes found to sample from the tree."
+            self.logger.error(msg)
+            raise ValueError(msg)
+        keys = np.array(list(node_dict.keys()), dtype=object)
+        weights = np.asarray(list(node_dict.values()), dtype=float)
+        weights[~np.isfinite(weights)] = 0.0
+        sample_set = set(self.genotype_data.samples)
+        def _choose_key() -> object:
+            if weighted and weights.sum() > 0.0:
+                p = weights / weights.sum()
+                return rng.choice(keys, p=p)
+            return rng.choice(keys)
+        tree = self.tree_parser.tree
+        last_error: Optional[Exception] = None
+        max_attempts = max(1, len(keys) * 3)
+        for _ in range(max_attempts):
+            chosen_key = _choose_key()
+            # 1. Resolve chosen_key to a Node object
+            try:
+                if isinstance(chosen_key, (int, np.integer)):
+                    node = tree[int(chosen_key)]
+                else:
+                    node = chosen_key
+            except Exception as e:
+                last_error = e
+                continue
+            # 2. Retrieve leaves for this specific node
+            if not hasattr(node, "get_leaves"):
+                last_error = TypeError(
+                    f"Object {type(node)} does not have a get_leaves method."
                 )
+                continue
+            try:
+                tips = [leaf.name for leaf in node.get_leaves()]  # type: ignore
+            except Exception as e:
+                last_error = e
+                continue
+            # Filter to sample IDs present in the matrix
+            tips = [t for t in tips if t in sample_set]
+            if tips:
+                return tips
+        msg = (
+            "No sampled clades contain tips present in genotype_data.samples. "
+            "Check that tree tip names match the genotype_data samples."
+        )
+        self.logger.error(msg)
+        if last_error:
+            raise ValueError(msg) from last_error
+        raise ValueError(msg)
-            if tips_only:
-                if not node.is_leaf():
-                    continue
-            elif internal_only:
-                if node.is_leaf():
-                    continue
-            node_dict[node.idx] = node.dist
-        if weighted:
-            s = sum(list(node_dict.values()))
-            # Node index / sum of node distances.
-            p = [i / s for i in list(node_dict.values())]
-            node_idx = np.random.choice(list(node_dict.keys()), size=1, p=p)[0]
-        else:
-            # Get missing choice from random clade.
-            node_idx = np.random.choice(list(node_dict.keys()), size=1)[0]
-        return self.genotype_data.tree.get_tip_labels(idx=node_idx)
-    def _validate_mask(self, mask=False):
-        """Make sure no entirely missing columns are simulated."""
-        if mask is None:
-            mask = self.mask_
-        for i, column in enumerate(self.mask_.T):
-            if mask:
-                miss_mask = self.original_missing_mask_[:, i]
-                col = column[~miss_mask]
-                obs_idx = np.where(~miss_mask)
-                idx = obs_idx[np.random.choice(np.arange(len(obs_idx)))]
+    def _validate_mask(self, use_non_original_only: bool = False) -> None:
+        """Ensure no column is entirely masked on observed entries.
+        Args:
+            use_non_original_only (bool): If True, only consider non-original-missing entries when validating. Defaults to False.
+        """
+        m = self.mask_
+        for j in range(m.shape[1]):
+            if use_non_original_only:
+                obs = ~self.original_missing_mask_[:, j]
             else:
-                col = column
-                idx = np.random.choice(np.arange(col.shape[0]))
-            if np.sum(col) == col.size:
-                self.mask_[idx, i] = False
+                obs = np.ones(m.shape[0], dtype=bool)
+            if not np.any(obs):
+                continue
+            col = m[obs, j]
+            if col.size and col.all():
+                # clear one random observed index
+                idxs = np.flatnonzero(obs)
+                k = np.random.randint(0, idxs.size)
+                self.mask_[idxs[k], j] = False
     def _mask_snps(self, X):
         """Mask positions in SimGenotypeData.snps and SimGenotypeData.onehot"""
@@ -1112,6 +717,51 @@ class SimGenotypeDataTransformer(BaseEstimator, TransformerMixin):
         Xt[mask_boolean] = mask_val
         return Xt
+    def write_mask(self, filename_prefix: str):
+        """Write mask to file.
+        Args:
+            filename_prefix (str): Prefix for the filenames to write to.
+        """
+        np.save(filename_prefix + "_mask.npy", self.mask_)
+        np.save(
+            filename_prefix + "_original_missing_mask.npy",
+            self.original_missing_mask_,
+        )
+    def read_mask(
+        self, filename_prefix: str
+    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+        """Read mask from file.
+        Args:
+            filename_prefix (str): Prefix for the filenames to read from.
+        Returns:
+            Tuple[np.ndarray, np.ndarray, np.ndarray]: The read masks. (mask, original_missing_mask, all_missing_mask).
+        """
+        # Check if files exist
+        if not Path(filename_prefix + "_mask.npy").is_file():
+            msg = filename_prefix + "_mask.npy" + " does not exist."
+            self.logger.error(msg)
+            raise FileNotFoundError(msg)
+        if not Path(filename_prefix + "_original_missing_mask.npy").is_file():
+            msg = filename_prefix + "_original_missing_mask.npy" + " does not exist."
+            self.logger.error(msg)
+            raise FileNotFoundError(msg)
+        # Load mask from file
+        self.mask_ = np.load(filename_prefix + "_mask.npy")
+        self.original_missing_mask_ = np.load(
+            filename_prefix + "_original_missing_mask.npy"
+        )
+        # Recalculate all_missing_mask_ from mask_ and original_missing_mask_
+        self.all_missing_mask_ = np.logical_or(self.mask_, self.original_missing_mask_)
+        return self.mask_, self.original_missing_mask_, self.all_missing_mask_
     @property
     def missing_count(self) -> int:
         """Count of masked genotypes in SimGenotypeData.mask

pg-sui 0.2.0__py3-none-any.whl → 1.6.14.dev9__py3-none-any.whl

pg-sui 0.2.0__py3-none-any.whl → 1.6.14.dev9__py3-none-any.whl