PyPI - pg-sui - Versions diffs - 1.6.14.dev9__py3-none-any.whl → 1.7.0__py3-none-any.whl - Mend

pg-sui 1.6.14.dev9__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

pg_sui-1.7.0.dist-info/METADATA +288 -0
{pg_sui-1.6.14.dev9.dist-info → pg_sui-1.7.0.dist-info}/RECORD +29 -33
pgsui/__init__.py +0 -8
pgsui/_version.py +2 -2
pgsui/cli.py +591 -126
pgsui/data_processing/config.py +1 -2
pgsui/data_processing/containers.py +218 -533
pgsui/data_processing/transformers.py +44 -20
pgsui/impute/deterministic/imputers/mode.py +475 -182
pgsui/impute/deterministic/imputers/ref_allele.py +454 -147
pgsui/impute/supervised/imputers/hist_gradient_boosting.py +4 -3
pgsui/impute/supervised/imputers/random_forest.py +3 -2
pgsui/impute/unsupervised/base.py +1268 -530
pgsui/impute/unsupervised/callbacks.py +28 -33
pgsui/impute/unsupervised/imputers/autoencoder.py +869 -764
pgsui/impute/unsupervised/imputers/vae.py +928 -696
pgsui/impute/unsupervised/loss_functions.py +156 -202
pgsui/impute/unsupervised/models/autoencoder_model.py +7 -49
pgsui/impute/unsupervised/models/vae_model.py +40 -221
pgsui/impute/unsupervised/nn_scorers.py +53 -13
pgsui/utils/classification_viz.py +240 -97
pgsui/utils/misc.py +201 -3
pgsui/utils/plotting.py +73 -58
pgsui/utils/pretty_metrics.py +2 -6
pgsui/utils/scorers.py +39 -0
pg_sui-1.6.14.dev9.dist-info/METADATA +0 -344
pgsui/impute/unsupervised/imputers/nlpca.py +0 -1554
pgsui/impute/unsupervised/imputers/ubp.py +0 -1575
pgsui/impute/unsupervised/models/nlpca_model.py +0 -206
pgsui/impute/unsupervised/models/ubp_model.py +0 -200
{pg_sui-1.6.14.dev9.dist-info → pg_sui-1.7.0.dist-info}/WHEEL +0 -0
{pg_sui-1.6.14.dev9.dist-info → pg_sui-1.7.0.dist-info}/entry_points.txt +0 -0
{pg_sui-1.6.14.dev9.dist-info → pg_sui-1.7.0.dist-info}/licenses/LICENSE +0 -0
{pg_sui-1.6.14.dev9.dist-info → pg_sui-1.7.0.dist-info}/top_level.txt +0 -0

pgsui/data_processing/transformers.py CHANGED Viewed

@@ -209,7 +209,7 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
     Attributes:
         original_missing_mask_ (numpy.ndarray): Array with boolean mask for original missing locations.
-        simulated_missing_mask_ (numpy.ndarray): Array with boolean mask for simulated missing locations, excluding the original ones.
+        sim_missing_mask_ (numpy.ndarray): Array with boolean mask for simulated missing locations, excluding the original ones.
         all_missing_mask_ (numpy.ndarray): Array with boolean mask for all missing locations, including both simulated and original.
     """
@@ -225,8 +225,24 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
         verbose=0,
         tol=None,
         max_tries=None,
+        seed: Optional[int] = None,
         logger: logging.Logger | None = None,
     ) -> None:
+        """Initialize the SimMissingTransformer.
+        Args:
+            genotype_data (GenotypeData object): GenotypeData instance.
+            tree_parser (TreeParser | None): TreeParser instance with a loaded tree. Required for "nonrandom" and "nonrandom_weighted" strategies.
+            prop_missing (float, optional): Proportion of missing data desired in output. Must be in the interval [0, 1]. Defaults to 0.1
+            strategy (Literal["nonrandom", "nonrandom_weighted", "random_weighted", "random_weighted_inv", "random"]): Strategy for simulating missing data. "random": Uniformly masks genotypes at random among eligible entries until the target missing proportion is reached. "random_weighted": Masks genotypes at random with probabilities proportional to their observed genotype frequencies in each column (more common genotypes are more likely to be masked). "random_weighted_inv": Masks genotypes at random with probabilities inversely proportional to their observed genotype frequencies in each column (rarer genotypes are more likely to be masked). "nonrandom": Uses the supplied genotype tree to place missing data on clades that are sampled uniformly from internal and/or tip nodes, producing phylogenetically clustered missingness. "nonrandom_weighted": As in "nonrandom", but clades are sampled with probabilities proportional to their branch lengths, concentrating missingness on longer branches (e.g., mimicking locus dropout tied to evolutionary divergence). Defaults to "random".
+            missing_val (int, optional): Value that represents missing data. Defaults to -9.
+            mask_missing (bool, optional): True if you want to skip original missing values when simulating new missing data, False otherwise. Defaults to True.
+            verbose (bool, optional): Verbosity level. Defaults to 0.
+            tol (float): Tolerance to reach proportion specified in self.prop_missing. Defaults to 1/num_snps*num_inds
+            max_tries (int): Maximum number of tries to reach targeted missing data proportion within specified tol. If None, num_inds will be used. Defaults to None.
+            seed (int | None): RNG seed.
+            logger (logging.Logger | None): Logger for messages.
+        """
         self.genotype_data = genotype_data
         self.tree_parser = tree_parser
         self.prop_missing = prop_missing
@@ -236,6 +252,10 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
         self.verbose = verbose
         self.tol = tol
         self.max_tries = max_tries
+        self.seed = seed
+        self.rng = (
+            np.random.default_rng(seed) if seed is not None else np.random.default_rng()
+        )
         self.logger = logger or logging.getLogger(__name__)
     def fit(self, X: np.ndarray, y=None) -> "SimMissingTransformer":
@@ -252,7 +272,7 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
         """
         X = np.asarray(validate_input_type(X, return_type="array")).astype("float32")
-        self.logger.info(
+        self.logger.debug(
             f"Adding {self.prop_missing} missing data per column using strategy: {self.strategy}"
         )
@@ -267,7 +287,7 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
             self.mask_ = np.zeros_like(X, dtype=bool)
             # sample only over present sites
-            draws = np.random.random(X.shape)
+            draws = self.rng.random(X.shape)
             self.mask_[present] = draws[present] < self.prop_missing
             if self.mask_missing:
@@ -301,7 +321,6 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
                 self.logger.error(msg)
                 raise TypeError(msg)
-            rng = np.random.default_rng()
             skip_root = True
             weighted = self.strategy == "nonrandom_weighted"
@@ -365,7 +384,7 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
                         tips_only=False,
                         skip_root=skip_root,
                         weighted=weighted,
-                        rng=rng,
+                        rng=self.rng,
                     )
                 except ValueError:
                     # no eligible nodes or no tips intersect samples; try again
@@ -380,7 +399,7 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
                 cols_left = np.flatnonzero(col_quota > 0)
                 if cols_left.size == 0:
                     cols_left = np.arange(mask.shape[1])
-                j = int(rng.choice(cols_left))
+                j = int(self.rng.choice(cols_left))
                 # only edit eligible cells in this column
                 eligible_rows = np.fromiter(
@@ -397,7 +416,7 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
                     col_after = mask[present[:, j], j]
                     if col_after.all():
                         idx_present = np.flatnonzero(present[:, j])
-                        k = int(rng.choice(idx_present))
+                        k = int(self.rng.choice(idx_present))
                         mask[k, j] = False
                     new_placed = int(mask.sum())
@@ -415,7 +434,7 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
                     if col_idxs.size == 0:
                         continue
                     need = min(col_idxs.size, max(1, placed - target))
-                    to_clear = rng.choice(col_idxs, size=need, replace=False)
+                    to_clear = self.rng.choice(col_idxs, size=need, replace=False)
                     mask[to_clear, j] = False
                     new_placed = int(mask.sum())
@@ -501,7 +520,6 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
             self.logger.error(msg)
             raise ValueError(msg)
-        rng = np.random.default_rng() if rng is None else rng
         eps = 1e-12
         def _tf(arr: np.ndarray) -> np.ndarray:
@@ -538,9 +556,12 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
                 probs[present & (col == c)] = pw
             if target_rate is not None:
-                probs *= float(target_rate)  # scale global intensity
+                mean_p = probs[present].mean()
+                if mean_p > 0:
+                    probs *= float(target_rate) / mean_p
+            probs = np.clip(probs, 0.0, 1.0)
-            draws = rng.random(n_samples)
+            draws = self.rng.random(n_samples)
             out_mask[:, j] = draws < probs
             out_mask[~present, j] = False  # never alter already-missing
@@ -548,7 +569,7 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
             col_after = out_mask[present, j]
             if col_after.sum() == col_after.size:
                 # clear a random observed index
-                k = rng.integers(0, col_after.size)
+                k = self.rng.integers(0, col_after.size)
                 out_mask[np.flatnonzero(present)[k], j] = False
         return out_mask
@@ -583,8 +604,6 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
             self.logger.error(msg)
             raise ValueError(msg)
-        rng = np.random.default_rng() if rng is None else rng
         node_dict: dict[int | object, float] = {}
         if self.tree_parser is None or not hasattr(self.tree_parser, "tree"):
@@ -633,8 +652,8 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
         def _choose_key() -> object:
             if weighted and weights.sum() > 0.0:
                 p = weights / weights.sum()
-                return rng.choice(keys, p=p)
-            return rng.choice(keys)
+                return self.rng.choice(keys, p=p)
+            return self.rng.choice(keys)
         tree = self.tree_parser.tree
         last_error: Optional[Exception] = None
@@ -698,17 +717,22 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
             if col.size and col.all():
                 # clear one random observed index
                 idxs = np.flatnonzero(obs)
-                k = np.random.randint(0, idxs.size)
+                k = self.rng.integers(0, idxs.size)
                 self.mask_[idxs[k], j] = False
     def _mask_snps(self, X):
         """Mask positions in SimGenotypeData.snps and SimGenotypeData.onehot"""
-        if len(X.shape) == 3:
+        if X.ndim == 3:
             # One-hot encoded.
             mask_val = [0.0, 0.0, 0.0, 0.0]
-        elif len(X.shape) == 2:
+        elif X.ndim == 2:
             # 012-encoded.
-            mask_val = -9
+            mask_val = (
+                float(self.missing_val)
+                if np.isnan(self.missing_val)
+                else self.missing_val
+            )
         else:
             raise ValueError(f"Invalid shape of input X: {X.shape}")

pg-sui 1.6.14.dev9__py3-none-any.whl → 1.7.0__py3-none-any.whl

pg-sui 1.6.14.dev9__py3-none-any.whl → 1.7.0__py3-none-any.whl