pg-sui 1.6.16a3__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/METADATA +26 -30
- {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/RECORD +29 -33
- pgsui/__init__.py +0 -8
- pgsui/_version.py +2 -2
- pgsui/cli.py +577 -125
- pgsui/data_processing/config.py +1 -2
- pgsui/data_processing/containers.py +203 -530
- pgsui/data_processing/transformers.py +44 -20
- pgsui/impute/deterministic/imputers/mode.py +475 -182
- pgsui/impute/deterministic/imputers/ref_allele.py +454 -147
- pgsui/impute/supervised/imputers/hist_gradient_boosting.py +4 -3
- pgsui/impute/supervised/imputers/random_forest.py +3 -2
- pgsui/impute/unsupervised/base.py +1269 -534
- pgsui/impute/unsupervised/callbacks.py +28 -33
- pgsui/impute/unsupervised/imputers/autoencoder.py +870 -841
- pgsui/impute/unsupervised/imputers/vae.py +931 -787
- pgsui/impute/unsupervised/loss_functions.py +156 -202
- pgsui/impute/unsupervised/models/autoencoder_model.py +7 -49
- pgsui/impute/unsupervised/models/vae_model.py +40 -221
- pgsui/impute/unsupervised/nn_scorers.py +53 -13
- pgsui/utils/classification_viz.py +240 -97
- pgsui/utils/misc.py +201 -3
- pgsui/utils/plotting.py +73 -58
- pgsui/utils/pretty_metrics.py +2 -6
- pgsui/utils/scorers.py +39 -0
- pgsui/impute/unsupervised/imputers/nlpca.py +0 -1666
- pgsui/impute/unsupervised/imputers/ubp.py +0 -1660
- pgsui/impute/unsupervised/models/nlpca_model.py +0 -206
- pgsui/impute/unsupervised/models/ubp_model.py +0 -200
- {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/WHEEL +0 -0
- {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/entry_points.txt +0 -0
- {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/licenses/LICENSE +0 -0
- {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/top_level.txt +0 -0
|
@@ -209,7 +209,7 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
|
|
|
209
209
|
|
|
210
210
|
Attributes:
|
|
211
211
|
original_missing_mask_ (numpy.ndarray): Array with boolean mask for original missing locations.
|
|
212
|
-
|
|
212
|
+
sim_missing_mask_ (numpy.ndarray): Array with boolean mask for simulated missing locations, excluding the original ones.
|
|
213
213
|
all_missing_mask_ (numpy.ndarray): Array with boolean mask for all missing locations, including both simulated and original.
|
|
214
214
|
"""
|
|
215
215
|
|
|
@@ -225,8 +225,24 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
|
|
|
225
225
|
verbose=0,
|
|
226
226
|
tol=None,
|
|
227
227
|
max_tries=None,
|
|
228
|
+
seed: Optional[int] = None,
|
|
228
229
|
logger: logging.Logger | None = None,
|
|
229
230
|
) -> None:
|
|
231
|
+
"""Initialize the SimMissingTransformer.
|
|
232
|
+
|
|
233
|
+
Args:
|
|
234
|
+
genotype_data (GenotypeData object): GenotypeData instance.
|
|
235
|
+
tree_parser (TreeParser | None): TreeParser instance with a loaded tree. Required for "nonrandom" and "nonrandom_weighted" strategies.
|
|
236
|
+
prop_missing (float, optional): Proportion of missing data desired in output. Must be in the interval [0, 1]. Defaults to 0.1.
|
|
237
|
+
strategy (Literal["nonrandom", "nonrandom_weighted", "random_weighted", "random_weighted_inv", "random"]): Strategy for simulating missing data. "random": Uniformly masks genotypes at random among eligible entries until the target missing proportion is reached. "random_weighted": Masks genotypes at random with probabilities proportional to their observed genotype frequencies in each column (more common genotypes are more likely to be masked). "random_weighted_inv": Masks genotypes at random with probabilities inversely proportional to their observed genotype frequencies in each column (rarer genotypes are more likely to be masked). "nonrandom": Uses the supplied genotype tree to place missing data on clades that are sampled uniformly from internal and/or tip nodes, producing phylogenetically clustered missingness. "nonrandom_weighted": As in "nonrandom", but clades are sampled with probabilities proportional to their branch lengths, concentrating missingness on longer branches (e.g., mimicking locus dropout tied to evolutionary divergence). Defaults to "random".
|
|
238
|
+
missing_val (int, optional): Value that represents missing data. Defaults to -9.
|
|
239
|
+
mask_missing (bool, optional): True if you want to skip original missing values when simulating new missing data, False otherwise. Defaults to True.
|
|
240
|
+
verbose (int, optional): Verbosity level. Defaults to 0.
|
|
241
|
+
tol (float | None): Tolerance for reaching the proportion specified in self.prop_missing. If None, 1/num_snps*num_inds is used. Defaults to None.
|
|
242
|
+
max_tries (int | None): Maximum number of tries to reach the targeted missing data proportion within the specified tol. If None, num_inds will be used. Defaults to None.
|
|
243
|
+
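seed (int | None): Seed for the NumPy random generator used when simulating missing data. If None, an unseeded generator is created, so results are not reproducible between runs. Defaults to None.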
|
|
244
|
+
logger (logging.Logger | None): Logger for messages.
|
|
245
|
+
"""
|
|
230
246
|
self.genotype_data = genotype_data
|
|
231
247
|
self.tree_parser = tree_parser
|
|
232
248
|
self.prop_missing = prop_missing
|
|
@@ -236,6 +252,10 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
|
|
|
236
252
|
self.verbose = verbose
|
|
237
253
|
self.tol = tol
|
|
238
254
|
self.max_tries = max_tries
|
|
255
|
+
self.seed = seed
|
|
256
|
+
self.rng = (
|
|
257
|
+
np.random.default_rng(seed) if seed is not None else np.random.default_rng()
|
|
258
|
+
)
|
|
239
259
|
self.logger = logger or logging.getLogger(__name__)
|
|
240
260
|
|
|
241
261
|
def fit(self, X: np.ndarray, y=None) -> "SimMissingTransformer":
|
|
@@ -252,7 +272,7 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
|
|
|
252
272
|
"""
|
|
253
273
|
X = np.asarray(validate_input_type(X, return_type="array")).astype("float32")
|
|
254
274
|
|
|
255
|
-
self.logger.
|
|
275
|
+
self.logger.debug(
|
|
256
276
|
f"Adding {self.prop_missing} missing data per column using strategy: {self.strategy}"
|
|
257
277
|
)
|
|
258
278
|
|
|
@@ -267,7 +287,7 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
|
|
|
267
287
|
self.mask_ = np.zeros_like(X, dtype=bool)
|
|
268
288
|
|
|
269
289
|
# sample only over present sites
|
|
270
|
-
draws =
|
|
290
|
+
draws = self.rng.random(X.shape)
|
|
271
291
|
self.mask_[present] = draws[present] < self.prop_missing
|
|
272
292
|
|
|
273
293
|
if self.mask_missing:
|
|
@@ -301,7 +321,6 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
|
|
|
301
321
|
self.logger.error(msg)
|
|
302
322
|
raise TypeError(msg)
|
|
303
323
|
|
|
304
|
-
rng = np.random.default_rng()
|
|
305
324
|
skip_root = True
|
|
306
325
|
weighted = self.strategy == "nonrandom_weighted"
|
|
307
326
|
|
|
@@ -365,7 +384,7 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
|
|
|
365
384
|
tips_only=False,
|
|
366
385
|
skip_root=skip_root,
|
|
367
386
|
weighted=weighted,
|
|
368
|
-
rng=rng,
|
|
387
|
+
rng=self.rng,
|
|
369
388
|
)
|
|
370
389
|
except ValueError:
|
|
371
390
|
# no eligible nodes or no tips intersect samples; try again
|
|
@@ -380,7 +399,7 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
|
|
|
380
399
|
cols_left = np.flatnonzero(col_quota > 0)
|
|
381
400
|
if cols_left.size == 0:
|
|
382
401
|
cols_left = np.arange(mask.shape[1])
|
|
383
|
-
j = int(rng.choice(cols_left))
|
|
402
|
+
j = int(self.rng.choice(cols_left))
|
|
384
403
|
|
|
385
404
|
# only edit eligible cells in this column
|
|
386
405
|
eligible_rows = np.fromiter(
|
|
@@ -397,7 +416,7 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
|
|
|
397
416
|
col_after = mask[present[:, j], j]
|
|
398
417
|
if col_after.all():
|
|
399
418
|
idx_present = np.flatnonzero(present[:, j])
|
|
400
|
-
k = int(rng.choice(idx_present))
|
|
419
|
+
k = int(self.rng.choice(idx_present))
|
|
401
420
|
mask[k, j] = False
|
|
402
421
|
|
|
403
422
|
new_placed = int(mask.sum())
|
|
@@ -415,7 +434,7 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
|
|
|
415
434
|
if col_idxs.size == 0:
|
|
416
435
|
continue
|
|
417
436
|
need = min(col_idxs.size, max(1, placed - target))
|
|
418
|
-
to_clear = rng.choice(col_idxs, size=need, replace=False)
|
|
437
|
+
to_clear = self.rng.choice(col_idxs, size=need, replace=False)
|
|
419
438
|
mask[to_clear, j] = False
|
|
420
439
|
|
|
421
440
|
new_placed = int(mask.sum())
|
|
@@ -501,7 +520,6 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
|
|
|
501
520
|
self.logger.error(msg)
|
|
502
521
|
raise ValueError(msg)
|
|
503
522
|
|
|
504
|
-
rng = np.random.default_rng() if rng is None else rng
|
|
505
523
|
eps = 1e-12
|
|
506
524
|
|
|
507
525
|
def _tf(arr: np.ndarray) -> np.ndarray:
|
|
@@ -538,9 +556,12 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
|
|
|
538
556
|
probs[present & (col == c)] = pw
|
|
539
557
|
|
|
540
558
|
if target_rate is not None:
|
|
541
|
-
|
|
559
|
+
mean_p = probs[present].mean()
|
|
560
|
+
if mean_p > 0:
|
|
561
|
+
probs *= float(target_rate) / mean_p
|
|
562
|
+
probs = np.clip(probs, 0.0, 1.0)
|
|
542
563
|
|
|
543
|
-
draws = rng.random(n_samples)
|
|
564
|
+
draws = self.rng.random(n_samples)
|
|
544
565
|
out_mask[:, j] = draws < probs
|
|
545
566
|
out_mask[~present, j] = False # never alter already-missing
|
|
546
567
|
|
|
@@ -548,7 +569,7 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
|
|
|
548
569
|
col_after = out_mask[present, j]
|
|
549
570
|
if col_after.sum() == col_after.size:
|
|
550
571
|
# clear a random observed index
|
|
551
|
-
k = rng.integers(0, col_after.size)
|
|
572
|
+
k = self.rng.integers(0, col_after.size)
|
|
552
573
|
out_mask[np.flatnonzero(present)[k], j] = False
|
|
553
574
|
|
|
554
575
|
return out_mask
|
|
@@ -583,8 +604,6 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
|
|
|
583
604
|
self.logger.error(msg)
|
|
584
605
|
raise ValueError(msg)
|
|
585
606
|
|
|
586
|
-
rng = np.random.default_rng() if rng is None else rng
|
|
587
|
-
|
|
588
607
|
node_dict: dict[int | object, float] = {}
|
|
589
608
|
|
|
590
609
|
if self.tree_parser is None or not hasattr(self.tree_parser, "tree"):
|
|
@@ -633,8 +652,8 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
|
|
|
633
652
|
def _choose_key() -> object:
|
|
634
653
|
if weighted and weights.sum() > 0.0:
|
|
635
654
|
p = weights / weights.sum()
|
|
636
|
-
return rng.choice(keys, p=p)
|
|
637
|
-
return rng.choice(keys)
|
|
655
|
+
return self.rng.choice(keys, p=p)
|
|
656
|
+
return self.rng.choice(keys)
|
|
638
657
|
|
|
639
658
|
tree = self.tree_parser.tree
|
|
640
659
|
last_error: Optional[Exception] = None
|
|
@@ -698,17 +717,22 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
|
|
|
698
717
|
if col.size and col.all():
|
|
699
718
|
# clear one random observed index
|
|
700
719
|
idxs = np.flatnonzero(obs)
|
|
701
|
-
k =
|
|
720
|
+
k = self.rng.integers(0, idxs.size)
|
|
702
721
|
self.mask_[idxs[k], j] = False
|
|
703
722
|
|
|
704
723
|
def _mask_snps(self, X):
|
|
705
724
|
"""Mask positions in SimGenotypeData.snps and SimGenotypeData.onehot"""
|
|
706
|
-
if
|
|
725
|
+
if X.ndim == 3:
|
|
707
726
|
# One-hot encoded.
|
|
708
727
|
mask_val = [0.0, 0.0, 0.0, 0.0]
|
|
709
|
-
elif
|
|
728
|
+
elif X.ndim == 2:
|
|
710
729
|
# 012-encoded.
|
|
711
|
-
mask_val =
|
|
730
|
+
mask_val = (
|
|
731
|
+
float(self.missing_val)
|
|
732
|
+
if np.isnan(self.missing_val)
|
|
733
|
+
else self.missing_val
|
|
734
|
+
)
|
|
735
|
+
|
|
712
736
|
else:
|
|
713
737
|
raise ValueError(f"Invalid shape of input X: {X.shape}")
|
|
714
738
|
|