pg-sui 1.6.14.dev9__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. pg_sui-1.7.0.dist-info/METADATA +288 -0
  2. {pg_sui-1.6.14.dev9.dist-info → pg_sui-1.7.0.dist-info}/RECORD +29 -33
  3. pgsui/__init__.py +0 -8
  4. pgsui/_version.py +2 -2
  5. pgsui/cli.py +591 -126
  6. pgsui/data_processing/config.py +1 -2
  7. pgsui/data_processing/containers.py +218 -533
  8. pgsui/data_processing/transformers.py +44 -20
  9. pgsui/impute/deterministic/imputers/mode.py +475 -182
  10. pgsui/impute/deterministic/imputers/ref_allele.py +454 -147
  11. pgsui/impute/supervised/imputers/hist_gradient_boosting.py +4 -3
  12. pgsui/impute/supervised/imputers/random_forest.py +3 -2
  13. pgsui/impute/unsupervised/base.py +1268 -530
  14. pgsui/impute/unsupervised/callbacks.py +28 -33
  15. pgsui/impute/unsupervised/imputers/autoencoder.py +869 -764
  16. pgsui/impute/unsupervised/imputers/vae.py +928 -696
  17. pgsui/impute/unsupervised/loss_functions.py +156 -202
  18. pgsui/impute/unsupervised/models/autoencoder_model.py +7 -49
  19. pgsui/impute/unsupervised/models/vae_model.py +40 -221
  20. pgsui/impute/unsupervised/nn_scorers.py +53 -13
  21. pgsui/utils/classification_viz.py +240 -97
  22. pgsui/utils/misc.py +201 -3
  23. pgsui/utils/plotting.py +73 -58
  24. pgsui/utils/pretty_metrics.py +2 -6
  25. pgsui/utils/scorers.py +39 -0
  26. pg_sui-1.6.14.dev9.dist-info/METADATA +0 -344
  27. pgsui/impute/unsupervised/imputers/nlpca.py +0 -1554
  28. pgsui/impute/unsupervised/imputers/ubp.py +0 -1575
  29. pgsui/impute/unsupervised/models/nlpca_model.py +0 -206
  30. pgsui/impute/unsupervised/models/ubp_model.py +0 -200
  31. {pg_sui-1.6.14.dev9.dist-info → pg_sui-1.7.0.dist-info}/WHEEL +0 -0
  32. {pg_sui-1.6.14.dev9.dist-info → pg_sui-1.7.0.dist-info}/entry_points.txt +0 -0
  33. {pg_sui-1.6.14.dev9.dist-info → pg_sui-1.7.0.dist-info}/licenses/LICENSE +0 -0
  34. {pg_sui-1.6.14.dev9.dist-info → pg_sui-1.7.0.dist-info}/top_level.txt +0 -0
@@ -209,7 +209,7 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
209
209
 
210
210
  Attributes:
211
211
  original_missing_mask_ (numpy.ndarray): Array with boolean mask for original missing locations.
212
- simulated_missing_mask_ (numpy.ndarray): Array with boolean mask for simulated missing locations, excluding the original ones.
212
+ sim_missing_mask_ (numpy.ndarray): Array with boolean mask for simulated missing locations, excluding the original ones.
213
213
  all_missing_mask_ (numpy.ndarray): Array with boolean mask for all missing locations, including both simulated and original.
214
214
  """
215
215
 
@@ -225,8 +225,24 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
225
225
  verbose=0,
226
226
  tol=None,
227
227
  max_tries=None,
228
+ seed: Optional[int] = None,
228
229
  logger: logging.Logger | None = None,
229
230
  ) -> None:
231
+ """Initialize the SimMissingTransformer.
232
+
233
+ Args:
234
+ genotype_data (GenotypeData object): GenotypeData instance.
235
+ tree_parser (TreeParser | None): TreeParser instance with a loaded tree. Required for "nonrandom" and "nonrandom_weighted" strategies.
236
+ prop_missing (float, optional): Proportion of missing data desired in output. Must be in the interval [0, 1]. Defaults to 0.1.
237
+ strategy (Literal["nonrandom", "nonrandom_weighted", "random_weighted", "random_weighted_inv", "random"]): Strategy for simulating missing data. "random": Uniformly masks genotypes at random among eligible entries until the target missing proportion is reached. "random_weighted": Masks genotypes at random with probabilities proportional to their observed genotype frequencies in each column (more common genotypes are more likely to be masked). "random_weighted_inv": Masks genotypes at random with probabilities inversely proportional to their observed genotype frequencies in each column (rarer genotypes are more likely to be masked). "nonrandom": Uses the supplied genotype tree to place missing data on clades that are sampled uniformly from internal and/or tip nodes, producing phylogenetically clustered missingness. "nonrandom_weighted": As in "nonrandom", but clades are sampled with probabilities proportional to their branch lengths, concentrating missingness on longer branches (e.g., mimicking locus dropout tied to evolutionary divergence). Defaults to "random".
238
+ missing_val (int, optional): Value that represents missing data. Defaults to -9.
239
+ mask_missing (bool, optional): True if you want to skip original missing values when simulating new missing data, False otherwise. Defaults to True.
240
+ verbose (int, optional): Verbosity level. Defaults to 0.
241
+ tol (float): Tolerance to reach proportion specified in self.prop_missing. Defaults to 1/num_snps*num_inds
242
+ max_tries (int): Maximum number of tries to reach targeted missing data proportion within specified tol. If None, num_inds will be used. Defaults to None.
243
+ seed (int | None): RNG seed.
244
+ logger (logging.Logger | None): Logger for messages.
245
+ """
230
246
  self.genotype_data = genotype_data
231
247
  self.tree_parser = tree_parser
232
248
  self.prop_missing = prop_missing
@@ -236,6 +252,10 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
236
252
  self.verbose = verbose
237
253
  self.tol = tol
238
254
  self.max_tries = max_tries
255
+ self.seed = seed
256
+ self.rng = (
257
+ np.random.default_rng(seed) if seed is not None else np.random.default_rng()
258
+ )
239
259
  self.logger = logger or logging.getLogger(__name__)
240
260
 
241
261
  def fit(self, X: np.ndarray, y=None) -> "SimMissingTransformer":
@@ -252,7 +272,7 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
252
272
  """
253
273
  X = np.asarray(validate_input_type(X, return_type="array")).astype("float32")
254
274
 
255
- self.logger.info(
275
+ self.logger.debug(
256
276
  f"Adding {self.prop_missing} missing data per column using strategy: {self.strategy}"
257
277
  )
258
278
 
@@ -267,7 +287,7 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
267
287
  self.mask_ = np.zeros_like(X, dtype=bool)
268
288
 
269
289
  # sample only over present sites
270
- draws = np.random.random(X.shape)
290
+ draws = self.rng.random(X.shape)
271
291
  self.mask_[present] = draws[present] < self.prop_missing
272
292
 
273
293
  if self.mask_missing:
@@ -301,7 +321,6 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
301
321
  self.logger.error(msg)
302
322
  raise TypeError(msg)
303
323
 
304
- rng = np.random.default_rng()
305
324
  skip_root = True
306
325
  weighted = self.strategy == "nonrandom_weighted"
307
326
 
@@ -365,7 +384,7 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
365
384
  tips_only=False,
366
385
  skip_root=skip_root,
367
386
  weighted=weighted,
368
- rng=rng,
387
+ rng=self.rng,
369
388
  )
370
389
  except ValueError:
371
390
  # no eligible nodes or no tips intersect samples; try again
@@ -380,7 +399,7 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
380
399
  cols_left = np.flatnonzero(col_quota > 0)
381
400
  if cols_left.size == 0:
382
401
  cols_left = np.arange(mask.shape[1])
383
- j = int(rng.choice(cols_left))
402
+ j = int(self.rng.choice(cols_left))
384
403
 
385
404
  # only edit eligible cells in this column
386
405
  eligible_rows = np.fromiter(
@@ -397,7 +416,7 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
397
416
  col_after = mask[present[:, j], j]
398
417
  if col_after.all():
399
418
  idx_present = np.flatnonzero(present[:, j])
400
- k = int(rng.choice(idx_present))
419
+ k = int(self.rng.choice(idx_present))
401
420
  mask[k, j] = False
402
421
 
403
422
  new_placed = int(mask.sum())
@@ -415,7 +434,7 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
415
434
  if col_idxs.size == 0:
416
435
  continue
417
436
  need = min(col_idxs.size, max(1, placed - target))
418
- to_clear = rng.choice(col_idxs, size=need, replace=False)
437
+ to_clear = self.rng.choice(col_idxs, size=need, replace=False)
419
438
  mask[to_clear, j] = False
420
439
 
421
440
  new_placed = int(mask.sum())
@@ -501,7 +520,6 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
501
520
  self.logger.error(msg)
502
521
  raise ValueError(msg)
503
522
 
504
- rng = np.random.default_rng() if rng is None else rng
505
523
  eps = 1e-12
506
524
 
507
525
  def _tf(arr: np.ndarray) -> np.ndarray:
@@ -538,9 +556,12 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
538
556
  probs[present & (col == c)] = pw
539
557
 
540
558
  if target_rate is not None:
541
- probs *= float(target_rate) # scale global intensity
559
+ mean_p = probs[present].mean()
560
+ if mean_p > 0:
561
+ probs *= float(target_rate) / mean_p
562
+ probs = np.clip(probs, 0.0, 1.0)
542
563
 
543
- draws = rng.random(n_samples)
564
+ draws = self.rng.random(n_samples)
544
565
  out_mask[:, j] = draws < probs
545
566
  out_mask[~present, j] = False # never alter already-missing
546
567
 
@@ -548,7 +569,7 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
548
569
  col_after = out_mask[present, j]
549
570
  if col_after.sum() == col_after.size:
550
571
  # clear a random observed index
551
- k = rng.integers(0, col_after.size)
572
+ k = self.rng.integers(0, col_after.size)
552
573
  out_mask[np.flatnonzero(present)[k], j] = False
553
574
 
554
575
  return out_mask
@@ -583,8 +604,6 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
583
604
  self.logger.error(msg)
584
605
  raise ValueError(msg)
585
606
 
586
- rng = np.random.default_rng() if rng is None else rng
587
-
588
607
  node_dict: dict[int | object, float] = {}
589
608
 
590
609
  if self.tree_parser is None or not hasattr(self.tree_parser, "tree"):
@@ -633,8 +652,8 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
633
652
  def _choose_key() -> object:
634
653
  if weighted and weights.sum() > 0.0:
635
654
  p = weights / weights.sum()
636
- return rng.choice(keys, p=p)
637
- return rng.choice(keys)
655
+ return self.rng.choice(keys, p=p)
656
+ return self.rng.choice(keys)
638
657
 
639
658
  tree = self.tree_parser.tree
640
659
  last_error: Optional[Exception] = None
@@ -698,17 +717,22 @@ class SimMissingTransformer(BaseEstimator, TransformerMixin):
698
717
  if col.size and col.all():
699
718
  # clear one random observed index
700
719
  idxs = np.flatnonzero(obs)
701
- k = np.random.randint(0, idxs.size)
720
+ k = self.rng.integers(0, idxs.size)
702
721
  self.mask_[idxs[k], j] = False
703
722
 
704
723
  def _mask_snps(self, X):
705
724
  """Mask positions in SimGenotypeData.snps and SimGenotypeData.onehot"""
706
- if len(X.shape) == 3:
725
+ if X.ndim == 3:
707
726
  # One-hot encoded.
708
727
  mask_val = [0.0, 0.0, 0.0, 0.0]
709
- elif len(X.shape) == 2:
728
+ elif X.ndim == 2:
710
729
  # 012-encoded.
711
- mask_val = -9
730
+ mask_val = (
731
+ float(self.missing_val)
732
+ if np.isnan(self.missing_val)
733
+ else self.missing_val
734
+ )
735
+
712
736
  else:
713
737
  raise ValueError(f"Invalid shape of input X: {X.shape}")
714
738