pg-sui 1.0.2.1 (py3-none-any.whl) → 1.6.8 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pg-sui might be problematic. Click here for more details.

Files changed (112)
  1. {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info}/METADATA +51 -70
  2. pg_sui-1.6.8.dist-info/RECORD +78 -0
  3. {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info}/WHEEL +1 -1
  4. pg_sui-1.6.8.dist-info/entry_points.txt +4 -0
  5. pg_sui-1.6.8.dist-info/top_level.txt +1 -0
  6. pgsui/__init__.py +35 -54
  7. pgsui/_version.py +34 -0
  8. pgsui/cli.py +635 -0
  9. pgsui/data_processing/config.py +576 -0
  10. pgsui/data_processing/containers.py +1782 -0
  11. pgsui/data_processing/transformers.py +121 -1103
  12. pgsui/electron/app/__main__.py +5 -0
  13. pgsui/electron/app/icons/icons/1024x1024.png +0 -0
  14. pgsui/electron/app/icons/icons/128x128.png +0 -0
  15. pgsui/electron/app/icons/icons/16x16.png +0 -0
  16. pgsui/electron/app/icons/icons/24x24.png +0 -0
  17. pgsui/electron/app/icons/icons/256x256.png +0 -0
  18. pgsui/electron/app/icons/icons/32x32.png +0 -0
  19. pgsui/electron/app/icons/icons/48x48.png +0 -0
  20. pgsui/electron/app/icons/icons/512x512.png +0 -0
  21. pgsui/electron/app/icons/icons/64x64.png +0 -0
  22. pgsui/electron/app/icons/icons/icon.icns +0 -0
  23. pgsui/electron/app/icons/icons/icon.ico +0 -0
  24. pgsui/electron/app/main.js +189 -0
  25. pgsui/electron/app/package-lock.json +6893 -0
  26. pgsui/electron/app/package.json +50 -0
  27. pgsui/electron/app/preload.js +15 -0
  28. pgsui/electron/app/server.py +146 -0
  29. pgsui/electron/app/ui/logo.png +0 -0
  30. pgsui/electron/app/ui/renderer.js +130 -0
  31. pgsui/electron/app/ui/styles.css +59 -0
  32. pgsui/electron/app/ui/ui_shim.js +72 -0
  33. pgsui/electron/bootstrap.py +43 -0
  34. pgsui/electron/launch.py +59 -0
  35. pgsui/electron/package.json +14 -0
  36. pgsui/example_data/popmaps/{test.popmap → phylogen_nomx.popmap} +185 -99
  37. pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz +0 -0
  38. pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz.tbi +0 -0
  39. pgsui/impute/deterministic/imputers/allele_freq.py +691 -0
  40. pgsui/impute/deterministic/imputers/mode.py +679 -0
  41. pgsui/impute/deterministic/imputers/nmf.py +221 -0
  42. pgsui/impute/deterministic/imputers/phylo.py +971 -0
  43. pgsui/impute/deterministic/imputers/ref_allele.py +530 -0
  44. pgsui/impute/supervised/base.py +339 -0
  45. pgsui/impute/supervised/imputers/hist_gradient_boosting.py +293 -0
  46. pgsui/impute/supervised/imputers/random_forest.py +287 -0
  47. pgsui/impute/unsupervised/base.py +924 -0
  48. pgsui/impute/unsupervised/callbacks.py +89 -263
  49. pgsui/impute/unsupervised/imputers/autoencoder.py +972 -0
  50. pgsui/impute/unsupervised/imputers/nlpca.py +1264 -0
  51. pgsui/impute/unsupervised/imputers/ubp.py +1288 -0
  52. pgsui/impute/unsupervised/imputers/vae.py +957 -0
  53. pgsui/impute/unsupervised/loss_functions.py +158 -0
  54. pgsui/impute/unsupervised/models/autoencoder_model.py +208 -558
  55. pgsui/impute/unsupervised/models/nlpca_model.py +149 -468
  56. pgsui/impute/unsupervised/models/ubp_model.py +198 -1317
  57. pgsui/impute/unsupervised/models/vae_model.py +259 -618
  58. pgsui/impute/unsupervised/nn_scorers.py +215 -0
  59. pgsui/utils/classification_viz.py +591 -0
  60. pgsui/utils/misc.py +35 -480
  61. pgsui/utils/plotting.py +514 -824
  62. pgsui/utils/scorers.py +212 -438
  63. pg_sui-1.0.2.1.dist-info/RECORD +0 -75
  64. pg_sui-1.0.2.1.dist-info/top_level.txt +0 -3
  65. pgsui/example_data/phylip_files/test_n10.phy +0 -118
  66. pgsui/example_data/phylip_files/test_n100.phy +0 -118
  67. pgsui/example_data/phylip_files/test_n2.phy +0 -118
  68. pgsui/example_data/phylip_files/test_n500.phy +0 -118
  69. pgsui/example_data/structure_files/test.nopops.1row.10sites.str +0 -117
  70. pgsui/example_data/structure_files/test.nopops.2row.100sites.str +0 -234
  71. pgsui/example_data/structure_files/test.nopops.2row.10sites.str +0 -234
  72. pgsui/example_data/structure_files/test.nopops.2row.30sites.str +0 -234
  73. pgsui/example_data/structure_files/test.nopops.2row.allsites.str +0 -234
  74. pgsui/example_data/structure_files/test.pops.1row.10sites.str +0 -117
  75. pgsui/example_data/structure_files/test.pops.2row.10sites.str +0 -234
  76. pgsui/example_data/trees/test.iqtree +0 -376
  77. pgsui/example_data/trees/test.qmat +0 -5
  78. pgsui/example_data/trees/test.rate +0 -2033
  79. pgsui/example_data/trees/test.tre +0 -1
  80. pgsui/example_data/trees/test_n10.rate +0 -19
  81. pgsui/example_data/trees/test_n100.rate +0 -109
  82. pgsui/example_data/trees/test_n500.rate +0 -509
  83. pgsui/example_data/trees/test_siterates.txt +0 -2024
  84. pgsui/example_data/trees/test_siterates_n10.txt +0 -10
  85. pgsui/example_data/trees/test_siterates_n100.txt +0 -100
  86. pgsui/example_data/trees/test_siterates_n500.txt +0 -500
  87. pgsui/example_data/vcf_files/test.vcf +0 -244
  88. pgsui/example_data/vcf_files/test.vcf.gz +0 -0
  89. pgsui/example_data/vcf_files/test.vcf.gz.tbi +0 -0
  90. pgsui/impute/estimators.py +0 -735
  91. pgsui/impute/impute.py +0 -1486
  92. pgsui/impute/simple_imputers.py +0 -1439
  93. pgsui/impute/supervised/iterative_imputer_fixedparams.py +0 -785
  94. pgsui/impute/supervised/iterative_imputer_gridsearch.py +0 -1027
  95. pgsui/impute/unsupervised/keras_classifiers.py +0 -702
  96. pgsui/impute/unsupervised/models/in_development/cnn_model.py +0 -486
  97. pgsui/impute/unsupervised/neural_network_imputers.py +0 -1424
  98. pgsui/impute/unsupervised/neural_network_methods.py +0 -1549
  99. pgsui/pg_sui.py +0 -261
  100. pgsui/utils/sequence_tools.py +0 -407
  101. simulation/sim_benchmarks.py +0 -333
  102. simulation/sim_treeparams.py +0 -475
  103. test/__init__.py +0 -0
  104. test/pg_sui_simtest.py +0 -215
  105. test/pg_sui_testing.py +0 -523
  106. test/test.py +0 -297
  107. test/test_pgsui.py +0 -374
  108. test/test_tkc.py +0 -214
  109. {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info/licenses}/LICENSE +0 -0
  110. /pgsui/{example_data/trees → electron/app}/__init__.py +0 -0
  111. /pgsui/impute/{unsupervised/models/in_development → supervised/imputers}/__init__.py +0 -0
  112. {simulation → pgsui/impute/unsupervised/imputers}/__init__.py +0 -0
@@ -1,1165 +1,183 @@
1
- import copy
2
- import os
1
+ # Standard library imports
3
2
  import logging
4
- import sys
5
- import warnings
6
-
7
- import numpy as np
8
- import pandas as pd
3
+ from typing import Literal
9
4
 
10
5
  # Third-party imports
11
6
  import numpy as np
12
- import pandas as pd
13
-
14
- from sklearn.base import BaseEstimator, TransformerMixin
15
- from sklearn.impute import SimpleImputer
16
- from sklearn.metrics import (
17
- roc_auc_score,
18
- precision_recall_fscore_support,
19
- average_precision_score,
20
- )
21
- from sklearn.preprocessing import label_binarize
22
-
23
- # Import tensorflow with reduced warnings.
24
- os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
25
- logging.getLogger("tensorflow").disabled = True
26
- warnings.filterwarnings("ignore", category=UserWarning)
27
-
28
- # noinspection PyPackageRequirements
29
- import tensorflow as tf
30
-
31
- # Disable can't find cuda .dll errors. Also turns of GPU support.
32
- tf.config.set_visible_devices([], "GPU")
33
-
34
- from tensorflow.python.util import deprecation
35
-
36
- # Disable warnings and info logs.
37
- tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
38
- tf.get_logger().setLevel(logging.ERROR)
39
-
40
-
41
- # Monkey patching deprecation utils to supress warnings.
42
- # noinspection PyUnusedLocal
43
- def deprecated(
44
- date, instructions, warn_once=True
45
- ): # pylint: disable=unused-argument
46
- def deprecated_wrapper(func):
47
- return func
48
-
49
- return deprecated_wrapper
50
-
51
-
52
- deprecation.deprecated = deprecated
53
-
54
- # Custom Modules
55
- try:
56
- from ..utils import misc
57
-
58
- except (ModuleNotFoundError, ValueError, ImportError):
59
- from pgsui.utils import misc
60
-
61
-
62
- # Pandas on pip gives a performance warning when doing the below code.
63
- # Apparently it's a bug that exists in the pandas version I used here.
64
- # It can be safely ignored.
65
- warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
66
-
67
-
68
- def encode_onehot(X):
69
- """Convert 012-encoded data to one-hot encodings.
70
- Args:
71
- X (numpy.ndarray): Input array with 012-encoded data and -9 as the missing data value.
72
- Returns:
73
- pandas.DataFrame: One-hot encoded data, ignoring missing values (np.nan).
74
- """
75
- Xt = np.zeros(shape=(X.shape[0], X.shape[1], 3))
76
- mappings = {
77
- 0: np.array([1, 0, 0]),
78
- 1: np.array([0, 1, 0]),
79
- 2: np.array([0, 0, 1]),
80
- -9: np.array([np.nan, np.nan, np.nan]),
81
- }
82
- for row in np.arange(X.shape[0]):
83
- Xt[row] = [mappings[enc] for enc in X[row]]
84
- return Xt
85
-
86
-
87
- def mle(row):
88
- """Get the Maximum Likelihood Estimation for the best prediction. Basically, it sets the index of the maxiumum value in a vector (row) to 1.0, since it is one-hot encoded.
89
-
90
- Args:
91
- row (numpy.ndarray(float)): Row vector with predicted values as floating points.
92
-
93
- Returns:
94
- numpy.ndarray(float): Row vector with the highest prediction set to 1.0 and the others set to 0.0.
95
- """
96
- res = np.zeros(row.shape[0])
97
- res[np.argmax(row)] = 1
98
- return res
99
-
100
-
101
- class UBPInputTransformer(BaseEstimator, TransformerMixin):
102
- """Transform input X prior to estimator fitting.
103
-
104
- Args:
105
- n_components (int): Number of principal components currently being used in V.
106
-
107
- V (numpy.ndarray or Dict[str, Any]): If doing grid search, should be a dictionary with current_component: numpy.ndarray. If not doing grid search, then it should be a numpy.ndarray.
108
- """
109
-
110
- def __init__(self, n_components, V):
111
- self.n_components = n_components
112
- self.V = V
113
-
114
- def fit(self, X):
115
- """Fit transformer to input data X.
116
-
117
- Args:
118
- X (numpy.ndarray): Input data to fit. If numpy.ndarray, then should be of shape (n_samples, n_components). If dictionary, then should be component: numpy.ndarray.
119
-
120
- Returns:
121
- self: Class instance.
122
- """
123
- self.n_features_in_ = self.n_components
124
- return self
125
-
126
- def transform(self, X):
127
- """Transform input data X to the needed format.
128
-
129
- Args:
130
- X (numpy.ndarray): Input data to fit. If numpy.ndarray, then should be of shape (n_samples, n_components). If dictionary, then should be component: numpy.ndarray.
131
-
132
- Returns:
133
- numpy.ndarray: Formatted input data with correct component.
134
-
135
- Raises:
136
- TypeError: V must be a dictionary if phase is None or phase == 1.
137
- TypeError: V must be a numpy array if phase is 2 or 3.
138
- """
139
- if not isinstance(self.V, dict):
140
- raise TypeError(f"V must be a dictionary, but got {type(self.V)}")
141
- return self.V[self.n_components]
142
7
 
143
8
 
144
- class AutoEncoderFeatureTransformer(BaseEstimator, TransformerMixin):
145
- """Transformer to format autoencoder features and targets before model fitting.
9
+ class SimGenotypeDataTransformer:
10
+ """Simulates missing genotypes at the locus level on a 2D integer matrix.
146
11
 
147
- The input data, X, is encoded to one-hot format, and then missing values are filled to [-1] * num_classes.
148
-
149
- Missing and observed boolean masks are also generated.
12
+ This transformer masks a proportion of known genotypes in the input matrix X, setting them to a specified missing value. The masking can be done randomly or based on inverse genotype frequencies, with an option to boost the likelihood of masking heterozygous genotypes.
150
13
 
151
14
  Args:
152
- num_classes (int, optional): The number of classes in the last axis dimention of the input array. Defaults to 3.
153
-
154
- return_int (bool, optional): Whether to return an integer-encoded array (If True) or a one-hot or multi-label encoded array (If False.). Defaults to False.
155
-
156
- activate (str or None, optional): If not None, then does the appropriate activation. Multilabel learning uses sigmoid activation, and multiclass uses softmax. If set to None, then the function assumes that the input has already been activated. Possible values include: {None, 'sigmoid', 'softmax'}. Defaults to None.
157
- """
158
-
159
- def __init__(self, num_classes=3, return_int=False, activate=None):
160
- self.num_classes = num_classes
161
- self.return_int = return_int
162
- self.activate = activate
163
-
164
- def fit(self, X, y=None):
165
- """set attributes used to transform X (input features).
166
-
167
- Args:
168
- X (numpy.ndarray): Input integer-encoded numpy array.
169
-
170
- y (None): Just for compatibility with sklearn API.
171
- """
172
- X = misc.validate_input_type(X, return_type="array")
173
-
174
- self.X_decoded = X
175
-
176
- # VAE uses 4 classes ([A,T,G,C]), SAE uses 3 ([0,1,2]).
177
- if self.num_classes == 3:
178
- enc_func = self.encode_012
179
- elif self.num_classes == 4:
180
- enc_func = self.encode_multilab
181
- elif self.num_classes == 10:
182
- enc_func = self.encode_multiclass
183
- else:
184
- raise ValueError(
185
- f"Invalid value passed to num_classes in "
186
- f"AutoEncoderFeatureTransformer. Only 3 or 4 are supported, "
187
- f"but got {self.num_classes}."
188
- )
189
-
190
- # Encode the data.
191
- self.X_train = enc_func(X)
192
- self.classes_ = np.arange(self.num_classes)
193
- self.n_classes_ = self.num_classes
194
-
195
- # Get missing and observed data boolean masks.
196
- self.missing_mask_, self.observed_mask_ = self._get_masks(self.X_train)
197
-
198
- # To accomodate multiclass-multioutput.
199
- self.n_outputs_expected_ = 1
200
-
201
- self.n_outputs_ = self.X_train.shape[1]
202
-
203
- return self
204
-
205
- def transform(self, X):
206
- """Transform X to one-hot encoded format.
207
-
208
- Accomodates multiclass targets with a 3D shape.
209
-
210
- Args:
211
- X (numpy.ndarray): One-hot encoded target data of shape (n_samples, n_features, num_classes).
212
-
213
- Returns:
214
- numpy.ndarray: Transformed target data in one-hot format of shape (n_samples, n_features, num_classes).
215
- """
216
- if self.return_int:
217
- return X
218
- else:
219
- # X = misc.validate_input_type(X, return_type="array")
220
- return self._fill(self.X_train, self.missing_mask_)
221
-
222
- def inverse_transform(self, y, return_proba=False):
223
- """Transform target to output format.
224
-
225
- Args:
226
- y (numpy.ndarray): Array to inverse transform.
227
-
228
- return_proba (bool): Just for compatibility with scikeras API.
229
- """
230
- try:
231
- if self.activate is None:
232
- y = y.numpy()
233
- elif self.activate == "softmax":
234
- y = tf.nn.softmax(y).numpy()
235
- elif self.activate == "sigmoid":
236
- y = tf.nn.sigmoid(y).numpy()
237
- else:
238
- raise ValueError(
239
- f"Invalid value passed to keyword argument activate. Valid "
240
- f"options include: None, 'softmax', or 'sigmoid', but got "
241
- f"{self.activate}"
242
- )
243
- except AttributeError:
244
- # If numpy array already.
245
- if self.activate is None:
246
- y = y.copy()
247
- elif self.activate == "softmax":
248
- y = tf.nn.softmax(tf.convert_to_tensor(y)).numpy()
249
- elif self.activate == "sigmoid":
250
- y = tf.nn.sigmoid(tf.convert_to_tensor(y)).numpy()
251
- else:
252
- raise ValueError(
253
- f"Invalid value passed to keyword argument activate. Valid "
254
- f"options include: None, 'softmax', or 'sigmoid', but got "
255
- f"{self.activate}"
256
- )
257
- return y
258
-
259
- def encode_012(self, X):
260
- """Convert 012-encoded data to one-hot encodings.
261
- Args:
262
- X (numpy.ndarray): Input array with 012-encoded data and -9 as the missing data value.
263
- Returns:
264
- pandas.DataFrame: One-hot encoded data, ignoring missing values (np.nan).
265
- """
266
- Xt = np.zeros(shape=(X.shape[0], X.shape[1], 3))
267
- mappings = {
268
- 0: np.array([1, 0, 0]),
269
- 1: np.array([0, 1, 0]),
270
- 2: np.array([0, 0, 1]),
271
- -9: np.array([np.nan, np.nan, np.nan]),
272
- }
273
- for row in np.arange(X.shape[0]):
274
- Xt[row] = [mappings[enc] for enc in X[row]]
275
- return Xt
276
-
277
- def encode_multilab(self, X, multilab_value=1.0):
278
- """Encode 0-9 integer data in multi-label one-hot format.
279
- Args:
280
- X (numpy.ndarray): Input array with 012-encoded data and -9 as the missing data value.
281
-
282
- multilab_value (float): Value to use for multilabel target encodings. Defaults to 0.5.
283
- Returns:
284
- pandas.DataFrame: One-hot encoded data, ignoring missing values (np.nan). multi-label categories will be encoded as 0.5. Otherwise, it will be 1.0.
285
- """
286
- Xt = np.zeros(shape=(X.shape[0], X.shape[1], 4))
287
- mappings = {
288
- 0: [1.0, 0.0, 0.0, 0.0],
289
- 1: [0.0, 1.0, 0.0, 0.0],
290
- 2: [0.0, 0.0, 1.0, 0.0],
291
- 3: [0.0, 0.0, 0.0, 1.0],
292
- 4: [multilab_value, multilab_value, 0.0, 0.0],
293
- 5: [multilab_value, 0.0, multilab_value, 0.0],
294
- 6: [multilab_value, 0.0, 0.0, multilab_value],
295
- 7: [0.0, multilab_value, multilab_value, 0.0],
296
- 8: [0.0, multilab_value, 0.0, multilab_value],
297
- 9: [0.0, 0.0, multilab_value, multilab_value],
298
- -9: [np.nan, np.nan, np.nan, np.nan],
299
- }
300
- for row in np.arange(X.shape[0]):
301
- Xt[row] = [mappings[enc] for enc in X[row]]
302
- return Xt
303
-
304
- def decode_multilab(self, X, multilab_value=1.0):
305
- """Decode one-hot format data back to 0-9 integer data.
306
-
307
- Args:
308
- X (numpy.ndarray): Input array with one-hot-encoded data.
309
-
310
- multilab_value (float): Value to use for multilabel target encodings. Defaults to 0.5.
311
-
312
- Returns:
313
- pandas.DataFrame: Decoded data, with multi-label categories decoded to their original integer representation.
314
- """
315
- Xt = np.zeros(shape=(X.shape[0], X.shape[1]))
316
- mappings = {
317
- tuple([1.0, 0.0, 0.0, 0.0]): 0,
318
- tuple([0.0, 1.0, 0.0, 0.0]): 1,
319
- tuple([0.0, 0.0, 1.0, 0.0]): 2,
320
- tuple([0.0, 0.0, 0.0, 1.0]): 3,
321
- tuple([multilab_value, multilab_value, 0.0, 0.0]): 4,
322
- tuple([multilab_value, 0.0, multilab_value, 0.0]): 5,
323
- tuple([multilab_value, 0.0, 0.0, multilab_value]): 6,
324
- tuple([0.0, multilab_value, multilab_value, 0.0]): 7,
325
- tuple([0.0, multilab_value, 0.0, multilab_value]): 8,
326
- tuple([0.0, 0.0, multilab_value, multilab_value]): 9,
327
- tuple([np.nan, np.nan, np.nan, np.nan]): -9,
328
- }
329
- for row in np.arange(X.shape[0]):
330
- Xt[row] = [mappings[tuple(enc)] for enc in X[row]]
331
- return Xt
332
-
333
- def encode_multiclass(self, X, num_classes=10, missing_value=-9):
334
- """Encode 0-9 integer data in multi-class one-hot format.
335
-
336
- Missing values get encoded as ``[np.nan] * num_classes``
337
- Args:
338
- X (numpy.ndarray): Input array with 012-encoded data and ``missing_value`` as the missing data value.
339
-
340
- num_classes (int, optional): Number of classes to use. Defaults to 10.
341
-
342
- missing_value (int, optional): Missing data value to replace with ``[np.nan] * num_classes``\. Defaults to -9.
343
- Returns:
344
- pandas.DataFrame: Multi-class one-hot encoded data, ignoring missing values (np.nan).
345
- """
346
- int_cats, ohe_arr = np.arange(num_classes), np.eye(num_classes)
347
- mappings = dict(zip(int_cats, ohe_arr))
348
- mappings[missing_value] = np.array([np.nan] * num_classes)
349
-
350
- Xt = np.zeros(shape=(X.shape[0], X.shape[1], num_classes))
351
- for row in np.arange(X.shape[0]):
352
- Xt[row] = [mappings[enc] for enc in X[row]]
353
- return Xt
354
-
355
- def _fill(self, data, missing_mask, missing_value=-1):
356
- """Mask missing data as ``missing_value``\.
357
-
358
- Args:
359
- data (numpy.ndarray): Input with missing values of shape (n_samples, n_features, num_classes).
360
-
361
- missing_mask (np.ndarray(bool)): Missing data mask with True corresponding to a missing value.
362
-
363
- missing_value (int): Value to set missing data to. If a list is provided, then its length should equal the number of one-hot classes.
364
- """
365
- if self.num_classes > 1:
366
- missing_value = [missing_value] * self.num_classes
367
- data[missing_mask] = missing_value
368
- return data
369
-
370
- def _get_masks(self, X):
371
- """Format the provided target data for use with UBP/NLPCA.
372
-
373
- Args:
374
- y (numpy.ndarray(float)): Input data that will be used as the target of shape (n_samples, n_features, num_classes).
375
-
376
- Returns:
377
- numpy.ndarray(float): Missing data mask, with missing values encoded as 1's and non-missing as 0's.
378
-
379
- numpy.ndarray(float): Observed data mask, with non-missing values encoded as 1's and missing values as 0's.
380
- """
381
- missing_mask = self._create_missing_mask(X)
382
- observed_mask = ~missing_mask
383
- return missing_mask, observed_mask
384
-
385
- def _create_missing_mask(self, data):
386
- """Creates a missing data mask with boolean values.
387
- Args:
388
- data (numpy.ndarray): Data to generate missing mask from, of shape (n_samples, n_features, n_classes).
389
- Returns:
390
- numpy.ndarray(bool): Boolean mask of missing values of shape (n_samples, n_features), with True corresponding to a missing data point.
391
- """
392
- return np.isnan(data).all(axis=2)
393
-
394
-
395
- class MLPTargetTransformer(BaseEstimator, TransformerMixin):
396
- """Transformer to format UBP / NLPCA target data both before and after model fitting."""
397
-
398
- def fit(self, y):
399
- """Fit 012-encoded target data.
400
-
401
- Args:
402
- y (numpy.ndarray): Target data that is 012-encoded.
403
-
404
- Returns:
405
- self: Class instance.
406
- """
407
- y = misc.validate_input_type(y, return_type="array")
408
-
409
- # Original 012-encoded y
410
- self.y_decoded_ = y
411
-
412
- y_train = encode_onehot(y)
413
-
414
- # Get missing and observed data boolean masks.
415
- self.missing_mask_, self.observed_mask_ = self._get_masks(y_train)
416
-
417
- # To accomodate multiclass-multioutput.
418
- self.n_outputs_expected_ = 1
419
-
420
- return self
421
-
422
- def transform(self, y):
423
- """Transform y_true to one-hot encoded.
424
-
425
- Accomodates multiclass-multioutput targets.
426
-
427
- Args:
428
- y (numpy.ndarray): One-hot encoded target data.
429
-
430
- Returns:
431
- numpy.ndarray: y_true target data.
432
- """
433
- y = misc.validate_input_type(y, return_type="array")
434
- y_train = encode_onehot(y)
435
- return self._fill(y_train, self.missing_mask_)
436
-
437
- def inverse_transform(self, y):
438
- """Decode y_pred from one-hot to 012-based encoding.
439
-
440
- This allows sklearn.metrics to be used.
441
-
442
- Args:
443
- y (numpy.ndarray): One-hot encoded predicted probabilities after model fitting.
444
-
445
- Returns:
446
- numpy.ndarray: y predictions in same format as y_true.
447
- """
448
- # VAE has tuple output
449
- if isinstance(y, tuple):
450
- y = y[0]
451
-
452
- # Return predictions.
453
- return tf.nn.softmax(y).numpy()
454
-
455
- def _fill(self, data, missing_mask, missing_value=-1, num_classes=3):
456
- """Mask missing data as ``missing_value``\.
457
-
458
- Args:
459
- data (numpy.ndarray): Input with missing values of shape (n_samples, n_features, num_classes).
460
-
461
- missing_mask (np.ndarray(bool)): Missing data mask with True corresponding to a missing value.
462
-
463
- missing_value (int): Value to set missing data to. If a list is provided, then its length should equal the number of one-hot classes. Defaults to -1.
464
-
465
- num_classes (int): Number of classes in dataset. Defaults to 3.
466
- """
467
- if num_classes > 1:
468
- missing_value = [missing_value] * num_classes
469
- data[missing_mask] = missing_value
470
- return data
471
-
472
- def _get_masks(self, X):
473
- """Format the provided target data for use with UBP/NLPCA.
474
-
475
- Args:
476
- X (numpy.ndarray(float)): Input data that will be used as the target.
477
-
478
- Returns:
479
- numpy.ndarray(float): Missing data mask, with missing values encoded as 1's and non-missing as 0's.
480
-
481
- numpy.ndarray(float): Observed data mask, with non-missing values encoded as 1's and missing values as 0's.
482
- """
483
- missing_mask = self._create_missing_mask(X)
484
- observed_mask = ~missing_mask
485
- return missing_mask, observed_mask
486
-
487
- def _create_missing_mask(self, data):
488
- """Creates a missing data mask with boolean values.
489
- Args:
490
- data (numpy.ndarray): Data to generate missing mask from, of shape (n_samples, n_features, n_classes).
491
- Returns:
492
- numpy.ndarray(bool): Boolean mask of missing values of shape (n_samples, n_features), with True corresponding to a missing data point.
493
- """
494
- return np.isnan(data).all(axis=2)
495
-
496
- def _decode(self, y):
497
- """Evaluate UBP / NLPCA predictions by calculating the highest predicted value.
498
-
499
- Calucalates highest predicted value for each row vector and each class, setting the most likely class to 1.0.
500
-
501
- Args:
502
- y (numpy.ndarray): Input one-hot encoded data.
503
-
504
- Returns:
505
- numpy.ndarray: Imputed one-hot encoded values.
506
- """
507
- Xprob = y
508
- Xt = np.apply_along_axis(mle, axis=2, arr=Xprob)
509
- Xpred = np.argmax(Xt, axis=2)
510
- Xtrue = np.argmax(y, axis=2)
511
- Xdecoded = np.zeros((Xpred.shape[0], Xpred.shape[1]))
512
- for idx in np.arange(Xdecoded):
513
- imputed_idx = np.where(self.observed_mask_[idx] == 0)
514
- known_idx = np.nonzero(self.observed_mask_[idx])
515
- Xdecoded[idx, imputed_idx] = Xpred[idx, imputed_idx]
516
- Xdecoded[idx, known_idx] = Xtrue[idx, known_idx]
517
- return Xdecoded.astype("int8")
518
-
519
-
520
- class UBPTargetTransformer(BaseEstimator, TransformerMixin):
521
- """Transformer to format UBP / NLPCA target data both before model fitting.
522
-
523
- Examples:
524
- >>>ubp_tt = UBPTargetTransformer()
525
- >>>y_train = ubp_tt.fit_transform(y)
526
- """
527
-
528
- def fit(self, y):
529
- """Fit 012-encoded target data.
530
-
531
- Args:
532
- y (numpy.ndarray): Target data that is 012-encoded, of shape (n_samples, n_features).
533
-
534
- Returns:
535
- self: Class instance.
536
- """
537
- y = misc.validate_input_type(y, return_type="array")
538
-
539
- # Original 012-encoded y
540
- self.y_decoded_ = y
541
-
542
- # One-hot encode y.
543
- y_train = encode_onehot(y)
544
-
545
- # Get missing and observed data boolean masks.
546
- self.missing_mask_, self.observed_mask_ = self._get_masks(y_train)
547
-
548
- # To accomodate multiclass-multioutput.
549
- self.n_outputs_expected_ = 1
550
-
551
- return self
552
-
553
- def transform(self, y):
554
- """Transform 012-encoded target to one-hot encoded format.
555
-
556
- Accomodates multiclass-multioutput targets.
557
-
558
- Args:
559
- y (numpy.ndarray): One-hot encoded target data of shape (n_samples, n_features).
560
-
561
- Returns:
562
- numpy.ndarray: y_true target data.
563
- """
564
- y = misc.validate_input_type(y, return_type="array")
565
- y_train = encode_onehot(y)
566
- return self._fill(y_train, self.missing_mask_)
567
-
568
- def inverse_transform(self, y):
569
- """Decode y_predicted from one-hot to 012-integer encoding.
570
-
571
- Performs a softmax activation for multiclass classification.
572
-
573
- This allows sklearn.metrics to be used.
574
-
575
- Args:
576
- y (numpy.ndarray): One-hot encoded predicted probabilities after model fitting, of shape (n_samples, n_features, num_classes).
577
-
578
- Returns:
579
- numpy.ndarray: y predictions in same format as y_true (n_samples, n_features).
580
- """
581
- return tf.nn.softmax(y).numpy()
582
-
583
- def _fill(self, data, missing_mask, missing_value=-1, num_classes=3):
584
- """Mask missing data as ``missing_value``\.
585
-
586
- Args:
587
- data (numpy.ndarray): Input with missing values of shape (n_samples, n_features, num_classes).
588
-
589
- missing_mask (np.ndarray(bool)): Missing data mask with True corresponding to a missing value, of shape (n_samples, n_features).
590
-
591
- missing_value (int, optional): Value to set missing data to. If a list is provided, then its length should equal the number of one-hot classes. Defaults to -1.
592
-
593
- num_classes (int, optional): Number of classes to use. Defaults to 3.
594
- """
595
- if num_classes > 1:
596
- missing_value = [missing_value] * num_classes
597
- data[missing_mask] = missing_value
598
- return data
599
-
600
- def _get_masks(self, y):
601
- """Format the provided target data for use with UBP/NLPCA models.
602
-
603
- Args:
604
- y (numpy.ndarray(float)): Input data that will be used as the target of shape (n_samples, n_features, num_classes).
605
-
606
- Returns:
607
- numpy.ndarray(float): Missing data mask, with missing values encoded as 1's and non-missing as 0's.
608
-
609
- numpy.ndarray(float): Observed data mask, with non-missing values encoded as 1's and missing values as 0's.
610
- """
611
- missing_mask = self._create_missing_mask(y)
612
- observed_mask = ~missing_mask
613
- return missing_mask, observed_mask
614
-
615
- def _create_missing_mask(self, data):
616
- """Creates a missing data mask with boolean values.
617
-
618
- Args:
619
- data (numpy.ndarray): Data to generate missing mask from, of shape (n_samples, n_features, n_classes).
620
-
621
- Returns:
622
- numpy.ndarray(bool): Boolean mask of missing values of shape (n_samples, n_features), with True corresponding to a missing data point.
623
- """
624
- return np.isnan(data).all(axis=2)
625
-
626
- def _decode(self, y):
627
- """Evaluate UBP/NLPCA predictions by calculating the argmax.
628
-
629
- Calucalates highest predicted value for each row vector and each class, setting the most likely class to 1.0.
630
-
631
- Args:
632
- y (numpy.ndarray): Input one-hot encoded data of shape (n_samples, n_features, num_classes).
633
-
634
- Returns:
635
- numpy.ndarray: Imputed one-hot encoded values.
636
- """
637
- Xprob = y
638
- Xt = np.apply_along_axis(mle, axis=2, arr=Xprob)
639
- Xpred = np.argmax(Xt, axis=2)
640
- Xtrue = np.argmax(y, axis=2)
641
- Xdecoded = np.zeros((Xpred.shape[0], Xpred.shape[1]))
642
- for idx in np.arange(Xdecoded):
643
- imputed_idx = np.where(self.observed_mask_[idx] == 0)
644
- known_idx = np.nonzero(self.observed_mask_[idx])
645
- Xdecoded[idx, imputed_idx] = Xpred[idx, imputed_idx]
646
- Xdecoded[idx, known_idx] = Xtrue[idx, known_idx]
647
- return Xdecoded.astype("int8")
648
-
649
-
650
- class SimGenotypeDataTransformer(BaseEstimator, TransformerMixin):
651
- """Simulate missing data on genotypes read/ encoded in a GenotypeData object.
652
-
653
- Copies metadata from a GenotypeData object and simulates user-specified proportion of missing data
654
-
655
- Args:
656
- genotype_data (GenotypeData object): GenotypeData instance.
657
-
658
- prop_missing (float, optional): Proportion of missing data desired in output. Defaults to 0.1
659
-
660
- strategy (str, optional): Strategy for simulating missing data. May be one of: "nonrandom", "nonrandom_weighted", "random_weighted", "random_weighted_inv", or "random". When set to "nonrandom", branches from GenotypeData.guidetree will be randomly sampled to generate missing data on descendant nodes. For "nonrandom_weighted", missing data will be placed on nodes proportionally to their branch lengths (e.g., to generate data distributed as might be the case with mutation-disruption of RAD sites). Defaults to "random"
661
-
662
- missing_val (int, optional): Value that represents missing data. Defaults to -9.
663
-
664
- mask_missing (bool, optional): True if you want to skip original missing values when simulating new missing data, False otherwise. Defaults to True.
665
-
666
- verbose (bool, optional): Verbosity level. Defaults to 0.
667
-
668
- tol (float): Tolerance to reach proportion specified in self.prop_missing. Defaults to 1/num_snps*num_inds
669
-
670
- max_tries (int): Maximum number of tries to reach targeted missing data proportion within specified tol. If None, num_inds will be used. Defaults to None.
671
-
672
- Attributes:
673
-
674
- original_missing_mask_ (numpy.ndarray): Array with boolean mask for original missing locations.
675
-
676
- simulated_missing_mask_ (numpy.ndarray): Array with boolean mask for simulated missing locations, excluding the original ones.
677
-
678
- all_missing_mask_ (numpy.ndarray): Array with boolean mask for all missing locations, including both simulated and original.
679
-
680
- Properties:
681
- missing_count (int): Number of genotypes masked by chosen missing data strategy
682
-
683
- prop_missing_real (float): True proportion of missing data generated using chosen strategy
684
-
685
- mask (numpy.ndarray): 2-dimensional array tracking the indices of sampled missing data sites (n_samples, n_sites)
15
+ prop_missing (float): Proportion of *known* loci to mask (0..1).
16
+ strategy (Literal): Strategy name.
17
+ missing_val (int): Missing code value (default: -9).
18
+ seed (int | None): RNG seed.
19
+ logger (logging.Logger | None): Logger for messages.
20
+ het_boost (float): Multiplier for heterozygotes in inv-genotype mode.
686
21
  """
687
22
 
688
23
  def __init__(
689
24
  self,
690
- genotype_data,
691
25
  *,
692
- prop_missing=0.1,
693
- strategy="random",
694
- missing_val=-9,
695
- mask_missing=True,
696
- verbose=0,
697
- tol=None,
698
- max_tries=None,
699
- ) -> None:
700
- self.genotype_data = genotype_data
701
- self.prop_missing = prop_missing
26
+ prop_missing: float = 0.1,
27
+ strategy: Literal["random", "random_inv_genotype"] = "random",
28
+ missing_val: int = -1,
29
+ seed: int | None = None,
30
+ logger: logging.Logger | None = None,
31
+ het_boost: float = 1.0,
32
+ ):
33
+ self.prop_missing = float(prop_missing)
702
34
  self.strategy = strategy
703
- self.missing_val = missing_val
704
- self.mask_missing = mask_missing
705
- self.verbose = verbose
706
- self.tol = tol
707
- self.max_tries = max_tries
708
-
709
- def fit(self, X):
710
- """Fit to input data X by simulating missing data.
35
+ self.missing_val = int(missing_val)
36
+ self.seed = seed
37
+ self.rng = np.random.default_rng(seed)
38
+ self.het_boost = float(het_boost)
39
+ self.logger = logger or logging.getLogger(__name__)
711
40
 
712
- Missing data will be simulated in varying ways depending on the ``strategy`` setting.
41
+ def fit(self, X, y=None) -> "SimGenotypeDataTransformer":
42
+ """Stateless.
713
43
 
714
44
  Args:
715
- X (pandas.DataFrame, numpy.ndarray, or List[List[int]]): Data with which to simulate missing data. It should have already been imputed with one of the non-machine learning simple imputers, and there should be no missing data present in X.
716
-
717
- Raises:
718
- TypeError: SimGenotypeData.tree must not be NoneType when using strategy="nonrandom" or "nonrandom_weighted".
719
-
720
- ValueError: Invalid ``strategy`` parameter provided.
45
+ X (np.ndarray): (n_samples, n_features), integer codes {0..9} or <0 as missing.
46
+ y: Ignored.
721
47
  """
722
- X = misc.validate_input_type(X, return_type="array").astype("float32")
723
-
724
- if self.verbose > 0:
725
- print(
726
- f"\nAdding {self.prop_missing} missing data per column "
727
- f"using strategy: {self.strategy}"
728
- )
729
-
730
- if np.all(np.isnan(np.array([self.missing_val])) == False):
731
- X[X == self.missing_val] = np.nan
732
-
733
- self.original_missing_mask_ = np.isnan(X)
734
-
735
- if self.strategy == "random":
736
- if self.mask_missing:
737
- # Get indexes where non-missing (Xobs) and missing (Xmiss).
738
- Xobs = np.where(~self.original_missing_mask_.ravel())[0]
739
- Xmiss = np.where(self.original_missing_mask_.ravel())[0]
740
-
741
- # Generate mask of 0's (non-missing) and 1's (missing).
742
- obs_mask = np.random.choice(
743
- [0, 1],
744
- size=Xobs.size,
745
- p=((1 - self.prop_missing), self.prop_missing),
746
- ).astype(bool)
747
-
748
- # Make missing data mask.
749
- mask = np.zeros(X.size)
750
- mask[Xobs] = obs_mask
751
- mask[Xmiss] = 1
752
-
753
- # Reshape from raveled to 2D.
754
- # With strategy=="random", mask_ is equal to all_missing_.
755
- self.mask_ = np.reshape(mask, X.shape)
756
-
757
- else:
758
- # Generate mask of 0's (non-missing) and 1's (missing).
759
- self.mask_ = np.random.choice(
760
- [0, 1],
761
- size=X.shape,
762
- p=((1 - self.prop_missing), self.prop_missing),
763
- ).astype(bool)
764
-
765
- # Make sure no entirely missing columns were simulated.
766
- self._validate_mask()
767
-
768
- elif self.strategy == "random_weighted":
769
- self.mask_ = self.random_weighted_missing_data(X, inv=False)
770
-
771
- elif self.strategy == "random_weighted_inv":
772
- self.mask_ = self.random_weighted_missing_data(X, inv=True)
773
-
774
- elif (
775
- self.strategy == "nonrandom"
776
- or self.strategy == "nonrandom_weighted"
777
- ):
778
- if self.genotype_data.tree is None:
779
- raise TypeError(
780
- "SimGenotypeData.tree cannot be NoneType when "
781
- "strategy='nonrandom' or 'nonrandom_weighted'"
782
- )
783
-
784
- mask = np.full_like(X, 0.0, dtype=bool)
785
-
786
- if self.strategy == "nonrandom_weighted":
787
- weighted = True
788
- else:
789
- weighted = False
790
-
791
- sample_map = dict()
792
- for i, sample in enumerate(self.genotype_data.samples):
793
- sample_map[sample] = i
794
-
795
- # if no tolerance provided, set to 1 snp position
796
- if self.tol is None:
797
- self.tol = 1.0 / mask.size
798
-
799
- # if no max_tries provided, set to # inds
800
- if self.max_tries is None:
801
- self.max_tries = mask.shape[0]
802
-
803
- filled = False
804
- while not filled:
805
- # Get list of samples from tree
806
- samples = self._sample_tree(
807
- internal_only=False, skip_root=True, weighted=weighted
808
- )
809
-
810
- # Convert to row indices
811
- rows = [sample_map[i] for i in samples]
812
-
813
- # Randomly sample a column
814
- col_idx = np.random.randint(0, mask.shape[1])
815
- sampled_col = copy.copy(mask[:, col_idx])
816
- miss_mask = copy.copy(self.original_missing_mask_[:, col_idx])
817
-
818
- # Mask column
819
- sampled_col[rows] = True
820
-
821
- # If original was missing, set back to False.
822
- if self.mask_missing:
823
- sampled_col[miss_mask] = False
824
-
825
- # check that column is not 100% missing now
826
- # if yes, sample again
827
- if np.sum(sampled_col) == sampled_col.size:
828
- continue
829
-
830
- # if not, set values in mask matrix
831
- else:
832
- mask[:, col_idx] = sampled_col
833
-
834
- # if this addition pushes missing % > self.prop_missing,
835
- # check previous prop_missing, remove masked samples from
836
- # this column until closest to target prop_missing
837
- current_prop = np.sum(mask) / mask.size
838
- if abs(current_prop - self.prop_missing) <= self.tol:
839
- filled = True
840
- break
841
- elif current_prop > self.prop_missing:
842
- tries = 0
843
- while (
844
- abs(current_prop - self.prop_missing) > self.tol
845
- and tries < self.max_tries
846
- ):
847
- r = np.random.randint(0, mask.shape[0])
848
- c = np.random.randint(0, mask.shape[1])
849
- mask[r, c] = False
850
- tries += 1
851
- current_prop = np.sum(mask) / mask.size
852
-
853
- filled = True
854
- else:
855
- continue
856
-
857
- # With strategy=="nonrandom" or "nonrandom_weighted",
858
- # mask_ is equal to sim_missing_mask_ if mask_missing is True.
859
- # Otherwise it is equal to all_missing_.
860
- self.mask_ = mask
861
-
862
- self._validate_mask()
863
-
864
- else:
865
- raise ValueError(
866
- "Invalid SimGenotypeData.strategy value:", self.strategy
867
- )
868
-
869
- # Get all missing values.
870
- self.all_missing_mask_ = np.logical_or(
871
- self.mask_, self.original_missing_mask_
872
- )
873
- # Get values where original value was not missing and simulated.
874
- # data is missing.
875
- self.sim_missing_mask_ = np.logical_and(
876
- self.all_missing_mask_, self.original_missing_mask_ == False
877
- )
878
-
879
- self._validate_mask(mask=self.mask_missing)
880
-
881
48
  return self
882
49
 
883
- def transform(self, X):
884
- """Function to generate masked sites in a SimGenotypeData object
50
+ def transform(self, X: np.ndarray) -> tuple[np.ndarray, dict]:
51
+ """Apply missing-data simulation on a 2D genotype matrix.
885
52
 
886
53
  Args:
887
- X (pandas.DataFrame, numpy.ndarray, or List[List[int]]): Data to transform. No missing data should be present in X. It should have already been imputed with one of the non-machine learning simple imputers.
54
+ X (np.ndarray): (n_samples, n_features), integer codes {0..9} or <0 as missing.
888
55
 
889
56
  Returns:
890
- numpy.ndarray: Transformed data with missing data added.
57
+ tuple[np.ndarray, dict]: (X_masked, masks) where masks has keys: 'original': original missing (boolean 2D). 'simulated': loci masked here (boolean 2D). 'all': union of original + simulated (boolean 2D)
891
58
  """
892
- X = misc.validate_input_type(X, return_type="array")
893
-
894
- # mask 012-encoded and one-hot encoded genotypes.
895
- return self._mask_snps(X)
896
-
897
- def accuracy(self, X_true, X_pred):
898
- """Calculate imputation accuracy of the simulated genotypes.
59
+ if X.ndim != 2:
60
+ msg = f"X must be 2D, got shape {X.shape}"
61
+ self.logger.error(msg)
62
+ raise ValueError(msg)
899
63
 
900
- Args:
901
- X_true (np.ndarray): True values.
902
-
903
- X_pred (np.ndarray): Imputed values.
64
+ X = np.asarray(X)
65
+ original_mask = X < 0
904
66
 
905
- Returns:
906
- float: Accuracy score between X_true and X_pred.
907
- '"""
908
- masked_sites = np.sum(self.sim_missing_mask_)
909
- num_correct = np.sum(
910
- X_true[self.sim_missing_mask_] == X_pred[self.sim_missing_mask_]
911
- )
912
- return num_correct / masked_sites
67
+ sim_mask = self._simulate_missing_mask(X, original_mask)
68
+ sim_mask = sim_mask & (~original_mask)
69
+ sim_mask = self._validate_mask(sim_mask)
913
70
 
914
- def auc_roc_pr_ap(self, X_true, X_pred):
915
- """Calcuate AUC-ROC, Precision-Recall, and Average Precision (AP).
916
-
917
- Args:
918
- X_true (np.ndarray): True values.
919
-
920
- X_pred (np.ndarray): Imputed values.
921
-
922
- Returns:
923
- List[float]: List of AUC-ROC scores in order of: 0,1,2.
924
- List[float]: List of precision scores in order of: 0,1,2.
925
- List[float]: List of recall scores in order of: 0,1,2.
926
- List[float]: List of average precision scores in order of 0,1,2.
927
-
928
- """
929
- y_true = X_true[self.sim_missing_mask_]
930
- y_pred = X_pred[self.sim_missing_mask_]
931
-
932
- # Binarize the output
933
- y_true_bin = label_binarize(y_true, classes=[0, 1, 2])
934
- y_pred_bin = label_binarize(y_pred, classes=[0, 1, 2])
935
-
936
- # Initialize lists to hold the scores for each class
937
- auc_roc_scores = []
938
- precision_scores = []
939
- recall_scores = []
940
- avg_precision_scores = []
941
-
942
- for i in range(y_true_bin.shape[1]):
943
- # AUC-ROC score
944
- auc_roc = roc_auc_score(
945
- y_true_bin[:, i], y_pred_bin[:, i], average="weighted"
946
- )
947
- auc_roc_scores.append(auc_roc)
948
-
949
- # Precision-recall score
950
- precision, recall, _, _ = precision_recall_fscore_support(
951
- y_true_bin[:, i], y_pred_bin[:, i], average="weighted"
952
- )
953
- precision_scores.append(precision)
954
- recall_scores.append(recall)
955
-
956
- # Average precision score
957
- avg_precision = average_precision_score(
958
- y_true_bin[:, i], y_pred_bin[:, i], average="weighted"
959
- )
960
- avg_precision_scores.append(avg_precision)
71
+ all_mask = original_mask | sim_mask
72
+ Xt = X.copy()
73
+ Xt[all_mask] = self.missing_val
961
74
 
962
- return (
963
- auc_roc_scores,
964
- precision_scores,
965
- recall_scores,
966
- avg_precision_scores,
967
- )
75
+ masks = {"original": original_mask, "simulated": sim_mask, "all": all_mask}
76
+ return Xt, masks
968
77
 
969
- def random_weighted_missing_data(self, X, inv=False):
970
- """Choose values for which to simulate missing data by biasing towards the minority or majority alleles, depending on whether inv is True or False.
78
+ # ---- strategies ----
79
+ def _simulate_missing_mask(
80
+ self, X: np.ndarray, original_mask: np.ndarray
81
+ ) -> np.ndarray:
82
+ """Simulate missingness mask based on the chosen strategy.
971
83
 
972
84
  Args:
973
- X (np.ndarray): True values.
974
-
975
- inv (bool, optional): If True, then biases towards choosing majority alleles. If False, then generates a stratified random sample (class proportions ~= full dataset) Defaults to False.
85
+ X (np.ndarray): Input genotype matrix.
86
+ original_mask (np.ndarray): Boolean mask of original missing values.
976
87
 
977
88
  Returns:
978
- np.ndarray: X with simulated missing values.
979
-
89
+ np.ndarray: Simulated missing mask.
980
90
  """
981
- # Get unique classes and their counts
982
- classes, counts = np.unique(X, return_counts=True)
983
- # Compute class weights
984
- if inv:
985
- class_weights = 1 / counts
986
- else:
987
- class_weights = counts
988
- # Normalize class weights
989
- class_weights = class_weights / sum(class_weights)
990
-
991
- # Compute mask
992
- if self.mask_missing:
993
- # Get indexes where non-missing (Xobs) and missing (Xmiss)
994
- Xobs = np.where(~self.original_missing_mask_.ravel())[0]
995
- Xmiss = np.where(self.original_missing_mask_.ravel())[0]
91
+ if self.strategy == "random":
92
+ return self._simulate_random(original_mask)
93
+ elif self.strategy == "random_inv_genotype":
94
+ return self._simulate_inv_genotype(X, original_mask)
996
95
 
997
- # Generate mask of 0's (non-missing) and 1's (missing)
998
- obs_mask = np.random.choice(
999
- classes, size=Xobs.size, p=class_weights
1000
- )
1001
- obs_mask = (obs_mask == classes[:, None]).argmax(axis=0)
96
+ msg = "strategy must be one of {'random','random_inv_genotype'}"
97
+ self.logger.error(msg)
98
+ raise ValueError(msg)
1002
99
 
1003
- # Make missing data mask
1004
- mask = np.zeros(X.size, dtype=bool)
1005
- mask[Xobs] = obs_mask
1006
- mask[Xmiss] = 1
100
+ def _simulate_random(self, original_mask: np.ndarray) -> np.ndarray:
101
+ rows, cols = np.where(~original_mask)
102
+ n_known = len(rows)
103
+ mask = np.zeros_like(original_mask, dtype=bool)
1007
104
 
1008
- # Reshape from raveled to 2D
1009
- mask = mask.reshape(X.shape)
1010
- else:
1011
- # Generate mask of 0's (non-missing) and 1's (missing)
1012
- mask = np.random.choice(classes, size=X.size, p=class_weights)
1013
- mask = (mask == classes[:, None]).argmax(axis=0).reshape(X.shape)
105
+ if n_known == 0:
106
+ return mask
1014
107
 
1015
- # Assign mask to self before validation
1016
- self.mask_ = mask
108
+ n_to_mask = int(np.floor(self.prop_missing * n_known))
1017
109
 
1018
- self._validate_mask()
110
+ if n_to_mask <= 0:
111
+ return mask
1019
112
 
113
+ idx = self.rng.choice(n_known, size=n_to_mask, replace=False)
114
+ mask[rows[idx], cols[idx]] = True
1020
115
  return mask
1021
116
 
1022
- def _sample_tree(
1023
- self,
1024
- internal_only=False,
1025
- tips_only=False,
1026
- skip_root=True,
1027
- weighted=False,
1028
- ):
1029
- """Function for randomly sampling clades from SimGenotypeData.tree.
117
+ def _simulate_inv_genotype(
118
+ self, X: np.ndarray, original_mask: np.ndarray
119
+ ) -> np.ndarray:
120
+ """Simulate missingness mask inversely proportional to genotype frequencies.
1030
121
 
1031
122
  Args:
1032
- internal_only (bool): Only sample from NON-TIPS. Defaults to False.
1033
-
1034
- tips_only (bool): Only sample from tips. Defaults to False.
1035
-
1036
- skip_root (bool): Exclude sampling of root node. Defaults to True.
1037
-
1038
- weighted (bool): Weight sampling by branch length. Defaults to False.
123
+ X (np.ndarray): Input genotype matrix.
124
+ original_mask (np.ndarray): Boolean mask of original missing values.
1039
125
 
1040
126
  Returns:
1041
- List[str]: List of descendant tips from the sampled node.
1042
-
1043
- Raises:
1044
- ValueError: ``tips_only`` and ``internal_only`` cannot both be True.
127
+ np.ndarray: Simulated missing mask. 0..3: homozygous (0,1,2,3). 4..9: heterozygous (0/1,0/2,0/3,1/2,1/3,2/3).
1045
128
  """
1046
129
 
1047
- if tips_only and internal_only:
1048
- raise ValueError("internal_only and tips_only cannot both be true")
130
+ rows, cols = np.where(~original_mask)
131
+ n_known = len(rows)
132
+ mask = np.zeros_like(original_mask, dtype=bool)
133
+ if n_known == 0:
134
+ return mask
1049
135
 
1050
- # to only sample internal nodes add if not i.is_leaf()
1051
- node_dict = dict()
136
+ # Global genotype frequencies (0..9) from all known
137
+ vals = X[~original_mask].astype(int)
138
+ vals = vals[(vals >= 0) & (vals < 10)]
139
+ if vals.size == 0:
140
+ return self._simulate_random(original_mask)
1052
141
 
1053
- for node in self.genotype_data.tree.treenode.traverse("preorder"):
1054
- ## node.idx is node indexes.
1055
- ## node.dist is branch lengths.
1056
- if skip_root:
1057
- # If root node.
1058
- if node.idx == self.genotype_data.tree.nnodes - 1:
1059
- continue
142
+ cnt = np.bincount(vals, minlength=10).astype(float)
143
+ freqs = cnt / (cnt.sum() + 1e-12)
1060
144
 
1061
- if tips_only and internal_only:
1062
- raise ValueError(
1063
- "tips_only and internal_only cannot both be True"
1064
- )
145
+ # Candidate weights
146
+ geno_known = X[rows, cols].astype(int) # (n_known,)
147
+ inv = 1.0 / (freqs[geno_known] + 1e-12)
1065
148
 
1066
- if tips_only:
1067
- if not node.is_leaf():
1068
- continue
1069
- elif internal_only:
1070
- if node.is_leaf():
1071
- continue
1072
- node_dict[node.idx] = node.dist
1073
- if weighted:
1074
- s = sum(list(node_dict.values()))
1075
- # Node index / sum of node distances.
1076
- p = [i / s for i in list(node_dict.values())]
1077
- node_idx = np.random.choice(list(node_dict.keys()), size=1, p=p)[0]
1078
- else:
1079
- # Get missing choice from random clade.
1080
- node_idx = np.random.choice(list(node_dict.keys()), size=1)[0]
1081
- return self.genotype_data.tree.get_tip_labels(idx=node_idx)
149
+ # Optional het boost (heterozygous codes are 4..9)
150
+ if self.het_boost != 1.0:
151
+ is_het = (geno_known >= 4) & (geno_known <= 9)
152
+ inv = inv * np.where(is_het, self.het_boost, 1.0)
1082
153
 
1083
- def _validate_mask(self, mask=False):
1084
- """Make sure no entirely missing columns are simulated."""
1085
- if mask is None:
1086
- mask = self.mask_
1087
- for i, column in enumerate(self.mask_.T):
1088
- if mask:
1089
- miss_mask = self.original_missing_mask_[:, i]
1090
- col = column[~miss_mask]
1091
- obs_idx = np.where(~miss_mask)
1092
- idx = obs_idx[np.random.choice(np.arange(len(obs_idx)))]
1093
- else:
1094
- col = column
1095
- idx = np.random.choice(np.arange(col.shape[0]))
1096
- if np.sum(col) == col.size:
1097
- self.mask_[idx, i] = False
154
+ n_to_mask = int(np.floor(self.prop_missing * n_known))
155
+ if n_to_mask <= 0:
156
+ return mask
1098
157
 
1099
- def _mask_snps(self, X):
1100
- """Mask positions in SimGenotypeData.snps and SimGenotypeData.onehot"""
1101
- if len(X.shape) == 3:
1102
- # One-hot encoded.
1103
- mask_val = [0.0, 0.0, 0.0, 0.0]
1104
- elif len(X.shape) == 2:
1105
- # 012-encoded.
1106
- mask_val = -9
1107
- else:
1108
- raise ValueError(f"Invalid shape of input X: {X.shape}")
1109
-
1110
- Xt = X.copy()
1111
- mask_boolean = self.mask_ != 0
1112
- Xt[mask_boolean] = mask_val
1113
- return Xt
1114
-
1115
- def write_mask(self, filename_prefix):
1116
- """Write mask to file.
1117
-
1118
- Args:
1119
- filename_prefix (str): Prefix for the filenames to write to.
1120
- """
1121
- np.save(filename_prefix + "_mask.npy", self.mask_)
1122
- np.save(filename_prefix + "_original_missing_mask.npy", self.original_missing_mask_)
158
+ probs = inv / (inv.sum() + 1e-12)
159
+ idx = self.rng.choice(n_known, size=n_to_mask, replace=False, p=probs)
160
+ mask[rows[idx], cols[idx]] = True
161
+ return mask
1123
162
 
1124
- def read_mask(self, filename_prefix):
1125
- """Read mask from file.
163
+ def _validate_mask(self, mask: np.ndarray) -> np.ndarray:
164
+ """Avoid fully-masked rows/columns.
1126
165
 
1127
166
  Args:
1128
- filename_prefix (str): Prefix for the filenames to read from.
1129
-
1130
- Returns:
1131
- tuple of np.ndarray: The read masks.
1132
- """
1133
- # Check if files exist
1134
- if not os.path.isfile(filename_prefix + "_mask.npy"):
1135
- raise FileNotFoundError(filename_prefix + "_mask.npy" + " does not exist.")
1136
- if not os.path.isfile(filename_prefix + "_original_missing_mask.npy"):
1137
- raise FileNotFoundError(filename_prefix + "_original_missing_mask.npy" + " does not exist.")
1138
-
1139
- # Load mask from file
1140
- self.mask_ = np.load(filename_prefix + "_mask.npy")
1141
- self.original_missing_mask_ = np.load(filename_prefix + "_original_missing_mask.npy")
1142
-
1143
- # Recalculate all_missing_mask_ from mask_ and original_missing_mask_
1144
- self.all_missing_mask_ = np.logical_or(self.mask_, self.original_missing_mask_)
1145
-
1146
- return self.mask_, self.original_missing_mask_, self.all_missing_mask_
1147
-
1148
-
1149
- @property
1150
- def missing_count(self) -> int:
1151
- """Count of masked genotypes in SimGenotypeData.mask
167
+ mask (np.ndarray): Input boolean mask.
1152
168
 
1153
169
  Returns:
1154
- int: Integer count of masked alleles.
170
+ np.ndarray: Validated mask.
1155
171
  """
1156
- return np.sum(self.mask_)
1157
-
1158
- @property
1159
- def prop_missing_real(self) -> float:
1160
- """Proportion of genotypes masked in SimGenotypeData.mask
1161
-
1162
- Returns:
1163
- float: Total number of masked alleles divided by SNP matrix size.
1164
- """
1165
- return np.sum(self.mask_) / self.mask_.size
172
+ rng = self.rng
173
+ # columns
174
+ full_cols = np.where(mask.all(axis=0))[0]
175
+ for c in full_cols:
176
+ r = int(rng.integers(0, mask.shape[0]))
177
+ mask[r, c] = False
178
+ # rows
179
+ full_rows = np.where(mask.all(axis=1))[0]
180
+ for r in full_rows:
181
+ c = int(rng.integers(0, mask.shape[1]))
182
+ mask[r, c] = False
183
+ return mask