pg-sui 1.0.2.1__py3-none-any.whl → 1.6.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pg-sui might be problematic. Click here for more details.
- {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info}/METADATA +51 -70
- pg_sui-1.6.8.dist-info/RECORD +78 -0
- {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info}/WHEEL +1 -1
- pg_sui-1.6.8.dist-info/entry_points.txt +4 -0
- pg_sui-1.6.8.dist-info/top_level.txt +1 -0
- pgsui/__init__.py +35 -54
- pgsui/_version.py +34 -0
- pgsui/cli.py +635 -0
- pgsui/data_processing/config.py +576 -0
- pgsui/data_processing/containers.py +1782 -0
- pgsui/data_processing/transformers.py +121 -1103
- pgsui/electron/app/__main__.py +5 -0
- pgsui/electron/app/icons/icons/1024x1024.png +0 -0
- pgsui/electron/app/icons/icons/128x128.png +0 -0
- pgsui/electron/app/icons/icons/16x16.png +0 -0
- pgsui/electron/app/icons/icons/24x24.png +0 -0
- pgsui/electron/app/icons/icons/256x256.png +0 -0
- pgsui/electron/app/icons/icons/32x32.png +0 -0
- pgsui/electron/app/icons/icons/48x48.png +0 -0
- pgsui/electron/app/icons/icons/512x512.png +0 -0
- pgsui/electron/app/icons/icons/64x64.png +0 -0
- pgsui/electron/app/icons/icons/icon.icns +0 -0
- pgsui/electron/app/icons/icons/icon.ico +0 -0
- pgsui/electron/app/main.js +189 -0
- pgsui/electron/app/package-lock.json +6893 -0
- pgsui/electron/app/package.json +50 -0
- pgsui/electron/app/preload.js +15 -0
- pgsui/electron/app/server.py +146 -0
- pgsui/electron/app/ui/logo.png +0 -0
- pgsui/electron/app/ui/renderer.js +130 -0
- pgsui/electron/app/ui/styles.css +59 -0
- pgsui/electron/app/ui/ui_shim.js +72 -0
- pgsui/electron/bootstrap.py +43 -0
- pgsui/electron/launch.py +59 -0
- pgsui/electron/package.json +14 -0
- pgsui/example_data/popmaps/{test.popmap → phylogen_nomx.popmap} +185 -99
- pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz +0 -0
- pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz.tbi +0 -0
- pgsui/impute/deterministic/imputers/allele_freq.py +691 -0
- pgsui/impute/deterministic/imputers/mode.py +679 -0
- pgsui/impute/deterministic/imputers/nmf.py +221 -0
- pgsui/impute/deterministic/imputers/phylo.py +971 -0
- pgsui/impute/deterministic/imputers/ref_allele.py +530 -0
- pgsui/impute/supervised/base.py +339 -0
- pgsui/impute/supervised/imputers/hist_gradient_boosting.py +293 -0
- pgsui/impute/supervised/imputers/random_forest.py +287 -0
- pgsui/impute/unsupervised/base.py +924 -0
- pgsui/impute/unsupervised/callbacks.py +89 -263
- pgsui/impute/unsupervised/imputers/autoencoder.py +972 -0
- pgsui/impute/unsupervised/imputers/nlpca.py +1264 -0
- pgsui/impute/unsupervised/imputers/ubp.py +1288 -0
- pgsui/impute/unsupervised/imputers/vae.py +957 -0
- pgsui/impute/unsupervised/loss_functions.py +158 -0
- pgsui/impute/unsupervised/models/autoencoder_model.py +208 -558
- pgsui/impute/unsupervised/models/nlpca_model.py +149 -468
- pgsui/impute/unsupervised/models/ubp_model.py +198 -1317
- pgsui/impute/unsupervised/models/vae_model.py +259 -618
- pgsui/impute/unsupervised/nn_scorers.py +215 -0
- pgsui/utils/classification_viz.py +591 -0
- pgsui/utils/misc.py +35 -480
- pgsui/utils/plotting.py +514 -824
- pgsui/utils/scorers.py +212 -438
- pg_sui-1.0.2.1.dist-info/RECORD +0 -75
- pg_sui-1.0.2.1.dist-info/top_level.txt +0 -3
- pgsui/example_data/phylip_files/test_n10.phy +0 -118
- pgsui/example_data/phylip_files/test_n100.phy +0 -118
- pgsui/example_data/phylip_files/test_n2.phy +0 -118
- pgsui/example_data/phylip_files/test_n500.phy +0 -118
- pgsui/example_data/structure_files/test.nopops.1row.10sites.str +0 -117
- pgsui/example_data/structure_files/test.nopops.2row.100sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.10sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.30sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.allsites.str +0 -234
- pgsui/example_data/structure_files/test.pops.1row.10sites.str +0 -117
- pgsui/example_data/structure_files/test.pops.2row.10sites.str +0 -234
- pgsui/example_data/trees/test.iqtree +0 -376
- pgsui/example_data/trees/test.qmat +0 -5
- pgsui/example_data/trees/test.rate +0 -2033
- pgsui/example_data/trees/test.tre +0 -1
- pgsui/example_data/trees/test_n10.rate +0 -19
- pgsui/example_data/trees/test_n100.rate +0 -109
- pgsui/example_data/trees/test_n500.rate +0 -509
- pgsui/example_data/trees/test_siterates.txt +0 -2024
- pgsui/example_data/trees/test_siterates_n10.txt +0 -10
- pgsui/example_data/trees/test_siterates_n100.txt +0 -100
- pgsui/example_data/trees/test_siterates_n500.txt +0 -500
- pgsui/example_data/vcf_files/test.vcf +0 -244
- pgsui/example_data/vcf_files/test.vcf.gz +0 -0
- pgsui/example_data/vcf_files/test.vcf.gz.tbi +0 -0
- pgsui/impute/estimators.py +0 -735
- pgsui/impute/impute.py +0 -1486
- pgsui/impute/simple_imputers.py +0 -1439
- pgsui/impute/supervised/iterative_imputer_fixedparams.py +0 -785
- pgsui/impute/supervised/iterative_imputer_gridsearch.py +0 -1027
- pgsui/impute/unsupervised/keras_classifiers.py +0 -702
- pgsui/impute/unsupervised/models/in_development/cnn_model.py +0 -486
- pgsui/impute/unsupervised/neural_network_imputers.py +0 -1424
- pgsui/impute/unsupervised/neural_network_methods.py +0 -1549
- pgsui/pg_sui.py +0 -261
- pgsui/utils/sequence_tools.py +0 -407
- simulation/sim_benchmarks.py +0 -333
- simulation/sim_treeparams.py +0 -475
- test/__init__.py +0 -0
- test/pg_sui_simtest.py +0 -215
- test/pg_sui_testing.py +0 -523
- test/test.py +0 -297
- test/test_pgsui.py +0 -374
- test/test_tkc.py +0 -214
- {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info/licenses}/LICENSE +0 -0
- /pgsui/{example_data/trees → electron/app}/__init__.py +0 -0
- /pgsui/impute/{unsupervised/models/in_development → supervised/imputers}/__init__.py +0 -0
- {simulation → pgsui/impute/unsupervised/imputers}/__init__.py +0 -0
|
@@ -1,1165 +1,183 @@
|
|
|
1
|
-
|
|
2
|
-
import os
|
|
1
|
+
# Standard library imports
|
|
3
2
|
import logging
|
|
4
|
-
import
|
|
5
|
-
import warnings
|
|
6
|
-
|
|
7
|
-
import numpy as np
|
|
8
|
-
import pandas as pd
|
|
3
|
+
from typing import Literal
|
|
9
4
|
|
|
10
5
|
# Third-party imports
|
|
11
6
|
import numpy as np
|
|
12
|
-
import pandas as pd
|
|
13
|
-
|
|
14
|
-
from sklearn.base import BaseEstimator, TransformerMixin
|
|
15
|
-
from sklearn.impute import SimpleImputer
|
|
16
|
-
from sklearn.metrics import (
|
|
17
|
-
roc_auc_score,
|
|
18
|
-
precision_recall_fscore_support,
|
|
19
|
-
average_precision_score,
|
|
20
|
-
)
|
|
21
|
-
from sklearn.preprocessing import label_binarize
|
|
22
|
-
|
|
23
|
-
# Import tensorflow with reduced warnings.
|
|
24
|
-
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
|
|
25
|
-
logging.getLogger("tensorflow").disabled = True
|
|
26
|
-
warnings.filterwarnings("ignore", category=UserWarning)
|
|
27
|
-
|
|
28
|
-
# noinspection PyPackageRequirements
|
|
29
|
-
import tensorflow as tf
|
|
30
|
-
|
|
31
|
-
# Suppress "can't find CUDA .dll" errors. This also turns off GPU support.
|
|
32
|
-
tf.config.set_visible_devices([], "GPU")
|
|
33
|
-
|
|
34
|
-
from tensorflow.python.util import deprecation
|
|
35
|
-
|
|
36
|
-
# Disable warnings and info logs.
|
|
37
|
-
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
|
|
38
|
-
tf.get_logger().setLevel(logging.ERROR)
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
# Monkey patching deprecation utils to suppress warnings.
|
|
42
|
-
# noinspection PyUnusedLocal
|
|
43
|
-
def deprecated(
|
|
44
|
-
date, instructions, warn_once=True
|
|
45
|
-
): # pylint: disable=unused-argument
|
|
46
|
-
def deprecated_wrapper(func):
|
|
47
|
-
return func
|
|
48
|
-
|
|
49
|
-
return deprecated_wrapper
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
deprecation.deprecated = deprecated
|
|
53
|
-
|
|
54
|
-
# Custom Modules
|
|
55
|
-
try:
|
|
56
|
-
from ..utils import misc
|
|
57
|
-
|
|
58
|
-
except (ModuleNotFoundError, ValueError, ImportError):
|
|
59
|
-
from pgsui.utils import misc
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
# Pandas on pip gives a performance warning when doing the below code.
|
|
63
|
-
# Apparently it's a bug that exists in the pandas version I used here.
|
|
64
|
-
# It can be safely ignored.
|
|
65
|
-
warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
def encode_onehot(X):
|
|
69
|
-
"""Convert 012-encoded data to one-hot encodings.
|
|
70
|
-
Args:
|
|
71
|
-
X (numpy.ndarray): Input array with 012-encoded data and -9 as the missing data value.
|
|
72
|
-
Returns:
|
|
73
|
-
numpy.ndarray: One-hot encoded data of shape (n_samples, n_features, 3), with missing values encoded as np.nan.
|
|
74
|
-
"""
|
|
75
|
-
Xt = np.zeros(shape=(X.shape[0], X.shape[1], 3))
|
|
76
|
-
mappings = {
|
|
77
|
-
0: np.array([1, 0, 0]),
|
|
78
|
-
1: np.array([0, 1, 0]),
|
|
79
|
-
2: np.array([0, 0, 1]),
|
|
80
|
-
-9: np.array([np.nan, np.nan, np.nan]),
|
|
81
|
-
}
|
|
82
|
-
for row in np.arange(X.shape[0]):
|
|
83
|
-
Xt[row] = [mappings[enc] for enc in X[row]]
|
|
84
|
-
return Xt
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
def mle(row):
|
|
88
|
-
"""Get the Maximum Likelihood Estimation for the best prediction. Basically, it sets the index of the maxiumum value in a vector (row) to 1.0, since it is one-hot encoded.
|
|
89
|
-
|
|
90
|
-
Args:
|
|
91
|
-
row (numpy.ndarray(float)): Row vector with predicted values as floating points.
|
|
92
|
-
|
|
93
|
-
Returns:
|
|
94
|
-
numpy.ndarray(float): Row vector with the highest prediction set to 1.0 and the others set to 0.0.
|
|
95
|
-
"""
|
|
96
|
-
res = np.zeros(row.shape[0])
|
|
97
|
-
res[np.argmax(row)] = 1
|
|
98
|
-
return res
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
class UBPInputTransformer(BaseEstimator, TransformerMixin):
|
|
102
|
-
"""Transform input X prior to estimator fitting.
|
|
103
|
-
|
|
104
|
-
Args:
|
|
105
|
-
n_components (int): Number of principal components currently being used in V.
|
|
106
|
-
|
|
107
|
-
V (numpy.ndarray or Dict[str, Any]): If doing grid search, should be a dictionary with current_component: numpy.ndarray. If not doing grid search, then it should be a numpy.ndarray.
|
|
108
|
-
"""
|
|
109
|
-
|
|
110
|
-
def __init__(self, n_components, V):
|
|
111
|
-
self.n_components = n_components
|
|
112
|
-
self.V = V
|
|
113
|
-
|
|
114
|
-
def fit(self, X):
|
|
115
|
-
"""Fit transformer to input data X.
|
|
116
|
-
|
|
117
|
-
Args:
|
|
118
|
-
X (numpy.ndarray): Input data to fit. If numpy.ndarray, then should be of shape (n_samples, n_components). If dictionary, then should be component: numpy.ndarray.
|
|
119
|
-
|
|
120
|
-
Returns:
|
|
121
|
-
self: Class instance.
|
|
122
|
-
"""
|
|
123
|
-
self.n_features_in_ = self.n_components
|
|
124
|
-
return self
|
|
125
|
-
|
|
126
|
-
def transform(self, X):
|
|
127
|
-
"""Transform input data X to the needed format.
|
|
128
|
-
|
|
129
|
-
Args:
|
|
130
|
-
X (numpy.ndarray): Input data to fit. If numpy.ndarray, then should be of shape (n_samples, n_components). If dictionary, then should be component: numpy.ndarray.
|
|
131
|
-
|
|
132
|
-
Returns:
|
|
133
|
-
numpy.ndarray: Formatted input data with correct component.
|
|
134
|
-
|
|
135
|
-
Raises:
|
|
136
|
-
TypeError: V must be a dictionary if phase is None or phase == 1.
|
|
137
|
-
TypeError: V must be a numpy array if phase is 2 or 3.
|
|
138
|
-
"""
|
|
139
|
-
if not isinstance(self.V, dict):
|
|
140
|
-
raise TypeError(f"V must be a dictionary, but got {type(self.V)}")
|
|
141
|
-
return self.V[self.n_components]
|
|
142
7
|
|
|
143
8
|
|
|
144
|
-
class
|
|
145
|
-
"""
|
|
9
|
+
class SimGenotypeDataTransformer:
|
|
10
|
+
"""Simulates missing genotypes at the locus level on a 2D integer matrix.
|
|
146
11
|
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
Missing and observed boolean masks are also generated.
|
|
12
|
+
This transformer masks a proportion of known genotypes in the input matrix X, setting them to a specified missing value. The masking can be done randomly or based on inverse genotype frequencies, with an option to boost the likelihood of masking heterozygous genotypes.
|
|
150
13
|
|
|
151
14
|
Args:
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
def __init__(self, num_classes=3, return_int=False, activate=None):
|
|
160
|
-
self.num_classes = num_classes
|
|
161
|
-
self.return_int = return_int
|
|
162
|
-
self.activate = activate
|
|
163
|
-
|
|
164
|
-
def fit(self, X, y=None):
|
|
165
|
-
"""set attributes used to transform X (input features).
|
|
166
|
-
|
|
167
|
-
Args:
|
|
168
|
-
X (numpy.ndarray): Input integer-encoded numpy array.
|
|
169
|
-
|
|
170
|
-
y (None): Just for compatibility with sklearn API.
|
|
171
|
-
"""
|
|
172
|
-
X = misc.validate_input_type(X, return_type="array")
|
|
173
|
-
|
|
174
|
-
self.X_decoded = X
|
|
175
|
-
|
|
176
|
-
# VAE uses 4 classes ([A,T,G,C]), SAE uses 3 ([0,1,2]).
|
|
177
|
-
if self.num_classes == 3:
|
|
178
|
-
enc_func = self.encode_012
|
|
179
|
-
elif self.num_classes == 4:
|
|
180
|
-
enc_func = self.encode_multilab
|
|
181
|
-
elif self.num_classes == 10:
|
|
182
|
-
enc_func = self.encode_multiclass
|
|
183
|
-
else:
|
|
184
|
-
raise ValueError(
|
|
185
|
-
f"Invalid value passed to num_classes in "
|
|
186
|
-
f"AutoEncoderFeatureTransformer. Only 3 or 4 are supported, "
|
|
187
|
-
f"but got {self.num_classes}."
|
|
188
|
-
)
|
|
189
|
-
|
|
190
|
-
# Encode the data.
|
|
191
|
-
self.X_train = enc_func(X)
|
|
192
|
-
self.classes_ = np.arange(self.num_classes)
|
|
193
|
-
self.n_classes_ = self.num_classes
|
|
194
|
-
|
|
195
|
-
# Get missing and observed data boolean masks.
|
|
196
|
-
self.missing_mask_, self.observed_mask_ = self._get_masks(self.X_train)
|
|
197
|
-
|
|
198
|
-
# To accommodate multiclass-multioutput.
|
|
199
|
-
self.n_outputs_expected_ = 1
|
|
200
|
-
|
|
201
|
-
self.n_outputs_ = self.X_train.shape[1]
|
|
202
|
-
|
|
203
|
-
return self
|
|
204
|
-
|
|
205
|
-
def transform(self, X):
|
|
206
|
-
"""Transform X to one-hot encoded format.
|
|
207
|
-
|
|
208
|
-
Accommodates multiclass targets with a 3D shape.
|
|
209
|
-
|
|
210
|
-
Args:
|
|
211
|
-
X (numpy.ndarray): One-hot encoded target data of shape (n_samples, n_features, num_classes).
|
|
212
|
-
|
|
213
|
-
Returns:
|
|
214
|
-
numpy.ndarray: Transformed target data in one-hot format of shape (n_samples, n_features, num_classes).
|
|
215
|
-
"""
|
|
216
|
-
if self.return_int:
|
|
217
|
-
return X
|
|
218
|
-
else:
|
|
219
|
-
# X = misc.validate_input_type(X, return_type="array")
|
|
220
|
-
return self._fill(self.X_train, self.missing_mask_)
|
|
221
|
-
|
|
222
|
-
def inverse_transform(self, y, return_proba=False):
|
|
223
|
-
"""Transform target to output format.
|
|
224
|
-
|
|
225
|
-
Args:
|
|
226
|
-
y (numpy.ndarray): Array to inverse transform.
|
|
227
|
-
|
|
228
|
-
return_proba (bool): Just for compatibility with scikeras API.
|
|
229
|
-
"""
|
|
230
|
-
try:
|
|
231
|
-
if self.activate is None:
|
|
232
|
-
y = y.numpy()
|
|
233
|
-
elif self.activate == "softmax":
|
|
234
|
-
y = tf.nn.softmax(y).numpy()
|
|
235
|
-
elif self.activate == "sigmoid":
|
|
236
|
-
y = tf.nn.sigmoid(y).numpy()
|
|
237
|
-
else:
|
|
238
|
-
raise ValueError(
|
|
239
|
-
f"Invalid value passed to keyword argument activate. Valid "
|
|
240
|
-
f"options include: None, 'softmax', or 'sigmoid', but got "
|
|
241
|
-
f"{self.activate}"
|
|
242
|
-
)
|
|
243
|
-
except AttributeError:
|
|
244
|
-
# If numpy array already.
|
|
245
|
-
if self.activate is None:
|
|
246
|
-
y = y.copy()
|
|
247
|
-
elif self.activate == "softmax":
|
|
248
|
-
y = tf.nn.softmax(tf.convert_to_tensor(y)).numpy()
|
|
249
|
-
elif self.activate == "sigmoid":
|
|
250
|
-
y = tf.nn.sigmoid(tf.convert_to_tensor(y)).numpy()
|
|
251
|
-
else:
|
|
252
|
-
raise ValueError(
|
|
253
|
-
f"Invalid value passed to keyword argument activate. Valid "
|
|
254
|
-
f"options include: None, 'softmax', or 'sigmoid', but got "
|
|
255
|
-
f"{self.activate}"
|
|
256
|
-
)
|
|
257
|
-
return y
|
|
258
|
-
|
|
259
|
-
def encode_012(self, X):
|
|
260
|
-
"""Convert 012-encoded data to one-hot encodings.
|
|
261
|
-
Args:
|
|
262
|
-
X (numpy.ndarray): Input array with 012-encoded data and -9 as the missing data value.
|
|
263
|
-
Returns:
|
|
264
|
-
numpy.ndarray: One-hot encoded data of shape (n_samples, n_features, 3), with missing values encoded as np.nan.
|
|
265
|
-
"""
|
|
266
|
-
Xt = np.zeros(shape=(X.shape[0], X.shape[1], 3))
|
|
267
|
-
mappings = {
|
|
268
|
-
0: np.array([1, 0, 0]),
|
|
269
|
-
1: np.array([0, 1, 0]),
|
|
270
|
-
2: np.array([0, 0, 1]),
|
|
271
|
-
-9: np.array([np.nan, np.nan, np.nan]),
|
|
272
|
-
}
|
|
273
|
-
for row in np.arange(X.shape[0]):
|
|
274
|
-
Xt[row] = [mappings[enc] for enc in X[row]]
|
|
275
|
-
return Xt
|
|
276
|
-
|
|
277
|
-
def encode_multilab(self, X, multilab_value=1.0):
|
|
278
|
-
"""Encode 0-9 integer data in multi-label one-hot format.
|
|
279
|
-
Args:
|
|
280
|
-
X (numpy.ndarray): Input array with 012-encoded data and -9 as the missing data value.
|
|
281
|
-
|
|
282
|
-
multilab_value (float): Value to use for multilabel target encodings. Defaults to 1.0.
|
|
283
|
-
Returns:
|
|
284
|
-
numpy.ndarray: One-hot encoded data of shape (n_samples, n_features, 4), with missing values encoded as np.nan. Multi-label categories are encoded as ``multilab_value`` in each of their two positions; single-label categories are encoded as 1.0.
|
|
285
|
-
"""
|
|
286
|
-
Xt = np.zeros(shape=(X.shape[0], X.shape[1], 4))
|
|
287
|
-
mappings = {
|
|
288
|
-
0: [1.0, 0.0, 0.0, 0.0],
|
|
289
|
-
1: [0.0, 1.0, 0.0, 0.0],
|
|
290
|
-
2: [0.0, 0.0, 1.0, 0.0],
|
|
291
|
-
3: [0.0, 0.0, 0.0, 1.0],
|
|
292
|
-
4: [multilab_value, multilab_value, 0.0, 0.0],
|
|
293
|
-
5: [multilab_value, 0.0, multilab_value, 0.0],
|
|
294
|
-
6: [multilab_value, 0.0, 0.0, multilab_value],
|
|
295
|
-
7: [0.0, multilab_value, multilab_value, 0.0],
|
|
296
|
-
8: [0.0, multilab_value, 0.0, multilab_value],
|
|
297
|
-
9: [0.0, 0.0, multilab_value, multilab_value],
|
|
298
|
-
-9: [np.nan, np.nan, np.nan, np.nan],
|
|
299
|
-
}
|
|
300
|
-
for row in np.arange(X.shape[0]):
|
|
301
|
-
Xt[row] = [mappings[enc] for enc in X[row]]
|
|
302
|
-
return Xt
|
|
303
|
-
|
|
304
|
-
def decode_multilab(self, X, multilab_value=1.0):
|
|
305
|
-
"""Decode one-hot format data back to 0-9 integer data.
|
|
306
|
-
|
|
307
|
-
Args:
|
|
308
|
-
X (numpy.ndarray): Input array with one-hot-encoded data.
|
|
309
|
-
|
|
310
|
-
multilab_value (float): Value to use for multilabel target encodings. Defaults to 1.0.
|
|
311
|
-
|
|
312
|
-
Returns:
|
|
313
|
-
numpy.ndarray: Decoded data of shape (n_samples, n_features), with multi-label categories decoded to their original integer representation.
|
|
314
|
-
"""
|
|
315
|
-
Xt = np.zeros(shape=(X.shape[0], X.shape[1]))
|
|
316
|
-
mappings = {
|
|
317
|
-
tuple([1.0, 0.0, 0.0, 0.0]): 0,
|
|
318
|
-
tuple([0.0, 1.0, 0.0, 0.0]): 1,
|
|
319
|
-
tuple([0.0, 0.0, 1.0, 0.0]): 2,
|
|
320
|
-
tuple([0.0, 0.0, 0.0, 1.0]): 3,
|
|
321
|
-
tuple([multilab_value, multilab_value, 0.0, 0.0]): 4,
|
|
322
|
-
tuple([multilab_value, 0.0, multilab_value, 0.0]): 5,
|
|
323
|
-
tuple([multilab_value, 0.0, 0.0, multilab_value]): 6,
|
|
324
|
-
tuple([0.0, multilab_value, multilab_value, 0.0]): 7,
|
|
325
|
-
tuple([0.0, multilab_value, 0.0, multilab_value]): 8,
|
|
326
|
-
tuple([0.0, 0.0, multilab_value, multilab_value]): 9,
|
|
327
|
-
tuple([np.nan, np.nan, np.nan, np.nan]): -9,
|
|
328
|
-
}
|
|
329
|
-
for row in np.arange(X.shape[0]):
|
|
330
|
-
Xt[row] = [mappings[tuple(enc)] for enc in X[row]]
|
|
331
|
-
return Xt
|
|
332
|
-
|
|
333
|
-
def encode_multiclass(self, X, num_classes=10, missing_value=-9):
|
|
334
|
-
"""Encode 0-9 integer data in multi-class one-hot format.
|
|
335
|
-
|
|
336
|
-
Missing values get encoded as ``[np.nan] * num_classes``
|
|
337
|
-
Args:
|
|
338
|
-
X (numpy.ndarray): Input array with 012-encoded data and ``missing_value`` as the missing data value.
|
|
339
|
-
|
|
340
|
-
num_classes (int, optional): Number of classes to use. Defaults to 10.
|
|
341
|
-
|
|
342
|
-
missing_value (int, optional): Missing data value to replace with ``[np.nan] * num_classes``\. Defaults to -9.
|
|
343
|
-
Returns:
|
|
344
|
-
numpy.ndarray: Multi-class one-hot encoded data of shape (n_samples, n_features, num_classes), with missing values encoded as np.nan.
|
|
345
|
-
"""
|
|
346
|
-
int_cats, ohe_arr = np.arange(num_classes), np.eye(num_classes)
|
|
347
|
-
mappings = dict(zip(int_cats, ohe_arr))
|
|
348
|
-
mappings[missing_value] = np.array([np.nan] * num_classes)
|
|
349
|
-
|
|
350
|
-
Xt = np.zeros(shape=(X.shape[0], X.shape[1], num_classes))
|
|
351
|
-
for row in np.arange(X.shape[0]):
|
|
352
|
-
Xt[row] = [mappings[enc] for enc in X[row]]
|
|
353
|
-
return Xt
|
|
354
|
-
|
|
355
|
-
def _fill(self, data, missing_mask, missing_value=-1):
|
|
356
|
-
"""Mask missing data as ``missing_value``\.
|
|
357
|
-
|
|
358
|
-
Args:
|
|
359
|
-
data (numpy.ndarray): Input with missing values of shape (n_samples, n_features, num_classes).
|
|
360
|
-
|
|
361
|
-
missing_mask (np.ndarray(bool)): Missing data mask with True corresponding to a missing value.
|
|
362
|
-
|
|
363
|
-
missing_value (int): Value to set missing data to. If a list is provided, then its length should equal the number of one-hot classes.
|
|
364
|
-
"""
|
|
365
|
-
if self.num_classes > 1:
|
|
366
|
-
missing_value = [missing_value] * self.num_classes
|
|
367
|
-
data[missing_mask] = missing_value
|
|
368
|
-
return data
|
|
369
|
-
|
|
370
|
-
def _get_masks(self, X):
|
|
371
|
-
"""Format the provided target data for use with UBP/NLPCA.
|
|
372
|
-
|
|
373
|
-
Args:
|
|
374
|
-
X (numpy.ndarray(float)): Input data that will be used as the target of shape (n_samples, n_features, num_classes).
|
|
375
|
-
|
|
376
|
-
Returns:
|
|
377
|
-
numpy.ndarray(float): Missing data mask, with missing values encoded as 1's and non-missing as 0's.
|
|
378
|
-
|
|
379
|
-
numpy.ndarray(float): Observed data mask, with non-missing values encoded as 1's and missing values as 0's.
|
|
380
|
-
"""
|
|
381
|
-
missing_mask = self._create_missing_mask(X)
|
|
382
|
-
observed_mask = ~missing_mask
|
|
383
|
-
return missing_mask, observed_mask
|
|
384
|
-
|
|
385
|
-
def _create_missing_mask(self, data):
|
|
386
|
-
"""Creates a missing data mask with boolean values.
|
|
387
|
-
Args:
|
|
388
|
-
data (numpy.ndarray): Data to generate missing mask from, of shape (n_samples, n_features, n_classes).
|
|
389
|
-
Returns:
|
|
390
|
-
numpy.ndarray(bool): Boolean mask of missing values of shape (n_samples, n_features), with True corresponding to a missing data point.
|
|
391
|
-
"""
|
|
392
|
-
return np.isnan(data).all(axis=2)
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
class MLPTargetTransformer(BaseEstimator, TransformerMixin):
|
|
396
|
-
"""Transformer to format UBP / NLPCA target data both before and after model fitting."""
|
|
397
|
-
|
|
398
|
-
def fit(self, y):
|
|
399
|
-
"""Fit 012-encoded target data.
|
|
400
|
-
|
|
401
|
-
Args:
|
|
402
|
-
y (numpy.ndarray): Target data that is 012-encoded.
|
|
403
|
-
|
|
404
|
-
Returns:
|
|
405
|
-
self: Class instance.
|
|
406
|
-
"""
|
|
407
|
-
y = misc.validate_input_type(y, return_type="array")
|
|
408
|
-
|
|
409
|
-
# Original 012-encoded y
|
|
410
|
-
self.y_decoded_ = y
|
|
411
|
-
|
|
412
|
-
y_train = encode_onehot(y)
|
|
413
|
-
|
|
414
|
-
# Get missing and observed data boolean masks.
|
|
415
|
-
self.missing_mask_, self.observed_mask_ = self._get_masks(y_train)
|
|
416
|
-
|
|
417
|
-
# To accommodate multiclass-multioutput.
|
|
418
|
-
self.n_outputs_expected_ = 1
|
|
419
|
-
|
|
420
|
-
return self
|
|
421
|
-
|
|
422
|
-
def transform(self, y):
|
|
423
|
-
"""Transform y_true to one-hot encoded.
|
|
424
|
-
|
|
425
|
-
Accommodates multiclass-multioutput targets.
|
|
426
|
-
|
|
427
|
-
Args:
|
|
428
|
-
y (numpy.ndarray): One-hot encoded target data.
|
|
429
|
-
|
|
430
|
-
Returns:
|
|
431
|
-
numpy.ndarray: y_true target data.
|
|
432
|
-
"""
|
|
433
|
-
y = misc.validate_input_type(y, return_type="array")
|
|
434
|
-
y_train = encode_onehot(y)
|
|
435
|
-
return self._fill(y_train, self.missing_mask_)
|
|
436
|
-
|
|
437
|
-
def inverse_transform(self, y):
|
|
438
|
-
"""Decode y_pred from one-hot to 012-based encoding.
|
|
439
|
-
|
|
440
|
-
This allows sklearn.metrics to be used.
|
|
441
|
-
|
|
442
|
-
Args:
|
|
443
|
-
y (numpy.ndarray): One-hot encoded predicted probabilities after model fitting.
|
|
444
|
-
|
|
445
|
-
Returns:
|
|
446
|
-
numpy.ndarray: y predictions in same format as y_true.
|
|
447
|
-
"""
|
|
448
|
-
# VAE has tuple output
|
|
449
|
-
if isinstance(y, tuple):
|
|
450
|
-
y = y[0]
|
|
451
|
-
|
|
452
|
-
# Return predictions.
|
|
453
|
-
return tf.nn.softmax(y).numpy()
|
|
454
|
-
|
|
455
|
-
def _fill(self, data, missing_mask, missing_value=-1, num_classes=3):
|
|
456
|
-
"""Mask missing data as ``missing_value``\.
|
|
457
|
-
|
|
458
|
-
Args:
|
|
459
|
-
data (numpy.ndarray): Input with missing values of shape (n_samples, n_features, num_classes).
|
|
460
|
-
|
|
461
|
-
missing_mask (np.ndarray(bool)): Missing data mask with True corresponding to a missing value.
|
|
462
|
-
|
|
463
|
-
missing_value (int): Value to set missing data to. If a list is provided, then its length should equal the number of one-hot classes. Defaults to -1.
|
|
464
|
-
|
|
465
|
-
num_classes (int): Number of classes in dataset. Defaults to 3.
|
|
466
|
-
"""
|
|
467
|
-
if num_classes > 1:
|
|
468
|
-
missing_value = [missing_value] * num_classes
|
|
469
|
-
data[missing_mask] = missing_value
|
|
470
|
-
return data
|
|
471
|
-
|
|
472
|
-
def _get_masks(self, X):
|
|
473
|
-
"""Format the provided target data for use with UBP/NLPCA.
|
|
474
|
-
|
|
475
|
-
Args:
|
|
476
|
-
X (numpy.ndarray(float)): Input data that will be used as the target.
|
|
477
|
-
|
|
478
|
-
Returns:
|
|
479
|
-
numpy.ndarray(float): Missing data mask, with missing values encoded as 1's and non-missing as 0's.
|
|
480
|
-
|
|
481
|
-
numpy.ndarray(float): Observed data mask, with non-missing values encoded as 1's and missing values as 0's.
|
|
482
|
-
"""
|
|
483
|
-
missing_mask = self._create_missing_mask(X)
|
|
484
|
-
observed_mask = ~missing_mask
|
|
485
|
-
return missing_mask, observed_mask
|
|
486
|
-
|
|
487
|
-
def _create_missing_mask(self, data):
|
|
488
|
-
"""Creates a missing data mask with boolean values.
|
|
489
|
-
Args:
|
|
490
|
-
data (numpy.ndarray): Data to generate missing mask from, of shape (n_samples, n_features, n_classes).
|
|
491
|
-
Returns:
|
|
492
|
-
numpy.ndarray(bool): Boolean mask of missing values of shape (n_samples, n_features), with True corresponding to a missing data point.
|
|
493
|
-
"""
|
|
494
|
-
return np.isnan(data).all(axis=2)
|
|
495
|
-
|
|
496
|
-
def _decode(self, y):
|
|
497
|
-
"""Evaluate UBP / NLPCA predictions by calculating the highest predicted value.
|
|
498
|
-
|
|
499
|
-
Calculates the highest predicted value for each row vector and each class, setting the most likely class to 1.0.
|
|
500
|
-
|
|
501
|
-
Args:
|
|
502
|
-
y (numpy.ndarray): Input one-hot encoded data.
|
|
503
|
-
|
|
504
|
-
Returns:
|
|
505
|
-
numpy.ndarray: Imputed one-hot encoded values.
|
|
506
|
-
"""
|
|
507
|
-
Xprob = y
|
|
508
|
-
Xt = np.apply_along_axis(mle, axis=2, arr=Xprob)
|
|
509
|
-
Xpred = np.argmax(Xt, axis=2)
|
|
510
|
-
Xtrue = np.argmax(y, axis=2)
|
|
511
|
-
Xdecoded = np.zeros((Xpred.shape[0], Xpred.shape[1]))
|
|
512
|
-
for idx in np.arange(Xdecoded):
|
|
513
|
-
imputed_idx = np.where(self.observed_mask_[idx] == 0)
|
|
514
|
-
known_idx = np.nonzero(self.observed_mask_[idx])
|
|
515
|
-
Xdecoded[idx, imputed_idx] = Xpred[idx, imputed_idx]
|
|
516
|
-
Xdecoded[idx, known_idx] = Xtrue[idx, known_idx]
|
|
517
|
-
return Xdecoded.astype("int8")
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
class UBPTargetTransformer(BaseEstimator, TransformerMixin):
|
|
521
|
-
"""Transformer to format UBP / NLPCA target data both before model fitting.
|
|
522
|
-
|
|
523
|
-
Examples:
|
|
524
|
-
>>> ubp_tt = UBPTargetTransformer()
|
|
525
|
-
>>> y_train = ubp_tt.fit_transform(y)
|
|
526
|
-
"""
|
|
527
|
-
|
|
528
|
-
def fit(self, y):
|
|
529
|
-
"""Fit 012-encoded target data.
|
|
530
|
-
|
|
531
|
-
Args:
|
|
532
|
-
y (numpy.ndarray): Target data that is 012-encoded, of shape (n_samples, n_features).
|
|
533
|
-
|
|
534
|
-
Returns:
|
|
535
|
-
self: Class instance.
|
|
536
|
-
"""
|
|
537
|
-
y = misc.validate_input_type(y, return_type="array")
|
|
538
|
-
|
|
539
|
-
# Original 012-encoded y
|
|
540
|
-
self.y_decoded_ = y
|
|
541
|
-
|
|
542
|
-
# One-hot encode y.
|
|
543
|
-
y_train = encode_onehot(y)
|
|
544
|
-
|
|
545
|
-
# Get missing and observed data boolean masks.
|
|
546
|
-
self.missing_mask_, self.observed_mask_ = self._get_masks(y_train)
|
|
547
|
-
|
|
548
|
-
# To accommodate multiclass-multioutput.
|
|
549
|
-
self.n_outputs_expected_ = 1
|
|
550
|
-
|
|
551
|
-
return self
|
|
552
|
-
|
|
553
|
-
def transform(self, y):
|
|
554
|
-
"""Transform 012-encoded target to one-hot encoded format.
|
|
555
|
-
|
|
556
|
-
Accommodates multiclass-multioutput targets.
|
|
557
|
-
|
|
558
|
-
Args:
|
|
559
|
-
y (numpy.ndarray): One-hot encoded target data of shape (n_samples, n_features).
|
|
560
|
-
|
|
561
|
-
Returns:
|
|
562
|
-
numpy.ndarray: y_true target data.
|
|
563
|
-
"""
|
|
564
|
-
y = misc.validate_input_type(y, return_type="array")
|
|
565
|
-
y_train = encode_onehot(y)
|
|
566
|
-
return self._fill(y_train, self.missing_mask_)
|
|
567
|
-
|
|
568
|
-
def inverse_transform(self, y):
|
|
569
|
-
"""Decode y_predicted from one-hot to 012-integer encoding.
|
|
570
|
-
|
|
571
|
-
Performs a softmax activation for multiclass classification.
|
|
572
|
-
|
|
573
|
-
This allows sklearn.metrics to be used.
|
|
574
|
-
|
|
575
|
-
Args:
|
|
576
|
-
y (numpy.ndarray): One-hot encoded predicted probabilities after model fitting, of shape (n_samples, n_features, num_classes).
|
|
577
|
-
|
|
578
|
-
Returns:
|
|
579
|
-
numpy.ndarray: y predictions in same format as y_true (n_samples, n_features).
|
|
580
|
-
"""
|
|
581
|
-
return tf.nn.softmax(y).numpy()
|
|
582
|
-
|
|
583
|
-
def _fill(self, data, missing_mask, missing_value=-1, num_classes=3):
|
|
584
|
-
"""Mask missing data as ``missing_value``\.
|
|
585
|
-
|
|
586
|
-
Args:
|
|
587
|
-
data (numpy.ndarray): Input with missing values of shape (n_samples, n_features, num_classes).
|
|
588
|
-
|
|
589
|
-
missing_mask (np.ndarray(bool)): Missing data mask with True corresponding to a missing value, of shape (n_samples, n_features).
|
|
590
|
-
|
|
591
|
-
missing_value (int, optional): Value to set missing data to. If a list is provided, then its length should equal the number of one-hot classes. Defaults to -1.
|
|
592
|
-
|
|
593
|
-
num_classes (int, optional): Number of classes to use. Defaults to 3.
|
|
594
|
-
"""
|
|
595
|
-
if num_classes > 1:
|
|
596
|
-
missing_value = [missing_value] * num_classes
|
|
597
|
-
data[missing_mask] = missing_value
|
|
598
|
-
return data
|
|
599
|
-
|
|
600
|
-
def _get_masks(self, y):
|
|
601
|
-
"""Format the provided target data for use with UBP/NLPCA models.
|
|
602
|
-
|
|
603
|
-
Args:
|
|
604
|
-
y (numpy.ndarray(float)): Input data that will be used as the target of shape (n_samples, n_features, num_classes).
|
|
605
|
-
|
|
606
|
-
Returns:
|
|
607
|
-
numpy.ndarray(float): Missing data mask, with missing values encoded as 1's and non-missing as 0's.
|
|
608
|
-
|
|
609
|
-
numpy.ndarray(float): Observed data mask, with non-missing values encoded as 1's and missing values as 0's.
|
|
610
|
-
"""
|
|
611
|
-
missing_mask = self._create_missing_mask(y)
|
|
612
|
-
observed_mask = ~missing_mask
|
|
613
|
-
return missing_mask, observed_mask
|
|
614
|
-
|
|
615
|
-
def _create_missing_mask(self, data):
|
|
616
|
-
"""Creates a missing data mask with boolean values.
|
|
617
|
-
|
|
618
|
-
Args:
|
|
619
|
-
data (numpy.ndarray): Data to generate missing mask from, of shape (n_samples, n_features, n_classes).
|
|
620
|
-
|
|
621
|
-
Returns:
|
|
622
|
-
numpy.ndarray(bool): Boolean mask of missing values of shape (n_samples, n_features), with True corresponding to a missing data point.
|
|
623
|
-
"""
|
|
624
|
-
return np.isnan(data).all(axis=2)
|
|
625
|
-
|
|
626
|
-
def _decode(self, y):
|
|
627
|
-
"""Evaluate UBP/NLPCA predictions by calculating the argmax.
|
|
628
|
-
|
|
629
|
-
Calculates the highest predicted value for each row vector and each class, setting the most likely class to 1.0.
|
|
630
|
-
|
|
631
|
-
Args:
|
|
632
|
-
y (numpy.ndarray): Input one-hot encoded data of shape (n_samples, n_features, num_classes).
|
|
633
|
-
|
|
634
|
-
Returns:
|
|
635
|
-
numpy.ndarray: Imputed one-hot encoded values.
|
|
636
|
-
"""
|
|
637
|
-
Xprob = y
|
|
638
|
-
Xt = np.apply_along_axis(mle, axis=2, arr=Xprob)
|
|
639
|
-
Xpred = np.argmax(Xt, axis=2)
|
|
640
|
-
Xtrue = np.argmax(y, axis=2)
|
|
641
|
-
Xdecoded = np.zeros((Xpred.shape[0], Xpred.shape[1]))
|
|
642
|
-
for idx in np.arange(Xdecoded):
|
|
643
|
-
imputed_idx = np.where(self.observed_mask_[idx] == 0)
|
|
644
|
-
known_idx = np.nonzero(self.observed_mask_[idx])
|
|
645
|
-
Xdecoded[idx, imputed_idx] = Xpred[idx, imputed_idx]
|
|
646
|
-
Xdecoded[idx, known_idx] = Xtrue[idx, known_idx]
|
|
647
|
-
return Xdecoded.astype("int8")
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
class SimGenotypeDataTransformer(BaseEstimator, TransformerMixin):
|
|
651
|
-
"""Simulate missing data on genotypes read/ encoded in a GenotypeData object.
|
|
652
|
-
|
|
653
|
-
Copies metadata from a GenotypeData object and simulates a user-specified proportion of missing data.
|
|
654
|
-
|
|
655
|
-
Args:
|
|
656
|
-
genotype_data (GenotypeData object): GenotypeData instance.
|
|
657
|
-
|
|
658
|
-
prop_missing (float, optional): Proportion of missing data desired in output. Defaults to 0.1
|
|
659
|
-
|
|
660
|
-
strategy (str, optional): Strategy for simulating missing data. May be one of: "nonrandom", "nonrandom_weighted", "random_weighted", "random_weighted_inv", or "random". When set to "nonrandom", branches from GenotypeData.guidetree will be randomly sampled to generate missing data on descendant nodes. For "nonrandom_weighted", missing data will be placed on nodes proportionally to their branch lengths (e.g., to generate data distributed as might be the case with mutation-disruption of RAD sites). Defaults to "random"
|
|
661
|
-
|
|
662
|
-
missing_val (int, optional): Value that represents missing data. Defaults to -9.
|
|
663
|
-
|
|
664
|
-
mask_missing (bool, optional): True if you want to skip original missing values when simulating new missing data, False otherwise. Defaults to True.
|
|
665
|
-
|
|
666
|
-
verbose (int, optional): Verbosity level; messages are printed when greater than 0. Defaults to 0.
|
|
667
|
-
|
|
668
|
-
tol (float): Tolerance to reach proportion specified in self.prop_missing. Defaults to 1 / (num_snps * num_inds), i.e., a single genotype position.
|
|
669
|
-
|
|
670
|
-
max_tries (int): Maximum number of tries to reach targeted missing data proportion within specified tol. If None, num_inds will be used. Defaults to None.
|
|
671
|
-
|
|
672
|
-
Attributes:
|
|
673
|
-
|
|
674
|
-
original_missing_mask_ (numpy.ndarray): Array with boolean mask for original missing locations.
|
|
675
|
-
|
|
676
|
-
simulated_missing_mask_ (numpy.ndarray): Array with boolean mask for simulated missing locations, excluding the original ones.
|
|
677
|
-
|
|
678
|
-
all_missing_mask_ (numpy.ndarray): Array with boolean mask for all missing locations, including both simulated and original.
|
|
679
|
-
|
|
680
|
-
Properties:
|
|
681
|
-
missing_count (int): Number of genotypes masked by chosen missing data strategy
|
|
682
|
-
|
|
683
|
-
prop_missing_real (float): True proportion of missing data generated using chosen strategy
|
|
684
|
-
|
|
685
|
-
mask (numpy.ndarray): 2-dimensional array tracking the indices of sampled missing data sites (n_samples, n_sites)
|
|
15
|
+
prop_missing (float): Proportion of *known* loci to mask (0..1).
|
|
16
|
+
strategy (Literal): Strategy name.
|
|
17
|
+
missing_val (int): Missing code value (default: -9).
|
|
18
|
+
seed (int | None): RNG seed.
|
|
19
|
+
logger (logging.Logger | None): Logger for messages.
|
|
20
|
+
het_boost (float): Multiplier for heterozygotes in inv-genotype mode.
|
|
686
21
|
"""
|
|
687
22
|
|
|
688
23
|
def __init__(
|
|
689
24
|
self,
|
|
690
|
-
genotype_data,
|
|
691
25
|
*,
|
|
692
|
-
prop_missing=0.1,
|
|
693
|
-
strategy="random",
|
|
694
|
-
missing_val
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
self.genotype_data = genotype_data
|
|
701
|
-
self.prop_missing = prop_missing
|
|
26
|
+
prop_missing: float = 0.1,
|
|
27
|
+
strategy: Literal["random", "random_inv_genotype"] = "random",
|
|
28
|
+
missing_val: int = -1,
|
|
29
|
+
seed: int | None = None,
|
|
30
|
+
logger: logging.Logger | None = None,
|
|
31
|
+
het_boost: float = 1.0,
|
|
32
|
+
):
|
|
33
|
+
self.prop_missing = float(prop_missing)
|
|
702
34
|
self.strategy = strategy
|
|
703
|
-
self.missing_val = missing_val
|
|
704
|
-
self.
|
|
705
|
-
self.
|
|
706
|
-
self.
|
|
707
|
-
self.
|
|
708
|
-
|
|
709
|
-
def fit(self, X):
|
|
710
|
-
"""Fit to input data X by simulating missing data.
|
|
35
|
+
self.missing_val = int(missing_val)
|
|
36
|
+
self.seed = seed
|
|
37
|
+
self.rng = np.random.default_rng(seed)
|
|
38
|
+
self.het_boost = float(het_boost)
|
|
39
|
+
self.logger = logger or logging.getLogger(__name__)
|
|
711
40
|
|
|
712
|
-
|
|
41
|
+
def fit(self, X, y=None) -> "SimGenotypeDataTransformer":
|
|
42
|
+
"""Stateless.
|
|
713
43
|
|
|
714
44
|
Args:
|
|
715
|
-
X (
|
|
716
|
-
|
|
717
|
-
Raises:
|
|
718
|
-
TypeError: SimGenotypeData.tree must not be NoneType when using strategy="nonrandom" or "nonrandom_weighted".
|
|
719
|
-
|
|
720
|
-
ValueError: Invalid ``strategy`` parameter provided.
|
|
45
|
+
X (np.ndarray): (n_samples, n_features), integer codes {0..9} or <0 as missing.
|
|
46
|
+
y: Ignored.
|
|
721
47
|
"""
|
|
722
|
-
X = misc.validate_input_type(X, return_type="array").astype("float32")
|
|
723
|
-
|
|
724
|
-
if self.verbose > 0:
|
|
725
|
-
print(
|
|
726
|
-
f"\nAdding {self.prop_missing} missing data per column "
|
|
727
|
-
f"using strategy: {self.strategy}"
|
|
728
|
-
)
|
|
729
|
-
|
|
730
|
-
if np.all(np.isnan(np.array([self.missing_val])) == False):
|
|
731
|
-
X[X == self.missing_val] = np.nan
|
|
732
|
-
|
|
733
|
-
self.original_missing_mask_ = np.isnan(X)
|
|
734
|
-
|
|
735
|
-
if self.strategy == "random":
|
|
736
|
-
if self.mask_missing:
|
|
737
|
-
# Get indexes where non-missing (Xobs) and missing (Xmiss).
|
|
738
|
-
Xobs = np.where(~self.original_missing_mask_.ravel())[0]
|
|
739
|
-
Xmiss = np.where(self.original_missing_mask_.ravel())[0]
|
|
740
|
-
|
|
741
|
-
# Generate mask of 0's (non-missing) and 1's (missing).
|
|
742
|
-
obs_mask = np.random.choice(
|
|
743
|
-
[0, 1],
|
|
744
|
-
size=Xobs.size,
|
|
745
|
-
p=((1 - self.prop_missing), self.prop_missing),
|
|
746
|
-
).astype(bool)
|
|
747
|
-
|
|
748
|
-
# Make missing data mask.
|
|
749
|
-
mask = np.zeros(X.size)
|
|
750
|
-
mask[Xobs] = obs_mask
|
|
751
|
-
mask[Xmiss] = 1
|
|
752
|
-
|
|
753
|
-
# Reshape from raveled to 2D.
|
|
754
|
-
# With strategy=="random", mask_ is equal to all_missing_.
|
|
755
|
-
self.mask_ = np.reshape(mask, X.shape)
|
|
756
|
-
|
|
757
|
-
else:
|
|
758
|
-
# Generate mask of 0's (non-missing) and 1's (missing).
|
|
759
|
-
self.mask_ = np.random.choice(
|
|
760
|
-
[0, 1],
|
|
761
|
-
size=X.shape,
|
|
762
|
-
p=((1 - self.prop_missing), self.prop_missing),
|
|
763
|
-
).astype(bool)
|
|
764
|
-
|
|
765
|
-
# Make sure no entirely missing columns were simulated.
|
|
766
|
-
self._validate_mask()
|
|
767
|
-
|
|
768
|
-
elif self.strategy == "random_weighted":
|
|
769
|
-
self.mask_ = self.random_weighted_missing_data(X, inv=False)
|
|
770
|
-
|
|
771
|
-
elif self.strategy == "random_weighted_inv":
|
|
772
|
-
self.mask_ = self.random_weighted_missing_data(X, inv=True)
|
|
773
|
-
|
|
774
|
-
elif (
|
|
775
|
-
self.strategy == "nonrandom"
|
|
776
|
-
or self.strategy == "nonrandom_weighted"
|
|
777
|
-
):
|
|
778
|
-
if self.genotype_data.tree is None:
|
|
779
|
-
raise TypeError(
|
|
780
|
-
"SimGenotypeData.tree cannot be NoneType when "
|
|
781
|
-
"strategy='nonrandom' or 'nonrandom_weighted'"
|
|
782
|
-
)
|
|
783
|
-
|
|
784
|
-
mask = np.full_like(X, 0.0, dtype=bool)
|
|
785
|
-
|
|
786
|
-
if self.strategy == "nonrandom_weighted":
|
|
787
|
-
weighted = True
|
|
788
|
-
else:
|
|
789
|
-
weighted = False
|
|
790
|
-
|
|
791
|
-
sample_map = dict()
|
|
792
|
-
for i, sample in enumerate(self.genotype_data.samples):
|
|
793
|
-
sample_map[sample] = i
|
|
794
|
-
|
|
795
|
-
# if no tolerance provided, set to 1 snp position
|
|
796
|
-
if self.tol is None:
|
|
797
|
-
self.tol = 1.0 / mask.size
|
|
798
|
-
|
|
799
|
-
# if no max_tries provided, set to # inds
|
|
800
|
-
if self.max_tries is None:
|
|
801
|
-
self.max_tries = mask.shape[0]
|
|
802
|
-
|
|
803
|
-
filled = False
|
|
804
|
-
while not filled:
|
|
805
|
-
# Get list of samples from tree
|
|
806
|
-
samples = self._sample_tree(
|
|
807
|
-
internal_only=False, skip_root=True, weighted=weighted
|
|
808
|
-
)
|
|
809
|
-
|
|
810
|
-
# Convert to row indices
|
|
811
|
-
rows = [sample_map[i] for i in samples]
|
|
812
|
-
|
|
813
|
-
# Randomly sample a column
|
|
814
|
-
col_idx = np.random.randint(0, mask.shape[1])
|
|
815
|
-
sampled_col = copy.copy(mask[:, col_idx])
|
|
816
|
-
miss_mask = copy.copy(self.original_missing_mask_[:, col_idx])
|
|
817
|
-
|
|
818
|
-
# Mask column
|
|
819
|
-
sampled_col[rows] = True
|
|
820
|
-
|
|
821
|
-
# If original was missing, set back to False.
|
|
822
|
-
if self.mask_missing:
|
|
823
|
-
sampled_col[miss_mask] = False
|
|
824
|
-
|
|
825
|
-
# check that column is not 100% missing now
|
|
826
|
-
# if yes, sample again
|
|
827
|
-
if np.sum(sampled_col) == sampled_col.size:
|
|
828
|
-
continue
|
|
829
|
-
|
|
830
|
-
# if not, set values in mask matrix
|
|
831
|
-
else:
|
|
832
|
-
mask[:, col_idx] = sampled_col
|
|
833
|
-
|
|
834
|
-
# if this addition pushes missing % > self.prop_missing,
|
|
835
|
-
# check previous prop_missing, remove masked samples from
|
|
836
|
-
# this column until closest to target prop_missing
|
|
837
|
-
current_prop = np.sum(mask) / mask.size
|
|
838
|
-
if abs(current_prop - self.prop_missing) <= self.tol:
|
|
839
|
-
filled = True
|
|
840
|
-
break
|
|
841
|
-
elif current_prop > self.prop_missing:
|
|
842
|
-
tries = 0
|
|
843
|
-
while (
|
|
844
|
-
abs(current_prop - self.prop_missing) > self.tol
|
|
845
|
-
and tries < self.max_tries
|
|
846
|
-
):
|
|
847
|
-
r = np.random.randint(0, mask.shape[0])
|
|
848
|
-
c = np.random.randint(0, mask.shape[1])
|
|
849
|
-
mask[r, c] = False
|
|
850
|
-
tries += 1
|
|
851
|
-
current_prop = np.sum(mask) / mask.size
|
|
852
|
-
|
|
853
|
-
filled = True
|
|
854
|
-
else:
|
|
855
|
-
continue
|
|
856
|
-
|
|
857
|
-
# With strategy=="nonrandom" or "nonrandom_weighted",
|
|
858
|
-
# mask_ is equal to sim_missing_mask_ if mask_missing is True.
|
|
859
|
-
# Otherwise it is equal to all_missing_.
|
|
860
|
-
self.mask_ = mask
|
|
861
|
-
|
|
862
|
-
self._validate_mask()
|
|
863
|
-
|
|
864
|
-
else:
|
|
865
|
-
raise ValueError(
|
|
866
|
-
"Invalid SimGenotypeData.strategy value:", self.strategy
|
|
867
|
-
)
|
|
868
|
-
|
|
869
|
-
# Get all missing values.
|
|
870
|
-
self.all_missing_mask_ = np.logical_or(
|
|
871
|
-
self.mask_, self.original_missing_mask_
|
|
872
|
-
)
|
|
873
|
-
# Get values where original value was not missing and simulated
|
|
874
|
-
# data is missing.
|
|
875
|
-
self.sim_missing_mask_ = np.logical_and(
|
|
876
|
-
self.all_missing_mask_, self.original_missing_mask_ == False
|
|
877
|
-
)
|
|
878
|
-
|
|
879
|
-
self._validate_mask(mask=self.mask_missing)
|
|
880
|
-
|
|
881
48
|
return self
|
|
882
49
|
|
|
883
|
-
def transform(self, X):
|
|
884
|
-
"""
|
|
50
|
+
def transform(self, X: np.ndarray) -> tuple[np.ndarray, dict]:
|
|
51
|
+
"""Apply missing-data simulation on a 2D genotype matrix.
|
|
885
52
|
|
|
886
53
|
Args:
|
|
887
|
-
X (
|
|
54
|
+
X (np.ndarray): (n_samples, n_features), integer codes {0..9} or <0 as missing.
|
|
888
55
|
|
|
889
56
|
Returns:
|
|
890
|
-
|
|
57
|
+
tuple[np.ndarray, dict]: (X_masked, masks) where masks has keys: 'original': original missing (boolean 2D). 'simulated': loci masked here (boolean 2D). 'all': union of original + simulated (boolean 2D)
|
|
891
58
|
"""
|
|
892
|
-
X
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
def accuracy(self, X_true, X_pred):
|
|
898
|
-
"""Calculate imputation accuracy of the simulated genotypes.
|
|
59
|
+
if X.ndim != 2:
|
|
60
|
+
msg = f"X must be 2D, got shape {X.shape}"
|
|
61
|
+
self.logger.error(msg)
|
|
62
|
+
raise ValueError(msg)
|
|
899
63
|
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
X_pred (np.ndarray): Imputed values.
|
|
64
|
+
X = np.asarray(X)
|
|
65
|
+
original_mask = X < 0
|
|
904
66
|
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
masked_sites = np.sum(self.sim_missing_mask_)
|
|
909
|
-
num_correct = np.sum(
|
|
910
|
-
X_true[self.sim_missing_mask_] == X_pred[self.sim_missing_mask_]
|
|
911
|
-
)
|
|
912
|
-
return num_correct / masked_sites
|
|
67
|
+
sim_mask = self._simulate_missing_mask(X, original_mask)
|
|
68
|
+
sim_mask = sim_mask & (~original_mask)
|
|
69
|
+
sim_mask = self._validate_mask(sim_mask)
|
|
913
70
|
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
|
|
917
|
-
Args:
|
|
918
|
-
X_true (np.ndarray): True values.
|
|
919
|
-
|
|
920
|
-
X_pred (np.ndarray): Imputed values.
|
|
921
|
-
|
|
922
|
-
Returns:
|
|
923
|
-
List[float]: List of AUC-ROC scores in order of: 0,1,2.
|
|
924
|
-
List[float]: List of precision scores in order of: 0,1,2.
|
|
925
|
-
List[float]: List of recall scores in order of: 0,1,2.
|
|
926
|
-
List[float]: List of average precision scores in order of 0,1,2.
|
|
927
|
-
|
|
928
|
-
"""
|
|
929
|
-
y_true = X_true[self.sim_missing_mask_]
|
|
930
|
-
y_pred = X_pred[self.sim_missing_mask_]
|
|
931
|
-
|
|
932
|
-
# Binarize the output
|
|
933
|
-
y_true_bin = label_binarize(y_true, classes=[0, 1, 2])
|
|
934
|
-
y_pred_bin = label_binarize(y_pred, classes=[0, 1, 2])
|
|
935
|
-
|
|
936
|
-
# Initialize lists to hold the scores for each class
|
|
937
|
-
auc_roc_scores = []
|
|
938
|
-
precision_scores = []
|
|
939
|
-
recall_scores = []
|
|
940
|
-
avg_precision_scores = []
|
|
941
|
-
|
|
942
|
-
for i in range(y_true_bin.shape[1]):
|
|
943
|
-
# AUC-ROC score
|
|
944
|
-
auc_roc = roc_auc_score(
|
|
945
|
-
y_true_bin[:, i], y_pred_bin[:, i], average="weighted"
|
|
946
|
-
)
|
|
947
|
-
auc_roc_scores.append(auc_roc)
|
|
948
|
-
|
|
949
|
-
# Precision-recall score
|
|
950
|
-
precision, recall, _, _ = precision_recall_fscore_support(
|
|
951
|
-
y_true_bin[:, i], y_pred_bin[:, i], average="weighted"
|
|
952
|
-
)
|
|
953
|
-
precision_scores.append(precision)
|
|
954
|
-
recall_scores.append(recall)
|
|
955
|
-
|
|
956
|
-
# Average precision score
|
|
957
|
-
avg_precision = average_precision_score(
|
|
958
|
-
y_true_bin[:, i], y_pred_bin[:, i], average="weighted"
|
|
959
|
-
)
|
|
960
|
-
avg_precision_scores.append(avg_precision)
|
|
71
|
+
all_mask = original_mask | sim_mask
|
|
72
|
+
Xt = X.copy()
|
|
73
|
+
Xt[all_mask] = self.missing_val
|
|
961
74
|
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
precision_scores,
|
|
965
|
-
recall_scores,
|
|
966
|
-
avg_precision_scores,
|
|
967
|
-
)
|
|
75
|
+
masks = {"original": original_mask, "simulated": sim_mask, "all": all_mask}
|
|
76
|
+
return Xt, masks
|
|
968
77
|
|
|
969
|
-
|
|
970
|
-
|
|
78
|
+
# ---- strategies ----
|
|
79
|
+
def _simulate_missing_mask(
|
|
80
|
+
self, X: np.ndarray, original_mask: np.ndarray
|
|
81
|
+
) -> np.ndarray:
|
|
82
|
+
"""Simulate missingness mask based on the chosen strategy.
|
|
971
83
|
|
|
972
84
|
Args:
|
|
973
|
-
X (np.ndarray):
|
|
974
|
-
|
|
975
|
-
inv (bool, optional): If True, then biases towards choosing majority alleles. If False, then generates a stratified random sample (class proportions ~= full dataset) Defaults to False.
|
|
85
|
+
X (np.ndarray): Input genotype matrix.
|
|
86
|
+
original_mask (np.ndarray): Boolean mask of original missing values.
|
|
976
87
|
|
|
977
88
|
Returns:
|
|
978
|
-
np.ndarray:
|
|
979
|
-
|
|
89
|
+
np.ndarray: Simulated missing mask.
|
|
980
90
|
"""
|
|
981
|
-
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
class_weights = 1 / counts
|
|
986
|
-
else:
|
|
987
|
-
class_weights = counts
|
|
988
|
-
# Normalize class weights
|
|
989
|
-
class_weights = class_weights / sum(class_weights)
|
|
990
|
-
|
|
991
|
-
# Compute mask
|
|
992
|
-
if self.mask_missing:
|
|
993
|
-
# Get indexes where non-missing (Xobs) and missing (Xmiss)
|
|
994
|
-
Xobs = np.where(~self.original_missing_mask_.ravel())[0]
|
|
995
|
-
Xmiss = np.where(self.original_missing_mask_.ravel())[0]
|
|
91
|
+
if self.strategy == "random":
|
|
92
|
+
return self._simulate_random(original_mask)
|
|
93
|
+
elif self.strategy == "random_inv_genotype":
|
|
94
|
+
return self._simulate_inv_genotype(X, original_mask)
|
|
996
95
|
|
|
997
|
-
|
|
998
|
-
|
|
999
|
-
|
|
1000
|
-
)
|
|
1001
|
-
obs_mask = (obs_mask == classes[:, None]).argmax(axis=0)
|
|
96
|
+
msg = "strategy must be one of {'random','random_inv_genotype'}"
|
|
97
|
+
self.logger.error(msg)
|
|
98
|
+
raise ValueError(msg)
|
|
1002
99
|
|
|
1003
|
-
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
100
|
+
def _simulate_random(self, original_mask: np.ndarray) -> np.ndarray:
|
|
101
|
+
rows, cols = np.where(~original_mask)
|
|
102
|
+
n_known = len(rows)
|
|
103
|
+
mask = np.zeros_like(original_mask, dtype=bool)
|
|
1007
104
|
|
|
1008
|
-
|
|
1009
|
-
|
|
1010
|
-
else:
|
|
1011
|
-
# Generate mask of 0's (non-missing) and 1's (missing)
|
|
1012
|
-
mask = np.random.choice(classes, size=X.size, p=class_weights)
|
|
1013
|
-
mask = (mask == classes[:, None]).argmax(axis=0).reshape(X.shape)
|
|
105
|
+
if n_known == 0:
|
|
106
|
+
return mask
|
|
1014
107
|
|
|
1015
|
-
|
|
1016
|
-
self.mask_ = mask
|
|
108
|
+
n_to_mask = int(np.floor(self.prop_missing * n_known))
|
|
1017
109
|
|
|
1018
|
-
|
|
110
|
+
if n_to_mask <= 0:
|
|
111
|
+
return mask
|
|
1019
112
|
|
|
113
|
+
idx = self.rng.choice(n_known, size=n_to_mask, replace=False)
|
|
114
|
+
mask[rows[idx], cols[idx]] = True
|
|
1020
115
|
return mask
|
|
1021
116
|
|
|
1022
|
-
def
|
|
1023
|
-
self,
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
skip_root=True,
|
|
1027
|
-
weighted=False,
|
|
1028
|
-
):
|
|
1029
|
-
"""Function for randomly sampling clades from SimGenotypeData.tree.
|
|
117
|
+
def _simulate_inv_genotype(
|
|
118
|
+
self, X: np.ndarray, original_mask: np.ndarray
|
|
119
|
+
) -> np.ndarray:
|
|
120
|
+
"""Simulate missingness mask inversely proportional to genotype frequencies.
|
|
1030
121
|
|
|
1031
122
|
Args:
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
tips_only (bool): Only sample from tips. Defaults to False.
|
|
1035
|
-
|
|
1036
|
-
skip_root (bool): Exclude sampling of root node. Defaults to True.
|
|
1037
|
-
|
|
1038
|
-
weighted (bool): Weight sampling by branch length. Defaults to False.
|
|
123
|
+
X (np.ndarray): Input genotype matrix.
|
|
124
|
+
original_mask (np.ndarray): Boolean mask of original missing values.
|
|
1039
125
|
|
|
1040
126
|
Returns:
|
|
1041
|
-
|
|
1042
|
-
|
|
1043
|
-
Raises:
|
|
1044
|
-
ValueError: ``tips_only`` and ``internal_only`` cannot both be True.
|
|
127
|
+
np.ndarray: Simulated missing mask (boolean, same shape as ``original_mask``). Genotype codes in ``X`` are interpreted as 0..3: homozygous (0,1,2,3); 4..9: heterozygous (0/1,0/2,0/3,1/2,1/3,2/3).
|
|
1045
128
|
"""
|
|
1046
129
|
|
|
1047
|
-
|
|
1048
|
-
|
|
130
|
+
rows, cols = np.where(~original_mask)
|
|
131
|
+
n_known = len(rows)
|
|
132
|
+
mask = np.zeros_like(original_mask, dtype=bool)
|
|
133
|
+
if n_known == 0:
|
|
134
|
+
return mask
|
|
1049
135
|
|
|
1050
|
-
#
|
|
1051
|
-
|
|
136
|
+
# Global genotype frequencies (0..9) from all known (non-missing) genotypes
|
|
137
|
+
vals = X[~original_mask].astype(int)
|
|
138
|
+
vals = vals[(vals >= 0) & (vals < 10)]
|
|
139
|
+
if vals.size == 0:
|
|
140
|
+
return self._simulate_random(original_mask)
|
|
1052
141
|
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
## node.dist is branch lengths.
|
|
1056
|
-
if skip_root:
|
|
1057
|
-
# If root node.
|
|
1058
|
-
if node.idx == self.genotype_data.tree.nnodes - 1:
|
|
1059
|
-
continue
|
|
142
|
+
cnt = np.bincount(vals, minlength=10).astype(float)
|
|
143
|
+
freqs = cnt / (cnt.sum() + 1e-12)
|
|
1060
144
|
|
|
1061
|
-
|
|
1062
|
-
|
|
1063
|
-
|
|
1064
|
-
)
|
|
145
|
+
# Candidate weights
|
|
146
|
+
geno_known = X[rows, cols].astype(int) # (n_known,)
|
|
147
|
+
inv = 1.0 / (freqs[geno_known] + 1e-12)
|
|
1065
148
|
|
|
1066
|
-
|
|
1067
|
-
|
|
1068
|
-
|
|
1069
|
-
|
|
1070
|
-
if node.is_leaf():
|
|
1071
|
-
continue
|
|
1072
|
-
node_dict[node.idx] = node.dist
|
|
1073
|
-
if weighted:
|
|
1074
|
-
s = sum(list(node_dict.values()))
|
|
1075
|
-
# Node distance (branch length) / sum of node distances.
|
|
1076
|
-
p = [i / s for i in list(node_dict.values())]
|
|
1077
|
-
node_idx = np.random.choice(list(node_dict.keys()), size=1, p=p)[0]
|
|
1078
|
-
else:
|
|
1079
|
-
# Get missing choice from random clade.
|
|
1080
|
-
node_idx = np.random.choice(list(node_dict.keys()), size=1)[0]
|
|
1081
|
-
return self.genotype_data.tree.get_tip_labels(idx=node_idx)
|
|
149
|
+
# Optional het boost (heterozygous codes are 4..9)
|
|
150
|
+
if self.het_boost != 1.0:
|
|
151
|
+
is_het = (geno_known >= 4) & (geno_known <= 9)
|
|
152
|
+
inv = inv * np.where(is_het, self.het_boost, 1.0)
|
|
1082
153
|
|
|
1083
|
-
|
|
1084
|
-
|
|
1085
|
-
|
|
1086
|
-
mask = self.mask_
|
|
1087
|
-
for i, column in enumerate(self.mask_.T):
|
|
1088
|
-
if mask:
|
|
1089
|
-
miss_mask = self.original_missing_mask_[:, i]
|
|
1090
|
-
col = column[~miss_mask]
|
|
1091
|
-
obs_idx = np.where(~miss_mask)
|
|
1092
|
-
idx = obs_idx[np.random.choice(np.arange(len(obs_idx)))]
|
|
1093
|
-
else:
|
|
1094
|
-
col = column
|
|
1095
|
-
idx = np.random.choice(np.arange(col.shape[0]))
|
|
1096
|
-
if np.sum(col) == col.size:
|
|
1097
|
-
self.mask_[idx, i] = False
|
|
154
|
+
n_to_mask = int(np.floor(self.prop_missing * n_known))
|
|
155
|
+
if n_to_mask <= 0:
|
|
156
|
+
return mask
|
|
1098
157
|
|
|
1099
|
-
|
|
1100
|
-
|
|
1101
|
-
|
|
1102
|
-
|
|
1103
|
-
mask_val = [0.0, 0.0, 0.0, 0.0]
|
|
1104
|
-
elif len(X.shape) == 2:
|
|
1105
|
-
# 012-encoded.
|
|
1106
|
-
mask_val = -9
|
|
1107
|
-
else:
|
|
1108
|
-
raise ValueError(f"Invalid shape of input X: {X.shape}")
|
|
1109
|
-
|
|
1110
|
-
Xt = X.copy()
|
|
1111
|
-
mask_boolean = self.mask_ != 0
|
|
1112
|
-
Xt[mask_boolean] = mask_val
|
|
1113
|
-
return Xt
|
|
1114
|
-
|
|
1115
|
-
def write_mask(self, filename_prefix):
|
|
1116
|
-
"""Write mask to file.
|
|
1117
|
-
|
|
1118
|
-
Args:
|
|
1119
|
-
filename_prefix (str): Prefix for the filenames to write to.
|
|
1120
|
-
"""
|
|
1121
|
-
np.save(filename_prefix + "_mask.npy", self.mask_)
|
|
1122
|
-
np.save(filename_prefix + "_original_missing_mask.npy", self.original_missing_mask_)
|
|
158
|
+
probs = inv / (inv.sum() + 1e-12)
|
|
159
|
+
idx = self.rng.choice(n_known, size=n_to_mask, replace=False, p=probs)
|
|
160
|
+
mask[rows[idx], cols[idx]] = True
|
|
161
|
+
return mask
|
|
1123
162
|
|
|
1124
|
-
def
|
|
1125
|
-
"""
|
|
163
|
+
def _validate_mask(self, mask: np.ndarray) -> np.ndarray:
|
|
164
|
+
"""Avoid fully-masked rows/columns.
|
|
1126
165
|
|
|
1127
166
|
Args:
|
|
1128
|
-
|
|
1129
|
-
|
|
1130
|
-
Returns:
|
|
1131
|
-
tuple of np.ndarray: The read masks.
|
|
1132
|
-
"""
|
|
1133
|
-
# Check if files exist
|
|
1134
|
-
if not os.path.isfile(filename_prefix + "_mask.npy"):
|
|
1135
|
-
raise FileNotFoundError(filename_prefix + "_mask.npy" + " does not exist.")
|
|
1136
|
-
if not os.path.isfile(filename_prefix + "_original_missing_mask.npy"):
|
|
1137
|
-
raise FileNotFoundError(filename_prefix + "_original_missing_mask.npy" + " does not exist.")
|
|
1138
|
-
|
|
1139
|
-
# Load mask from file
|
|
1140
|
-
self.mask_ = np.load(filename_prefix + "_mask.npy")
|
|
1141
|
-
self.original_missing_mask_ = np.load(filename_prefix + "_original_missing_mask.npy")
|
|
1142
|
-
|
|
1143
|
-
# Recalculate all_missing_mask_ from mask_ and original_missing_mask_
|
|
1144
|
-
self.all_missing_mask_ = np.logical_or(self.mask_, self.original_missing_mask_)
|
|
1145
|
-
|
|
1146
|
-
return self.mask_, self.original_missing_mask_, self.all_missing_mask_
|
|
1147
|
-
|
|
1148
|
-
|
|
1149
|
-
@property
|
|
1150
|
-
def missing_count(self) -> int:
|
|
1151
|
-
"""Count of masked genotypes in SimGenotypeData.mask
|
|
167
|
+
mask (np.ndarray): Input boolean mask.
|
|
1152
168
|
|
|
1153
169
|
Returns:
|
|
1154
|
-
|
|
170
|
+
np.ndarray: Validated mask.
|
|
1155
171
|
"""
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
|
|
1165
|
-
|
|
172
|
+
rng = self.rng
|
|
173
|
+
# columns
|
|
174
|
+
full_cols = np.where(mask.all(axis=0))[0]
|
|
175
|
+
for c in full_cols:
|
|
176
|
+
r = int(rng.integers(0, mask.shape[0]))
|
|
177
|
+
mask[r, c] = False
|
|
178
|
+
# rows
|
|
179
|
+
full_rows = np.where(mask.all(axis=1))[0]
|
|
180
|
+
for r in full_rows:
|
|
181
|
+
c = int(rng.integers(0, mask.shape[1]))
|
|
182
|
+
mask[r, c] = False
|
|
183
|
+
return mask
|