pg-sui 0.2.3__py3-none-any.whl → 1.6.16a3__py3-none-any.whl
This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their public registries.
- pg_sui-1.6.16a3.dist-info/METADATA +292 -0
- pg_sui-1.6.16a3.dist-info/RECORD +81 -0
- {pg_sui-0.2.3.dist-info → pg_sui-1.6.16a3.dist-info}/WHEEL +1 -1
- pg_sui-1.6.16a3.dist-info/entry_points.txt +4 -0
- {pg_sui-0.2.3.dist-info → pg_sui-1.6.16a3.dist-info/licenses}/LICENSE +0 -0
- pg_sui-1.6.16a3.dist-info/top_level.txt +1 -0
- pgsui/__init__.py +35 -54
- pgsui/_version.py +34 -0
- pgsui/cli.py +922 -0
- pgsui/data_processing/__init__.py +0 -0
- pgsui/data_processing/config.py +565 -0
- pgsui/data_processing/containers.py +1436 -0
- pgsui/data_processing/transformers.py +557 -907
- pgsui/{example_data/trees → electron/app}/__init__.py +0 -0
- pgsui/electron/app/__main__.py +5 -0
- pgsui/electron/app/extra-resources/.gitkeep +1 -0
- pgsui/electron/app/icons/icons/1024x1024.png +0 -0
- pgsui/electron/app/icons/icons/128x128.png +0 -0
- pgsui/electron/app/icons/icons/16x16.png +0 -0
- pgsui/electron/app/icons/icons/24x24.png +0 -0
- pgsui/electron/app/icons/icons/256x256.png +0 -0
- pgsui/electron/app/icons/icons/32x32.png +0 -0
- pgsui/electron/app/icons/icons/48x48.png +0 -0
- pgsui/electron/app/icons/icons/512x512.png +0 -0
- pgsui/electron/app/icons/icons/64x64.png +0 -0
- pgsui/electron/app/icons/icons/icon.icns +0 -0
- pgsui/electron/app/icons/icons/icon.ico +0 -0
- pgsui/electron/app/main.js +227 -0
- pgsui/electron/app/package-lock.json +6894 -0
- pgsui/electron/app/package.json +51 -0
- pgsui/electron/app/preload.js +15 -0
- pgsui/electron/app/server.py +157 -0
- pgsui/electron/app/ui/logo.png +0 -0
- pgsui/electron/app/ui/renderer.js +131 -0
- pgsui/electron/app/ui/styles.css +59 -0
- pgsui/electron/app/ui/ui_shim.js +72 -0
- pgsui/electron/bootstrap.py +43 -0
- pgsui/electron/launch.py +57 -0
- pgsui/electron/package.json +14 -0
- pgsui/example_data/__init__.py +0 -0
- pgsui/example_data/phylip_files/__init__.py +0 -0
- pgsui/example_data/phylip_files/test.phy +0 -0
- pgsui/example_data/popmaps/__init__.py +0 -0
- pgsui/example_data/popmaps/{test.popmap → phylogen_nomx.popmap} +185 -99
- pgsui/example_data/structure_files/__init__.py +0 -0
- pgsui/example_data/structure_files/test.pops.2row.allsites.str +0 -0
- pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz +0 -0
- pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz.tbi +0 -0
- pgsui/impute/__init__.py +0 -0
- pgsui/impute/deterministic/imputers/allele_freq.py +725 -0
- pgsui/impute/deterministic/imputers/mode.py +844 -0
- pgsui/impute/deterministic/imputers/nmf.py +221 -0
- pgsui/impute/deterministic/imputers/phylo.py +973 -0
- pgsui/impute/deterministic/imputers/ref_allele.py +669 -0
- pgsui/impute/supervised/__init__.py +0 -0
- pgsui/impute/supervised/base.py +343 -0
- pgsui/impute/{unsupervised/models/in_development → supervised/imputers}/__init__.py +0 -0
- pgsui/impute/supervised/imputers/hist_gradient_boosting.py +317 -0
- pgsui/impute/supervised/imputers/random_forest.py +291 -0
- pgsui/impute/unsupervised/__init__.py +0 -0
- pgsui/impute/unsupervised/base.py +1121 -0
- pgsui/impute/unsupervised/callbacks.py +92 -262
- {simulation → pgsui/impute/unsupervised/imputers}/__init__.py +0 -0
- pgsui/impute/unsupervised/imputers/autoencoder.py +1361 -0
- pgsui/impute/unsupervised/imputers/nlpca.py +1666 -0
- pgsui/impute/unsupervised/imputers/ubp.py +1660 -0
- pgsui/impute/unsupervised/imputers/vae.py +1316 -0
- pgsui/impute/unsupervised/loss_functions.py +261 -0
- pgsui/impute/unsupervised/models/__init__.py +0 -0
- pgsui/impute/unsupervised/models/autoencoder_model.py +215 -567
- pgsui/impute/unsupervised/models/nlpca_model.py +155 -394
- pgsui/impute/unsupervised/models/ubp_model.py +180 -1106
- pgsui/impute/unsupervised/models/vae_model.py +269 -630
- pgsui/impute/unsupervised/nn_scorers.py +255 -0
- pgsui/utils/__init__.py +0 -0
- pgsui/utils/classification_viz.py +608 -0
- pgsui/utils/logging_utils.py +22 -0
- pgsui/utils/misc.py +35 -480
- pgsui/utils/plotting.py +996 -829
- pgsui/utils/pretty_metrics.py +290 -0
- pgsui/utils/scorers.py +213 -666
- pg_sui-0.2.3.dist-info/METADATA +0 -322
- pg_sui-0.2.3.dist-info/RECORD +0 -75
- pg_sui-0.2.3.dist-info/top_level.txt +0 -3
- pgsui/example_data/phylip_files/test_n10.phy +0 -118
- pgsui/example_data/phylip_files/test_n100.phy +0 -118
- pgsui/example_data/phylip_files/test_n2.phy +0 -118
- pgsui/example_data/phylip_files/test_n500.phy +0 -118
- pgsui/example_data/structure_files/test.nopops.1row.10sites.str +0 -117
- pgsui/example_data/structure_files/test.nopops.2row.100sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.10sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.30sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.allsites.str +0 -234
- pgsui/example_data/structure_files/test.pops.1row.10sites.str +0 -117
- pgsui/example_data/structure_files/test.pops.2row.10sites.str +0 -234
- pgsui/example_data/trees/test.iqtree +0 -376
- pgsui/example_data/trees/test.qmat +0 -5
- pgsui/example_data/trees/test.rate +0 -2033
- pgsui/example_data/trees/test.tre +0 -1
- pgsui/example_data/trees/test_n10.rate +0 -19
- pgsui/example_data/trees/test_n100.rate +0 -109
- pgsui/example_data/trees/test_n500.rate +0 -509
- pgsui/example_data/trees/test_siterates.txt +0 -2024
- pgsui/example_data/trees/test_siterates_n10.txt +0 -10
- pgsui/example_data/trees/test_siterates_n100.txt +0 -100
- pgsui/example_data/trees/test_siterates_n500.txt +0 -500
- pgsui/example_data/vcf_files/test.vcf +0 -244
- pgsui/example_data/vcf_files/test.vcf.gz +0 -0
- pgsui/example_data/vcf_files/test.vcf.gz.tbi +0 -0
- pgsui/impute/estimators.py +0 -1268
- pgsui/impute/impute.py +0 -1463
- pgsui/impute/simple_imputers.py +0 -1431
- pgsui/impute/supervised/iterative_imputer_fixedparams.py +0 -782
- pgsui/impute/supervised/iterative_imputer_gridsearch.py +0 -1024
- pgsui/impute/unsupervised/keras_classifiers.py +0 -697
- pgsui/impute/unsupervised/models/in_development/cnn_model.py +0 -486
- pgsui/impute/unsupervised/neural_network_imputers.py +0 -1440
- pgsui/impute/unsupervised/neural_network_methods.py +0 -1395
- pgsui/pg_sui.py +0 -261
- pgsui/utils/sequence_tools.py +0 -407
- simulation/sim_benchmarks.py +0 -333
- simulation/sim_treeparams.py +0 -475
- test/__init__.py +0 -0
- test/pg_sui_simtest.py +0 -215
- test/pg_sui_testing.py +0 -523
- test/test.py +0 -151
- test/test_pgsui.py +0 -374
- test/test_tkc.py +0 -185
pgsui/impute/unsupervised/neural_network_methods.py (1,395 lines removed):

```diff
@@ -1,1395 +0,0 @@
-import logging
-import math
-import os
-import sys
-import random
-import warnings
-
-import numpy as np
-import pandas as pd
-import seaborn as sns
-import matplotlib.pyplot as plt
-
-from sklearn.utils.class_weight import (
-    compute_class_weight,
-)
-
-from sklearn.metrics import f1_score
-
-os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
-logging.getLogger("tensorflow").disabled = True
-warnings.filterwarnings("ignore", category=UserWarning)
-
-# noinspection PyPackageRequirements
-import tensorflow as tf
-
-# Disable can't find cuda .dll errors. Also turns of GPU support.
-tf.config.set_visible_devices([], "GPU")
-
-from tensorflow.python.util import deprecation
-
-# Disable warnings and info logs.
-tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
-tf.get_logger().setLevel(logging.ERROR)
-
-
-# Monkey patching deprecation utils to supress warnings.
-# noinspection PyUnusedLocal
-def deprecated(
-    date, instructions, warn_once=True
-):  # pylint: disable=unused-argument
-    def deprecated_wrapper(func):
-        return func
-
-    return deprecated_wrapper
-
-
-deprecation.deprecated = deprecated
-
-
-class DisabledCV:
-    def __init__(self):
-        self.n_splits = 1
-
-    def split(self, X, y, groups=None):
-        yield (np.arange(len(X)), np.arange(len(y)))
-
-    def get_n_splits(self, X, y, groups=None):
-        return self.n_splits
-
-
-# For VAE.
-# Necessary to initialize outside of class for use with tf.function decorator.
-cce = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
-cca = tf.keras.metrics.CategoricalAccuracy()
-ba = tf.keras.metrics.BinaryAccuracy()
-bce = tf.keras.losses.BinaryCrossentropy()
-
-
-class NeuralNetworkMethods:
-    """Methods common to all neural network imputer classes and loss functions"""
-
-    def __init__(self):
-        self.data = None
-
-    @staticmethod
-    def encode_multilab(X, num_classes=4):
-        """Encode 0-9 integer data in one-hot format.
-        Args:
-            X (numpy.ndarray): Input array with 012-encoded data and -9 as the missing data value.
-
-            num_classes (int, optional): Number of multi-label classes to use. Mostly for compatibility with encode_multiclass. Defaults to 4.
-        Returns:
-            pandas.DataFrame: One-hot encoded data, ignoring missing values (np.nan). multi-label categories will be encoded as 0.5. Otherwise, it will be 1.0.
-        """
-        # return np.where(X >= 0.5, 1.0, 0.0)
-        try:
-            Xt = np.zeros(shape=(X.shape[0], X.shape[1], 4))
-        except IndexError:
-            Xt = np.zeros(shape=(X.shape[0],))
-
-        mappings = {
-            0: [1.0, 0.0, 0.0, 0.0],
-            1: [0.0, 1.0, 0.0, 0.0],
-            2: [0.0, 0.0, 1.0, 0.0],
-            3: [0.0, 0.0, 0.0, 1.0],
-            4: [1.0, 1.0, 0.0, 0.0],
-            5: [1.0, 0.0, 1.0, 0.0],
-            6: [1.0, 0.0, 0.0, 1.0],
-            7: [0.0, 1.0, 1.0, 0.0],
-            8: [0.0, 1.0, 0.0, 1.0],
-            9: [0.0, 0.0, 1.0, 1.0],
-            -9: [np.nan, np.nan, np.nan, np.nan],
-        }
-        try:
-            for row in np.arange(X.shape[0]):
-                Xt[row] = [mappings[enc] for enc in X[row]]
-        except TypeError:
-            Xt = [mappings[enc] for enc in X]
-
-        if not isinstance(Xt, np.ndarray):
-            Xt = np.array(Xt)
-        return Xt
-
-    @staticmethod
-    def encode_multiclass(X, num_classes=10, missing_value=-9):
-        """Encode 0-9 integer data in multi-class one-hot format.
-
-        Missing values get encoded as ``[np.nan] * num_classes``
-        Args:
-            X (numpy.ndarray): Input array with 012-encoded data and ``missing_value`` as the missing data value.
-
-            num_classes (int, optional): Number of classes to use. Defaults to 10.
-
-            missing_value (int, optional): Missing data value to replace with ``[np.nan] * num_classes``\. Defaults to -9.
-        Returns:
-            pandas.DataFrame: Multi-class one-hot encoded data, ignoring missing values (np.nan).
-        """
-        int_cats, ohe_arr = np.arange(num_classes), np.eye(num_classes)
-        mappings = dict(zip(int_cats, ohe_arr))
-        mappings[missing_value] = np.array([np.nan] * num_classes)
-
-        try:
-            Xt = np.zeros(shape=(X.shape[0], X.shape[1], num_classes))
-        except IndexError:
-            Xt = np.zeros(shape=(X.shape[0],))
-
-        try:
-            for row in np.arange(X.shape[0]):
-                Xt[row] = [mappings[enc] for enc in X[row]]
-        except TypeError:
-            Xt = [mappings[enc] for enc in X]
-
-        if not isinstance(Xt, np.ndarray):
-            Xt = np.array(Xt)
-
-        return Xt
-
-    @classmethod
-    def decode_masked(
-        cls,
-        y_true_bin,
-        y_pred_proba,
-        is_multiclass=True,
-        return_proba=False,
-        return_multilab=False,
-        return_int=True,
-        predict_still_missing=True,
-        threshold_increment=0.01,
-        multilabel_averaging="macro",
-        missing_mask=None,
-    ):
-        """Evaluate model predictions by decoding from one-hot encoding to integer-encoded format.
-
-        Gets the index of the highest predicted value to obtain the integer encodings or integer encodings.
-
-        Calucalates highest predicted value for each row vector and each class, setting the most likely class to 1.0.
-
-        Args:
-            y_true_bin (numpy.ndarray): True multilabel target values of shape (n_samples * n_features, num_classes). Array should be flattened and masked.
-
-            y_pred_proba (numpy.ndarray): Multilabel model predictions of shape (n_samples * n_features, num_classes). Array should be flattened and masked.
-
-            is_multiclass (bool, optional): True if using multiclass data with softmax activation. False if using multilabel data with sigmoid activation. Defaults to True.
-
-            threshold (float, optional): If using multilabel, then set the threshold for determining 1 or 0 predictions. Defaults to 0.5.
-
-            return_proba (bool, optional): If True, returns probabilities for unresolved values where all multilabel probabilities were below the threshold. Defaults to False.
-
-            return_multilab (bool, optional): If True, returns the multilabel encodings instead of integer encodings (if doing multilabel classification). Defaults to False.
-
-            return_int (bool, optional): If True, returns the integer encodings instead of onehot encodings (if doing multiclass classification). Defaults to False.
-
-            predict_still_missing (bool, optional): If True, values that are still missing after decoding are decoded using the maximum probability (i.e., with np.argmax). If False, then it is possible that some missing data might still remain after decoding if none of the multilabel probabilities are above the threshold. Defaults to True.
-
-            threshold_increment (float, optional): How much to increment threshold when searching for optimal threshold. Should be > 0 and < 1. Defaults to 0.05.
-
-            multilabel_averaging (str): Method to use for averaging F1 score among multilabel classes. Supported options are: {"macro", "micro", "weighted", "samples"}. Defaults to "macro".
-
-            missing_mask (numpy.ndarray, optional): Missing mask with missing values encoded as 1's and nonmissing as 0. Only used if not None. Defaults to None.
-
-        Returns:
-            numpy.ndarray: Imputed integer-encoded values.
-
-            numpy.ndarray (optional): Probabilities for each call, with those above the threshold set to 1.0 and those below the threshold between 0 and 1.
-        """
-
-        if return_int and return_multilab:
-            raise ValueError(
-                "return_int and return_multilab cannot both be True."
-            )
-
-        y_unresolved_certainty = None
-        if is_multiclass or y_true_bin.shape[-1] == 10:
-            # Softmax predictions.
-            # If reduce_dim is True, will return integer encodings.
-            # Otherwise, returns one-hot encodings.
-            y_pred = cls.decode_multiclass(y_pred_proba, reduce_dim=return_int)
-        else:
-            # Onehot encode if not already one-hot encoded.
-            if y_true_bin.shape[-1] != 4:
-                if y_true_bin.shape[-1] != 10:
-                    y_true_bin = cls.encode_multilab(y_true_bin)
-                else:
-                    y_true_bin = cls.encode_multiclass(y_true_bin)
-
-            pred_multilab = cls.zero_extra_categories(y_pred_proba)
-
-            # Binary multilabel predictions.
-            threshold = cls.get_optimal_threshold(
-                y_true_bin,
-                pred_multilab,
-                increment=threshold_increment,
-                average_method=multilabel_averaging,
-            )
-
-            # Call 0s and 1s based on threshold.
-            pred_multilab = np.where(pred_multilab >= threshold, 1.0, 0.0)
-
-            pred_multilab_decoded = cls.decode_binary_multilab(pred_multilab)
-
-            if predict_still_missing:
-                # Check if there are still any missing values.
-                still_missing = np.all(pred_multilab == 0, axis=-1)
-
-                if return_multilab:
-                    still_missing_bin = np.all(
-                        pred_multilab == 0, axis=-1, keepdims=True
-                    )
-
-                # Do multiclass prediction with argmax then get the probabilities
-                # if any unresolved values.
-                if np.any(still_missing):
-                    # Get the argmax with the highest probability if
-                    # all classes are below threshold.
-                    y_multi = cls.decode_multiclass(y_pred_proba)
-                    y_multi_bin = cls.decode_multiclass(
-                        y_pred_proba, reduce_dim=False
-                    )
-
-                    try:
-                        y_pred = np.where(
-                            still_missing, y_multi, pred_multilab_decoded
-                        )
-                    except ValueError:
-                        y_pred = np.where(
-                            still_missing,
-                            y_multi,
-                            np.reshape(
-                                pred_multilab_decoded, still_missing.shape
-                            ),
-                        )
-
-                    if return_multilab:
-                        y_pred_bin = np.where(
-                            still_missing_bin, y_multi_bin, pred_multilab
-                        )
-
-                        y_pred = y_pred_bin
-
-                    if return_proba:
-                        # Get max value as base call.
-                        y_pred_proba_max = y_pred_proba.max(axis=-1)
-
-                        # Get probability of max value that was < threshold.
-                        y_unresolved_certainty = np.where(
-                            still_missing, y_pred_proba_max, 1.0
-                        )
-
-                else:
-                    if return_multilab:
-                        y_pred = pred_multilab
-                    else:
-                        y_pred = pred_multilab_decoded
-            else:
-                if return_multilab:
-                    y_pred = pred_multilab
-                else:
-                    y_pred = pred_multilab_decoded
-
-        y_pred = y_pred.astype(int)
-
-        if return_proba:
-            return y_pred, y_unresolved_certainty
-        else:
-            return y_pred
-
-    @classmethod
-    def get_optimal_threshold(
-        cls,
-        y_true_bin,
-        y_pred_proba,
-        increment=0.01,
-        average_method="macro",
-    ):
-        """Increment to find the optimal decoding threshold.
-
-        Args:
-            y_true_bin (numpy.ndarray): True multilabel values of shape (n_samples * n_features, num_classes).
-
-            y_pred_proba (numpy.ndarray): Multilabel prediction probabilities of shape (n_features * n_samples, num_classes).
-
-            increment (float, optional): How much to increment when searching for optimal threshold. Should be > 0 and < 1. Defaults to 0.1.
-
-            average_method (str, optional): Method to use for averaging the F1 score across multilabel classes. Possible options include {"macro", "micro", "weighted", "samples"}. Defaults to "macro".
-
-        Returns:
-            float: Optimal decoding threshold.
-        """
-        y_true = y_true_bin.copy()
-        y_pred = y_pred_proba.copy()
-
-        thresholds = np.arange(increment, 1, increment)
-
-        nonmissing_mask = np.where(y_true_bin != -1)
-        num_classes = y_true_bin.shape[-1]
-
-        # This is only supposed to get applied during the final transform,
-        # when the original missing data is replaced with predictions.
-        # If this isn't done here, it ends up having -1 values in it,
-        # which causes the f1_score function to throw an error.
-
-        try:
-            y_true = y_true[nonmissing_mask]
-            y_pred = y_pred[nonmissing_mask]
-        except IndexError:
-            pass
-
-        # Call 0s and 1s based on threshold.
-
-        scores = list()
-        for t in thresholds:
-            pred_multilab = np.where(y_pred >= t, 1.0, 0.0)
-            pred_multilab_decoded = cls.decode_binary_multilab(pred_multilab)
-            true_multilab_decoded = cls.decode_binary_multilab(y_true)
-
-            # Had to cast them as integers to get rid of a type error during the
-            # final transform() function.
-
-            scores.append(
-                f1_score(
-                    true_multilab_decoded,
-                    pred_multilab_decoded,
-                    average="weighted",
-                )
-            )
-
-        return thresholds[np.argmax(scores)]
-
-    @classmethod
-    def flatten_bin_encodings(cls, y):
-        """Flatten first two dimensions of binary encodings to (num_samples * num_features, num_classes).
-
-        Args:
-            y (numpy.ndarray): Numpy array with 3-dimensional shape of (n_samples, num_features, num_classes).
-
-        Returns:
-            numpy.ndarray: Array of shape (n_samples * num_features, num_classes).
-
-        Raises:
-            ValueError: Input shape must be 3-dimensional.
-        """
-        if len(y.shape) != 3:
-            raise ValueError("Input array must be 3-dimensional")
-
-        return y.reshape(y.shape[0] * y.shape[1], y.shape[2])
-
-    @staticmethod
-    def zero_extra_categories(y_pred_proba, threshold=0.5):
-        """Check if any prediction probabilities have >2 values above threshold.
-
-        If >2, then it sets the two with the lowest probabilities to 0.0.
-
-        Args:
-            y_pred_proba (numpy.ndarray): Prediction probabilities (sigmoid activation) of shape (n_samples, n_features, num_classes) or (n_samples * n_features, num_classes).
-
-            pred_multilab (numpy.ndarray): Multi-label decodings. Inner arrays should have only 0s and 1s. Should be of shape (n_samples, n_features, num_classes) or (n_samples * n_features, num_classes).
-
-            threshold (float, optional): Threshold to use to set decoded multilabel values to 0s (< threshold) or 1s (>= threshold). Defaults to 0.5.
-        """
-        N = 2
-        y_pred_proba[y_pred_proba.argsort().argsort() < N] = 0.0
-        return y_pred_proba
-        # idx = np.argpartition(y_pred_proba.ravel(), k)
-        # indices = tuple(
-        #     np.array(np.unravel_index(idx, y_pred_proba.shape))[
-        #         :, range(min(k, 0), max(k, 0))
-        #     ]
-        # )
-
-        # y_pred_proba[indices] = 0.0
-        # return y_pred_proba
-
-        # return np.where(y_pred_proba >= threshold, 1.0, 0.0)
-
-    @classmethod
-    def decode_multiclass(cls, y_pred_proba, reduce_dim=True):
-        """Decode probabilities to either one-hot or integer encodings.
-
-        Args:
-            y_pred_proba (numpy.ndarray): Probabilities to decode.
-
-            reduce_dim (bool, optional): If True, returns integer encodings of one fewer dimension than ``y_pred_proba``\. Otherwise, returns one-hot encodings where the class with the maximum probability is a 1 and every other class is 0. Defaults to True.
-
-        Returns:
-            numpy.ndarray: Integer or one-hot-encoded predictions.
-        """
-        yt = np.apply_along_axis(cls.mle, axis=-1, arr=y_pred_proba)
-        if reduce_dim:
-            return np.argmax(yt, axis=-1)
-        else:
-            return yt
-
-    @classmethod
-    def decode_binary_multilab(cls, y_pred):
-        """Decode multi-label sigmoid probabilities to integer encodings.
-
-        The predictions should have already undergone sigmoid activation and should be probabilities.
-
-        If sigmoid activation output is >0.5, gets encoded as 1.0; else 0.0. If more than one category is > 0.5, then it is a heterozygote.
-
-        Args:
-            y_pred (numpy.ndarray): Model predictions of shape (n_samples * n_features, num_classes) or (n_samples, n_features, num_classes). A threshold should already have been applied to set each class to 0 or 1.
-
-        Returns:
-            numpy.ndarray: Integer-decoded multilabel predictions of shape (n_samples * n_features) or (n_samples, n_features).
-        """
-        y_pred_idx = y_pred.astype(int)
-        y_pred_idx = y_pred_idx.astype(str)
-
-        if len(y_pred_idx.shape) < 3:
-            y_pred_idx = np.array(
-                [
-                    "".join(np.atleast_1d(row == "1").nonzero()[0].astype(str))
-                    for row in y_pred_idx
-                ]
-            )
-        else:
-            y_pred_idx = np.array(
-                [
-                    "".join(np.atleast_1d(col == "1").nonzero()[0].astype(str))
-                    for row in y_pred_idx
-                    for col in row
-                ]
-            )
-
-        try:
-            Xt = np.zeros(shape=(y_pred.shape[0], y_pred.shape[1], 4))
-        except IndexError:
-            Xt = np.zeros(shape=(y_pred.shape[0],))
-
-        mappings = {
-            "0": 0,
-            "1": 1,
-            "2": 2,
-            "3": 3,
-            "01": 4,
-            "02": 5,
-            "03": 6,
-            "12": 7,
-            "13": 8,
-            "23": 9,
-            "-9": -9,
-            "": -9,
-        }
-
-        Xt = [mappings[enc] for enc in y_pred_idx]
-
-        if not isinstance(Xt, np.ndarray):
-            Xt = np.array(Xt)
-        return Xt
-
-    @staticmethod
-    def encode_categorical(X):
-        """Encode -9 encoded missing values as np.nan.
-
-        Args:
-            X (numpy.ndarray): 012-encoded genotypes with -9 as missing values.
-
-        Returns:
-            pandas.DataFrame: DataFrame with missing values encoded as np.nan.
-        """
-        np.nan_to_num(X, copy=False, nan=-9.0)
-        X = X.astype(str)
-        X[(X == "-9.0") | (X == "-9")] = "none"
-
-        df = pd.DataFrame(X)
-        df_incomplete = df.copy()
-
-        # Replace 'none' with np.nan
-        for row in df.index:
-            for col in df.columns:
-                if df_incomplete.iat[row, col] == "none":
-                    df_incomplete.iat[row, col] = np.nan
-
-        return df_incomplete
-
-    @staticmethod
-    def mle(row):
-        """Get the Maximum Likelihood Estimation for the best prediction. Basically, it sets the index of the maxiumum value in a vector (row) to 1.0, since it is one-hot encoded.
-
-        Args:
-            row (numpy.ndarray(float)): Row vector with predicted values as floating points.
-
-        Returns:
-            numpy.ndarray(float): Row vector with the highest prediction set to 1.0 and the others set to 0.0.
-        """
-        res = np.zeros(row.shape[0])
-        res[np.argmax(row)] = 1
-        return res
-
-    @classmethod
-    def predict(cls, X, complete_encoded):
-        """Evaluate VAE predictions by calculating the highest predicted value.
-
-        Calucalates highest predicted value for each row vector and each class, setting the most likely class to 1.0.
-
-        Args:
-            X (numpy.ndarray): Input 012-encoded data.
-
-            complete_encoded (numpy.ndarray): Output one-hot encoded data with the maximum predicted values for each class set to 1.0.
-
-        Returns:
-            numpy.ndarray: Imputed one-hot encoded values.
-
-            pandas.DataFrame: One-hot encoded pandas DataFrame with no missing values.
-        """
-
-        df = cls.encode_categorical(X)
-
-        # Had to add dropna() to count unique classes while ignoring np.nan
-        col_classes = [len(df[c].dropna().unique()) for c in df.columns]
-        df_dummies = pd.get_dummies(df)
-        mle_complete = None
-        for i, cnt in enumerate(col_classes):
-            start_idx = int(sum(col_classes[0:i]))
-            col_completed = complete_encoded[:, start_idx : start_idx + cnt]
-            mle_completed = np.apply_along_axis(
-                cls.mle, axis=1, arr=col_completed
-            )
-
-            if mle_complete is None:
-                mle_complete = mle_completed
-
-            else:
-                mle_complete = np.hstack([mle_complete, mle_completed])
-        return mle_complete, df_dummies
-
-    def validate_hidden_layers(self, hidden_layer_sizes, num_hidden_layers):
-        """Validate hidden_layer_sizes and verify that it is in the correct format.
-
-        Args:
-            hidden_layer_sizes (str, int, List[str], or List[int]): Output units for all the hidden layers.
-
-            num_hidden_layers (int): Number of hidden layers to use.
-
-        Returns:
-            List[int] or List[str]: List of hidden layer sizes.
-        """
-        if isinstance(hidden_layer_sizes, (str, int)):
-            hidden_layer_sizes = [hidden_layer_sizes] * num_hidden_layers
-
-        # If not all integers
-        elif isinstance(hidden_layer_sizes, list):
-            if not all(
-                [isinstance(x, (str, int)) for x in hidden_layer_sizes]
-            ):
-                ls = list(set([type(item) for item in hidden_layer_sizes]))
-                raise TypeError(
-                    f"Variable hidden_layer_sizes must either be None, "
-                    f"an integer or string, or a list of integers or "
-                    f"strings, but got the following type(s): {ls}"
-                )
-
-        else:
-            raise TypeError(
-                f"Variable hidden_layer_sizes must either be, "
-                f"an integer, a string, or a list of integers or strings, "
-                f"but got the following type: {type(hidden_layer_sizes)}"
-            )
-
-        assert (
-            num_hidden_layers == len(hidden_layer_sizes)
-            and num_hidden_layers > 0
-        ), "num_hidden_layers must be the length of hidden_layer_sizes."
-
-        return hidden_layer_sizes
-
-    def get_hidden_layer_sizes(self, n_dims, n_components, hl_func, vae=False):
-        """Get dimensions of hidden layers.
-
-        Args:
-            n_dims (int): The number of feature dimensions (columns) (d).
-
-            n_components (int): The number of reduced dimensions (t).
-
-            hl_func (str): The function to use to calculate the hidden layer sizes. Possible options: "midpoint", "sqrt", "log2".
-
-            vae (bool, optional): Whether using the VAE algorithm. If False, then the returned list gets reversed for NLPCA and UBP.
-
-        Returns:
-            [int, int, int, ...]: [Number of dimensions in hidden layers].
-
-        Raises:
-            ValueError: Too many hidden layers specified. Repeated reduction of layer sizes dips below n_components.
-        """
-        layers = list()
-        if not isinstance(hl_func, list):
-            raise TypeError(
-                f"hl_func must be of type list, but got {type(hl_func)}."
-            )
-
-        units = n_dims
-        for func in hl_func:
-            if func == "midpoint":
-                units = round((units + n_components) / 2)
-            elif func == "sqrt":
-                units = round(math.sqrt(units))
-            elif func == "log2":
-                units = round(math.log(units, 2))
-            elif isinstance(func, int):
-                units = func
-            else:
-                raise ValueError(
-                    f"hidden_layer_sizes must be either integers or any of "
-                    f"the following strings: 'midpoint', "
-                    f"'sqrt', or 'log2', but got {func} of type {type(func)}"
-                )
-
-            if units <= n_components:
-                print(
-                    f"WARNING: hidden_layer_size reduction became less than n_components. Using only {len(layers)} hidden layers."
-                )
-                break
-
-            assert units > 0 and units < n_dims, (
-                f"The hidden layer sizes must be > 0 and < the number of "
-                f"features (i.e., columns) in the dataset, but size was {units}"
-            )
-
-            layers.append(units)
-
-        assert (
-            layers
-        ), "There was an error setting hidden layer sizes. Size list is empty. It is possible that the first 'sqrt' reduction caused units to be <= n_components."
-
-        if not vae:
-            layers.reverse()
-
-        return layers
-
-    def validate_model_inputs(self, y, missing_mask, output_shape):
-        """Validate inputs to Keras subclass model.
-
-        Args:
-            V (numpy.ndarray): Input to refine. Shape: (n_samples, n_components).
-            y (numpy.ndarray): Target (but actual input data). Shape: (n_samples, n_features).
-
-            y_test (numpy.ndarray): Target test dataset. Should have been imputed with simple imputer and missing data simulated using SimGenotypeData(). Shape: (n_samples, n_features).
-
-            missing_mask (numpy.ndarray): Missing data mask for y.
-
-            missing_mask_test (numpy.ndarray): Missing data mask for y_test.
-
-            output_shape (int): Output shape for hidden layers.
-
-        Raises:
-            TypeError: V, y, missing_mask, output_shape must not be NoneType.
-        """
-        if y is None:
-            raise TypeError("y must not be NoneType.")
-
-        if missing_mask is None:
-            raise TypeError("missing_mask must not be NoneType.")
-
-        if output_shape is None:
-            raise TypeError("output_shape must not be NoneType.")
-
-    def prepare_training_batches(
-        self,
-        V,
-        y,
-        batch_size,
-        batch_idx,
-        trainable,
-        n_components,
-        sample_weight,
-        missing_mask,
-        ubp=True,
-    ):
-        """Prepare training batches in the custom training loop.
-
-        Args:
-            V (numpy.ndarray): Input to batch subset and refine, of shape (n_samples, n_components) (if doing UBP/NLPCA) or (n_samples, n_features) (if doing VAE).
-
-            y (numpy.ndarray): Target to use to refine input V. shape (n_samples, n_features).
-
-            batch_size (int): Batch size to subset.
-
-            batch_idx (int): Current batch index.
-
-            trainable (bool): Whether tensor v should be trainable.
-
-            n_components (int): Number of principal components used in V.
-
-            sample_weight (List[float] or None): List of floats of shape (n_samples,) with sample weights. sample_weight argument must be passed to fit().
-
-            missing_mask (numpy.ndarray): Boolean array with True for missing values and False for observed values.
-
-            ubp (bool, optional): Whether model is UBP/NLPCA (if True) or VAE (if False). Defaults to True.
-
-        Returns:
-            tf.Variable: Input tensor v with current batch assigned.
-            numpy.ndarray: Current batch of target data (actual input) used to refine v.
-            List[float]: Sample weights
-            int: Batch starting index.
-            int: Batch ending index.
-            numpy.ndarray: Batch of y_train target data of shape (batch_size, n_features, n_classes). Only returned for VAE.
-        """
-        # on_train_batch_begin() method.
-        n_samples = y.shape[0]
-
-        # Get current batch size and range.
-        # self._batch_idx is set in the UBPCallbacks() callback
-        batch_start = batch_idx * batch_size
-        batch_end = (batch_idx + 1) * batch_size
-        if batch_end > n_samples:
-            batch_end = n_samples - 1
-            batch_size = batch_end - batch_start
-
-        if ubp:
-            # override batches. This model refines the input to fit the output, so
-            # v_batch and y_true have to be overridden.
-            y_true = y[batch_start:batch_end, :]
-
-            v_batch = V[batch_start:batch_end, :]
-            missing_mask_batch = missing_mask[batch_start:batch_end, :]
-
-            if sample_weight is not None:
-                sample_weight_batch = sample_weight[batch_start:batch_end, :]
-            else:
-                sample_weight_batch = None
-
-            v = tf.Variable(
-                tf.zeros([batch_size, n_components]),
-                trainable=trainable,
-                dtype=tf.float32,
-            )
-
-            # Assign current batch to tf.Variable v.
-            v.assign(v_batch)
-
-            return (
-                v,
-                y_true,
-                sample_weight_batch,
-                missing_mask_batch,
-                batch_start,
-                batch_end,
-            )
-
-        else:
-            # Using VAE.
-            y_true = y[batch_start:batch_end, :]
-            v = V[batch_start:batch_end, :]
-            missing_mask_batch = missing_mask[batch_start:batch_end, :]
-
-            if sample_weight is not None:
-                sample_weight_batch = sample_weight[batch_start:batch_end, :]
-            else:
-                sample_weight_batch = None
-
-            return (
-                y_true,
-                sample_weight_batch,
-                missing_mask_batch,
-            )
-
-    def validate_batch_size(self, X, batch_size):
-        """Validate the batch size, and adjust as necessary.
-
-        If the specified batch_size is greater than the number of samples in the input data, it will divide batch_size by 2 until it is less than n_samples.
-
-        Args:
-            X (numpy.ndarray): Input data of shape (n_samples, n_features).
-            batch_size (int): Batch size to use.
-
-        Returns:
-            int: Batch size (adjusted if necessary).
-        """
-        if batch_size > X.shape[0]:
-            while batch_size > X.shape[0]:
-                print(
-                    "Batch size is larger than the number of samples. "
-                    "Dividing batch_size by 2."
-                )
-                batch_size //= 2
-        return batch_size
-
-    def set_compile_params(
-        self, optimizer, sample_weights=None, vae=False, act_func="softmax"
-    ):
-        """Set compile parameters to use.
-
-        Args:
-            optimizer (str): Keras optimizer to use. Possible options include: {"adam", "sgd", "adagrad", "adadelta", "adamax", "ftrl", "nadam", "rmsprop"}.
-
-            sample_weights (numpy.ndarray, optional): Sample weight matrix of shape (n_samples, n_features). Defaults to None.
-
-            vae (bool, optional): Whether using the VAE model. Defaults to False.
-
-            act_func (str, optional): Activation function to use. Should be "softmax" if doing multiclass classification, otherwise "sigmoid".
-
-        Returns:
-            Dict[str, callable] or Dict[str, Any]: Callables if search_mode is True, otherwise instantiated objects.
-
-        Raises:
-            ValueError: Unsupported optimizer specified.
-            ValueError: Invalid act_func argument supplied.
-        """
-        if optimizer.lower() == "adam":
-            opt = tf.keras.optimizers.legacy.Adam
-        elif optimizer.lower() == "sgd":
-            opt = tf.keras.optimizers.legacy.SGD
-        elif optimizer.lower() == "adagrad":
-            opt = tf.keras.optimizers.legacy.Adagrad
-        elif optimizer.lower() == "adadelta":
-            opt = tf.keras.optimizers.legacy.Adadelta
-        elif optimizer.lower() == "adamax":
-            opt = tf.keras.optimizers.legacy.Adamax
-        elif optimizer.lower() == "ftrl":
-            opt = tf.keras.optimizers.legacy.Ftrl
-        elif optimizer.lower() == "nadam":
-            opt = tf.keras.optimizers.legacy.Nadam
-        elif optimizer.lower() == "rmsprop":
-            opt = tf.keras.optimizers.legacy.RMSProp
-
-        if vae:
-            if act_func == "softmax":
-                loss_func = (
-                    NeuralNetworkMethods.make_masked_categorical_crossentropy
-                )
-            elif act_func == "sigmoid":
-                loss_func = (
-                    NeuralNetworkMethods.make_masked_binary_crossentropy
-                )
-            else:
-                raise ValueError(
-                    f"act_func must be either 'softmax' or 'sigmoid', but got {act_func}"
-                )
-
-            loss = loss_func()
-            metrics = None
-
-        else:
-            # Doing grid search. Params are callables.
-            loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
-            metrics = [tf.keras.metrics.CategoricalAccuracy()]
-
-        return {
-            "optimizer": opt,
-            "loss": loss,
-            "metrics": metrics,
-            "run_eagerly": False,
-        }
-
-    @staticmethod
-    def init_weights(dim1, dim2, w_mean=0, w_stddev=0.01):
-        """Initialize random weights to use with the model.
-
-        Args:
-            dim1 (int): Size of first dimension.
-
-            dim2 (int): Size of second dimension.
-
-            w_mean (float, optional): Mean of normal distribution. Defaults to 0.
-
-            w_stddev (float, optional): Standard deviation of normal distribution. Defaults to 0.01.
-        """
-        # Get reduced-dimension dataset.
-        return np.random.normal(loc=w_mean, scale=w_stddev, size=(dim1, dim2))
-
-    def reset_seeds(self):
-        """Reset random seeds for initializing weights."""
-        seed1 = np.random.randint(1, 1e6)
-        seed2 = np.random.randint(1, 1e6)
-        seed3 = np.random.randint(1, 1e6)
-        np.random.seed(seed1)
-        random.seed(seed2)
-        if tf.__version__[0] == "2":
-            tf.random.set_seed(seed3)
-        else:
-            tf.set_random_seed(seed3)
-
-    @staticmethod
-    def masked_mse(self, X_true, X_pred, mask):
-        """Calculates mean squared error with missing values ignored.
-
-        Args:
-            X_true (numpy.ndarray): One-hot encoded input data.
-            X_pred (numpy.ndarray): Predicted values.
-            mask (numpy.ndarray): One-hot encoded missing data mask.
-
-        Returns:
-            float: Mean squared error calculation.
-        """
-        return np.square(np.subtract(X_true[mask], X_pred[mask])).mean()
-
-    @staticmethod
-    def make_masked_binary_accuracy(class_weight=None, is_vae=True):
-        """Make binary accuracy metric with missing mask.
-
-        Args:
-            class_weight (Dict[int, float], optional): Class weights to reduce class imbalance. Defaults to None.
-
-            is_vae (bool, optional): Whether model is VAE or not. Defaults to True.
-
-        Returns:
-            callable: Function that calculates categorical crossentropy loss.
-        """
-
-        @tf.function
-        def masked_binary_accuracy(y_true, y_pred, sample_weight=None):
-            """Custom neural network metric function with missing mask.
-
-            Ignores missing data in the calculation of the loss function.
-
-            Args:
-                y_true (tensorflow.Tensor): Input multilabel encoded 3D tensor.
-                y_pred (tensorflow.Tensor): Predicted values from model.
-                sample_weight (numpy.ndarray): 2D matrix of sample weights.
-
-            Returns:
-                float: Binary accuracy calculated with missing data masked.
-            """
-            return ba(
-                y_true,
-                y_pred,
-                sample_weight=sample_weight,
-            )
-
-        return masked_binary_accuracy
-
-    @staticmethod
-    def make_masked_binary_crossentropy(class_weight=None, is_vae=True):
-        """Make binary crossentropy loss function with missing mask.
-
-        Args:
-            class_weight (Dict[int, float], optional): Class weights to reduce class imbalance. Defaults to None.
-
-            is_vae (bool, optional): Whether model is VAE or not. Defaults to True.
-
-        Returns:
-            callable: Function that calculates categorical crossentropy loss.
-        """
-
-        @tf.function
-        def masked_binary_crossentropy(y_true, y_pred, sample_weight=None):
-            """Custom loss function for with missing mask applied.
-
-            Ignores missing data in the calculation of the loss function.
-
-            Args:
-                y_true (tensorflow.tensor): Input one-hot encoded 3D tensor.
-
-                y_pred (tensorflow.tensor): Predicted values, should have undergone sigmoid activation.
-
-                sample_weight (numpy.ndarray): 2D matrix of sample weights.
-
-            Returns:
-                float: Binary crossentropy loss value.
-            """
-            return bce(
-                y_true,
-                y_pred,
-                sample_weight=sample_weight,
-            )
-
-        return masked_binary_crossentropy
-
-    @staticmethod
-    def make_masked_categorical_accuracy():
-        """Make categorical crossentropy loss function with missing mask.
-
-        Args:
-            class_weight (Dict[int, float): Weights for each class.
-            is_vae (bool, optional): Whether using VAE model. Defaults to False.
-
-        Returns:
-            callable: Function that calculates categorical crossentropy loss.
-        """
-
-        @tf.function
-        def masked_categorical_accuracy(y_true, y_pred, sample_weight=None):
-            """Custom loss function for neural network model with missing mask.
-            Ignores missing data in the calculation of the loss function.
-            Args:
-                y_true (tensorflow.tensor): Input one-hot encoded 3D tensor.
-                y_pred (tensorflow.tensor): Predicted values.
-                sample_weight (numpy.ndarray): 2D matrix of sample weights.
-
-            Returns:
-                float: Mean squared error loss value with missing data masked.
-            """
-            # # Mask out missing values.
-            # y_true_masked = tf.boolean_mask(
-            #     y_true,
-            #     tf.reduce_any(tf.not_equal(y_true, -1), axis=-1),
-            # )
-
-            # y_pred_masked = tf.boolean_mask(
-            #     y_pred,
-            #     tf.reduce_any(tf.not_equal(y_true, -1), axis=-1),
-            # )
-
-            return cca(
-                y_true,
-                y_pred,
-                sample_weight=sample_weight,
-            )
-
-        return masked_categorical_accuracy
-
-    @staticmethod
-    def make_masked_categorical_crossentropy():
-        """Make categorical crossentropy loss function with missing mask.
-
-        Returns:
-            callable: Function that calculates categorical crossentropy loss.
-        """
-
-        @tf.function
-        def masked_categorical_crossentropy(
-            y_true, y_pred, sample_weight=None
-        ):
-            """Custom loss function for neural network model with missing mask.
-            Ignores missing data in the calculation of the loss function.
-
-            Args:
-                y_true (tensorflow.tensor): Input one-hot encoded 3D tensor.
-                y_pred (tensorflow.tensor): Predicted values.
-                sample_weight (numpy.ndarray): 2D matrix of sample weights.
-
-            Returns:
-                float: Mean squared error loss value with missing data masked.
-            """
-            # Mask out missing values.
-            # y_true_masked = tf.boolean_mask(
-            #     y_true,
-            #     tf.reduce_any(tf.not_equal(y_true, -1), axis=-1),
-            # )
-
-            # y_pred_masked = tf.boolean_mask(
-            #     y_pred,
-            #     tf.reduce_any(tf.not_equal(y_true, -1), axis=-1),
-            # )
-
-            return cce(
-                y_true,
-                y_pred,
-                sample_weight=sample_weight,
-            )
-
-        return masked_categorical_crossentropy
-
-    @staticmethod
-    def kl_divergence(z_mean, z_log_var, kl_weight=0.5):
-        kl_loss = -0.5 * (
-            1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var)
-        )
-        return tf.reduce_mean(tf.reduce_sum(kl_loss, axis=-1))
-
-        # Another way of doing it.
-        # TODO: Test both ways.
-        # z_sigma = tf.math.exp(0.5 * z_log_var)
-        # return tf.reduce_sum(
-        #     tf.math.square(z_mean) + tf.math.square(z_sigma) - z_log_var - 1.0,
-        #     axis=-1,
-        # )
-
-    def make_reconstruction_loss(self):
-        """Make loss function for use with a keras model.
-
-        Returns:
-            callable: Function that calculates loss.
-        """
-
-        def reconstruction_loss(input_and_mask, y_pred):
-            """Custom loss function for neural network model with missing mask.
-
-            Ignores missing data in the calculation of the loss function.
-
-            Args:
-                input_and_mask (numpy.ndarray): Input one-hot encoded array with missing values also one-hot encoded and h-stacked.
-
-                y_pred (numpy.ndarray): Predicted values.
-
-            Returns:
-                float: Mean squared error loss value with missing data masked.
-            """
-            n_features = y_pred.numpy().shape[1]
-
-            true_indices = range(n_features)
-            missing_indices = range(n_features, n_features * 2)
-
-            # Split features and missing mask.
-            y_true = tf.gather(input_and_mask, true_indices, axis=1)
-            missing_mask = tf.gather(input_and_mask, missing_indices, axis=1)
-
-            observed_mask = tf.subtract(1.0, missing_mask)
-            y_true_observed = tf.multiply(y_true, observed_mask)
-            pred_observed = tf.multiply(y_pred, observed_mask)
-
-            # loss_fn = tf.keras.losses.CategoricalCrossentropy()
-            # return loss_fn(y_true_observed, pred_observed)
-
-            return tf.keras.metrics.mean_squared_error(
-                y_true=y_true_observed, y_pred=pred_observed
-            )
-
-        return reconstruction_loss
-
-    @staticmethod
-    def normalize_data(data):
-        """Normalize data between 0 and 1."""
-        return (data - np.min(data)) / (np.max(data) - np.min(data))
-
-    @staticmethod
-    def normalize_sum_to_1(d, target=1.0):
-        factor = target / sum(d.values())
-        return {k: v * factor for k, v in d.items()}
-
-    @staticmethod
-    def smooth_weights(d, mu=0.15):
-        total = np.sum(list(d.values()))
-        keys = d.keys()
-        class_weight = dict()
-
-        for k in keys:
-            score = math.log(mu * total / float(d[k]))
-            class_weight[k] = score if score > 1.0 else 1.0
-
-        return class_weight
-
-    @classmethod
-    def get_class_weights(
-        cls,
-        y_true,
-        original_missing_mask,
-        user_weights=None,
-        return_1d=False,
-        method="auto",
-    ):
-        """Get class weights for each column in a 2D matrix.
-
-        Args:
-            y_true (numpy.ndarray): True target values.
-
-            original_missing_mask (numpy.ndarray): Boolean mask with missing values set to True and non-missing to False.
-
-            user_weights (Dict[int, float], optional): Class weights if user-provided.
-
-            return_1d (bool, optional): If True, returns a dictionary of class weights, with integer encodings as keys and the corresponding class weights as keys. If False, returns 2D sample_weight matrix. Defaults to False.
-
-        Returns:
-            numpy.ndarray or Dict[int, float]: Sample weights per column of shape (n_samples, n_features) if return_1d is False. Dictionary of class weights if True.
-        """
-        # Get list of class_weights (per-column).
-        class_weights = list()
-        sample_weight = np.zeros(y_true.shape)
-        if user_weights is not None:
-            # Set user-defined sample_weights
-            for k in user_weights.keys():
-                sample_weight[y_true == k] = user_weights[k]
-
-        elif return_1d:
-            y_true_1d = y_true.flatten()
-
-            if method == "auto":
-                sample_weight = dict(
-                    zip(
-                        np.unique(y_true_1d),
-                        compute_class_weight(
-                            "balanced",
-                            classes=np.unique(y_true_1d),
-                            y=y_true_1d,
-                        ),
-                    )
-                )
-
-            elif method == "logsmooth":
-                counts = np.unique(y_true_1d, return_counts=True)
-                sample_weight = dict(zip(counts[0], counts[1]))
-                sample_weight.pop(-9)
-                sample_weight = cls.smooth_weights(sample_weight)
-                sample_weight[-9] = 0.0
-
-        else:
-            # Automatically get class weights to set sample_weight.
-            for i in np.arange(y_true.shape[1]):
-                mm = ~original_missing_mask[:, i]
-                classes = np.unique(y_true[mm, i])
-                cw = compute_class_weight(
-                    "balanced",
-                    classes=classes,
-                    y=y_true[mm, i],
-                )
-
-                class_weights.append({k: v for k, v in zip(classes, cw)})
-
-            # Make sample_weight_matrix from automatic per-column class_weights.
-            for i, w in enumerate(class_weights):
-                for j in range(3):
-                    if j in w:
-                        sample_weight[y_true[:, i] == j, i] = w[j]
-
-        return sample_weight
-
-    @staticmethod
-    def write_gt_state_probs(
-        y_pred,
-        y_pred_1d,
-        y_true,
-        y_true_1d,
-        nn_method,
-        sim_missing_mask,
-        original_missing_mask,
-        prefix="imputer",
-    ):
-        bin_mapping = np.array(
-            [np.array2string(x) for row in y_pred for x in row]
-        )
-
-        bin_mapping = np.reshape(bin_mapping, y_pred_1d.shape)
-
-        y_true_2d = np.reshape(y_true_1d, y_true.shape)
-        bin_mapping_2d = np.reshape(bin_mapping, y_true.shape)
-        y_pred_2d = np.reshape(y_pred_1d, y_true.shape)
-
-        include = np.logical_and(sim_missing_mask, ~original_missing_mask)
-
-        gt_dist = list()
-        colors = []
-        for yt, yp, ypd, mask in zip(
-            y_true_2d,
-            bin_mapping_2d,
-            y_pred_2d,
-            include,
-        ):
-            sites = dict()
-            row_colors = []
-            for i, (yt_site, mask_site) in enumerate(zip(yt, mask)):
-                if mask_site:
-                    sites[
-                        f"Site Index {i},Probability Vector,Imputed Genotype,Expected Genotype"
-                    ] = f"{i},{yp[i]},{ypd[i]},{yt_site}"
-                    if ypd[i] == yt_site:
-                        row_colors.append("blue")
-                    else:
-                        sites[
-                            f"Site Index {i},Probability Vector,Imputed Genotype,Expected Genotype"
-                        ] = f"{i},{yp[i]},{ypd[i]},{yt_site}"
-                        row_colors.append("orange")
-                else:
-                    sites[
-                        f"Site Index {i},Probability Vector,Imputed Genotype,Expected Genotype"
-                    ] = f"{i},{np.array2string(np.array([0.0, 0.0, 0.0]))},0,0"
-                    row_colors.append("gray")
-            gt_dist.append(sites)
-            colors.append(row_colors)
-
-        gt_df = pd.DataFrame.from_records(gt_dist)
-        gt_df.to_csv(
-            os.path.join(
-                f"{prefix}_output",
-                "logs",
-                "Unsupervised",
-                nn_method,
-                "genotype_state_proba.csv",
-            ),
-            index=False,
-            header=False,
-        )
-
-        # Reload the data
-
-        data = pd.read_csv(
-            os.path.join(
-                f"{prefix}_output",
-                "logs",
-                "Unsupervised",
-                nn_method,
-                "genotype_state_proba.csv",
-            ),
-            header=None,
-        )
-
-        # Parse the original data into separate dataframes for imputedGT and expectedGT
-        imputedGT_data = data.applymap(lambda x: int(x.split(",")[2]))
-        expectedGT_data = data.applymap(lambda x: int(x.split(",")[3]))
-
-        # Determine the binary mask based on whether imputedGT and expectedGT are the same
-
-        mask = imputedGT_data == expectedGT_data
-
-        # Create a new figure and set its size
-        plt.figure(figsize=(12, 6))
-
-        from matplotlib.colors import ListedColormap
-
-        rgb_colors = sns.color_palette(
-            [color for sublist in colors for color in sublist]
-        )
-        cmap = ListedColormap(rgb_colors)
-
-        # Create a heatmap
-        sns.heatmap(mask, cmap=cmap, cbar=False)
-
-        # Set the title and labels
-        plt.title("Expected Genotypes for Simulated Genotypes")
-        plt.xlabel("Column Index")
-        plt.ylabel("Row Index")
-
-        # Create a custom legend
-        import matplotlib.patches as mpatches
-
-        green_patch = mpatches.Patch(color="blue", label="Agreement")
-        orange_patch = mpatches.Patch(color="orange", label="Disagreement")
-        gray_patch = mpatches.Patch(color="gray", label="Not Simulated")
-
-        plt.legend(
-            handles=[green_patch, orange_patch, gray_patch], loc="lower right"
-        )
-
-        outfile = os.path.join(
-            f"{prefix}_output",
-            "plots",
-            "Unsupervised",
-            nn_method,
-            "gt_state_proba.png",
-        )
-
-        plt.savefig(outfile, bbox_inches="tight", facecolor="white")
-
-    # @staticmethod
-    # def write_gt_state_probs(
-    #     y_pred,
-    #     y_pred_1d,
-    #     y_true,
-    #     y_true_1d,
-    #     nn_method,
-    #     sim_missing_mask,
-    #     original_missing_mask,
-    #     prefix="imputer",
-    # ):
-    #     bin_mapping = np.array(
-    #         [np.array2string(x) for row in y_pred for x in row]
-    #     )
-
-    #     bin_mapping = np.reshape(bin_mapping, y_pred_1d.shape)
-
-    #     y_true_2d = np.reshape(y_true_1d, y_true.shape)
-    #     bin_mapping_2d = np.reshape(bin_mapping, y_true.shape)
-    #     y_pred_2d = np.reshape(y_pred_1d, y_true.shape)
-
-    #     gt_dist = list()
-    #     for yt, yp, ypd in zip(y_true_2d, bin_mapping_2d, y_pred_2d):
-    #         sites = dict()
-    #         for i, yt_site in enumerate(yt):
-    #             sites[
-    #                 f"Site Index {i},Probability Vector,Imputed Genotype,Expected Genotype"
-    #             ] = f"{i},{yp[i]},{ypd[i]},{yt_site}"
-    #         gt_dist.append(sites)
-
-    #     gt_df = pd.DataFrame.from_records(gt_dist)
-    #     gt_df.to_csv(
-    #         os.path.join(
-    #             f"{prefix}_output",
-    #             "logs",
-    #             "Unsupervised",
-    #             nn_method,
-    #             "genotype_state_proba.csv",
-    #         ),
-    #         index=False,
-    #         header=False,
-    #     )
```
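The heart of the removed module is the 10-state genotype ↔ 4-class multilabel mapping used by `encode_multilab` and `decode_binary_multilab` above. The sketch below replays that round trip in isolation so the mapping tables are easier to read; it is a minimal standalone illustration built only from the mappings shown in the diff, not part of the replacement 1.6.16a3 API, and the helper names `encode`/`decode` are ours.

```python
import numpy as np

# 10 integer genotype states -> presence/absence of four allele classes,
# copied from the removed encode_multilab() mapping (states 0-3 are
# homozygous, 4-9 heterozygous, -9 missing).
MULTILAB = {
    0: [1.0, 0.0, 0.0, 0.0],
    1: [0.0, 1.0, 0.0, 0.0],
    2: [0.0, 0.0, 1.0, 0.0],
    3: [0.0, 0.0, 0.0, 1.0],
    4: [1.0, 1.0, 0.0, 0.0],
    5: [1.0, 0.0, 1.0, 0.0],
    6: [1.0, 0.0, 0.0, 1.0],
    7: [0.0, 1.0, 1.0, 0.0],
    8: [0.0, 1.0, 0.0, 1.0],
    9: [0.0, 0.0, 1.0, 1.0],
    -9: [np.nan, np.nan, np.nan, np.nan],
}


def encode(genotypes):
    """1D integer genotypes -> (n, 4) multilabel matrix (NaN rows = missing)."""
    return np.array([MULTILAB[g] for g in genotypes])


def decode(multilab):
    """(n, 4) 0/1 multilabel matrix -> integer genotypes, inverting MULTILAB.

    Rows with no active class decode to -9, mirroring the "" -> -9 entry in
    the removed decode_binary_multilab() mapping.
    """
    inverse = {tuple(v): k for k, v in MULTILAB.items() if k != -9}
    return np.array([inverse.get(tuple(row), -9) for row in multilab])


gts = np.array([0, 4, 9, -9])
bits = np.nan_to_num(encode(gts))  # missing row becomes all zeros
assert decode(bits).tolist() == [0, 4, 9, -9]
```

States 4-9 set two of the four class bits, which is how the old sigmoid/multilabel decoder represented heterozygous calls, while an all-zero row (no class above the decoding threshold) falls back to the missing value -9.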