pg-sui 0.2.3__py3-none-any.whl → 1.6.16a3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (128) hide show
  1. pg_sui-1.6.16a3.dist-info/METADATA +292 -0
  2. pg_sui-1.6.16a3.dist-info/RECORD +81 -0
  3. {pg_sui-0.2.3.dist-info → pg_sui-1.6.16a3.dist-info}/WHEEL +1 -1
  4. pg_sui-1.6.16a3.dist-info/entry_points.txt +4 -0
  5. {pg_sui-0.2.3.dist-info → pg_sui-1.6.16a3.dist-info/licenses}/LICENSE +0 -0
  6. pg_sui-1.6.16a3.dist-info/top_level.txt +1 -0
  7. pgsui/__init__.py +35 -54
  8. pgsui/_version.py +34 -0
  9. pgsui/cli.py +922 -0
  10. pgsui/data_processing/__init__.py +0 -0
  11. pgsui/data_processing/config.py +565 -0
  12. pgsui/data_processing/containers.py +1436 -0
  13. pgsui/data_processing/transformers.py +557 -907
  14. pgsui/{example_data/trees → electron/app}/__init__.py +0 -0
  15. pgsui/electron/app/__main__.py +5 -0
  16. pgsui/electron/app/extra-resources/.gitkeep +1 -0
  17. pgsui/electron/app/icons/icons/1024x1024.png +0 -0
  18. pgsui/electron/app/icons/icons/128x128.png +0 -0
  19. pgsui/electron/app/icons/icons/16x16.png +0 -0
  20. pgsui/electron/app/icons/icons/24x24.png +0 -0
  21. pgsui/electron/app/icons/icons/256x256.png +0 -0
  22. pgsui/electron/app/icons/icons/32x32.png +0 -0
  23. pgsui/electron/app/icons/icons/48x48.png +0 -0
  24. pgsui/electron/app/icons/icons/512x512.png +0 -0
  25. pgsui/electron/app/icons/icons/64x64.png +0 -0
  26. pgsui/electron/app/icons/icons/icon.icns +0 -0
  27. pgsui/electron/app/icons/icons/icon.ico +0 -0
  28. pgsui/electron/app/main.js +227 -0
  29. pgsui/electron/app/package-lock.json +6894 -0
  30. pgsui/electron/app/package.json +51 -0
  31. pgsui/electron/app/preload.js +15 -0
  32. pgsui/electron/app/server.py +157 -0
  33. pgsui/electron/app/ui/logo.png +0 -0
  34. pgsui/electron/app/ui/renderer.js +131 -0
  35. pgsui/electron/app/ui/styles.css +59 -0
  36. pgsui/electron/app/ui/ui_shim.js +72 -0
  37. pgsui/electron/bootstrap.py +43 -0
  38. pgsui/electron/launch.py +57 -0
  39. pgsui/electron/package.json +14 -0
  40. pgsui/example_data/__init__.py +0 -0
  41. pgsui/example_data/phylip_files/__init__.py +0 -0
  42. pgsui/example_data/phylip_files/test.phy +0 -0
  43. pgsui/example_data/popmaps/__init__.py +0 -0
  44. pgsui/example_data/popmaps/{test.popmap → phylogen_nomx.popmap} +185 -99
  45. pgsui/example_data/structure_files/__init__.py +0 -0
  46. pgsui/example_data/structure_files/test.pops.2row.allsites.str +0 -0
  47. pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz +0 -0
  48. pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz.tbi +0 -0
  49. pgsui/impute/__init__.py +0 -0
  50. pgsui/impute/deterministic/imputers/allele_freq.py +725 -0
  51. pgsui/impute/deterministic/imputers/mode.py +844 -0
  52. pgsui/impute/deterministic/imputers/nmf.py +221 -0
  53. pgsui/impute/deterministic/imputers/phylo.py +973 -0
  54. pgsui/impute/deterministic/imputers/ref_allele.py +669 -0
  55. pgsui/impute/supervised/__init__.py +0 -0
  56. pgsui/impute/supervised/base.py +343 -0
  57. pgsui/impute/{unsupervised/models/in_development → supervised/imputers}/__init__.py +0 -0
  58. pgsui/impute/supervised/imputers/hist_gradient_boosting.py +317 -0
  59. pgsui/impute/supervised/imputers/random_forest.py +291 -0
  60. pgsui/impute/unsupervised/__init__.py +0 -0
  61. pgsui/impute/unsupervised/base.py +1121 -0
  62. pgsui/impute/unsupervised/callbacks.py +92 -262
  63. {simulation → pgsui/impute/unsupervised/imputers}/__init__.py +0 -0
  64. pgsui/impute/unsupervised/imputers/autoencoder.py +1361 -0
  65. pgsui/impute/unsupervised/imputers/nlpca.py +1666 -0
  66. pgsui/impute/unsupervised/imputers/ubp.py +1660 -0
  67. pgsui/impute/unsupervised/imputers/vae.py +1316 -0
  68. pgsui/impute/unsupervised/loss_functions.py +261 -0
  69. pgsui/impute/unsupervised/models/__init__.py +0 -0
  70. pgsui/impute/unsupervised/models/autoencoder_model.py +215 -567
  71. pgsui/impute/unsupervised/models/nlpca_model.py +155 -394
  72. pgsui/impute/unsupervised/models/ubp_model.py +180 -1106
  73. pgsui/impute/unsupervised/models/vae_model.py +269 -630
  74. pgsui/impute/unsupervised/nn_scorers.py +255 -0
  75. pgsui/utils/__init__.py +0 -0
  76. pgsui/utils/classification_viz.py +608 -0
  77. pgsui/utils/logging_utils.py +22 -0
  78. pgsui/utils/misc.py +35 -480
  79. pgsui/utils/plotting.py +996 -829
  80. pgsui/utils/pretty_metrics.py +290 -0
  81. pgsui/utils/scorers.py +213 -666
  82. pg_sui-0.2.3.dist-info/METADATA +0 -322
  83. pg_sui-0.2.3.dist-info/RECORD +0 -75
  84. pg_sui-0.2.3.dist-info/top_level.txt +0 -3
  85. pgsui/example_data/phylip_files/test_n10.phy +0 -118
  86. pgsui/example_data/phylip_files/test_n100.phy +0 -118
  87. pgsui/example_data/phylip_files/test_n2.phy +0 -118
  88. pgsui/example_data/phylip_files/test_n500.phy +0 -118
  89. pgsui/example_data/structure_files/test.nopops.1row.10sites.str +0 -117
  90. pgsui/example_data/structure_files/test.nopops.2row.100sites.str +0 -234
  91. pgsui/example_data/structure_files/test.nopops.2row.10sites.str +0 -234
  92. pgsui/example_data/structure_files/test.nopops.2row.30sites.str +0 -234
  93. pgsui/example_data/structure_files/test.nopops.2row.allsites.str +0 -234
  94. pgsui/example_data/structure_files/test.pops.1row.10sites.str +0 -117
  95. pgsui/example_data/structure_files/test.pops.2row.10sites.str +0 -234
  96. pgsui/example_data/trees/test.iqtree +0 -376
  97. pgsui/example_data/trees/test.qmat +0 -5
  98. pgsui/example_data/trees/test.rate +0 -2033
  99. pgsui/example_data/trees/test.tre +0 -1
  100. pgsui/example_data/trees/test_n10.rate +0 -19
  101. pgsui/example_data/trees/test_n100.rate +0 -109
  102. pgsui/example_data/trees/test_n500.rate +0 -509
  103. pgsui/example_data/trees/test_siterates.txt +0 -2024
  104. pgsui/example_data/trees/test_siterates_n10.txt +0 -10
  105. pgsui/example_data/trees/test_siterates_n100.txt +0 -100
  106. pgsui/example_data/trees/test_siterates_n500.txt +0 -500
  107. pgsui/example_data/vcf_files/test.vcf +0 -244
  108. pgsui/example_data/vcf_files/test.vcf.gz +0 -0
  109. pgsui/example_data/vcf_files/test.vcf.gz.tbi +0 -0
  110. pgsui/impute/estimators.py +0 -1268
  111. pgsui/impute/impute.py +0 -1463
  112. pgsui/impute/simple_imputers.py +0 -1431
  113. pgsui/impute/supervised/iterative_imputer_fixedparams.py +0 -782
  114. pgsui/impute/supervised/iterative_imputer_gridsearch.py +0 -1024
  115. pgsui/impute/unsupervised/keras_classifiers.py +0 -697
  116. pgsui/impute/unsupervised/models/in_development/cnn_model.py +0 -486
  117. pgsui/impute/unsupervised/neural_network_imputers.py +0 -1440
  118. pgsui/impute/unsupervised/neural_network_methods.py +0 -1395
  119. pgsui/pg_sui.py +0 -261
  120. pgsui/utils/sequence_tools.py +0 -407
  121. simulation/sim_benchmarks.py +0 -333
  122. simulation/sim_treeparams.py +0 -475
  123. test/__init__.py +0 -0
  124. test/pg_sui_simtest.py +0 -215
  125. test/pg_sui_testing.py +0 -523
  126. test/test.py +0 -151
  127. test/test_pgsui.py +0 -374
  128. test/test_tkc.py +0 -185
@@ -1,1395 +0,0 @@
1
- import logging
2
- import math
3
- import os
4
- import sys
5
- import random
6
- import warnings
7
-
8
- import numpy as np
9
- import pandas as pd
10
- import seaborn as sns
11
- import matplotlib.pyplot as plt
12
-
13
- from sklearn.utils.class_weight import (
14
- compute_class_weight,
15
- )
16
-
17
- from sklearn.metrics import f1_score
18
-
19
- os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
20
- logging.getLogger("tensorflow").disabled = True
21
- warnings.filterwarnings("ignore", category=UserWarning)
22
-
23
- # noinspection PyPackageRequirements
24
- import tensorflow as tf
25
-
26
- # Disable "can't find CUDA .dll" errors. Also turns off GPU support.
27
- tf.config.set_visible_devices([], "GPU")
28
-
29
- from tensorflow.python.util import deprecation
30
-
31
- # Disable warnings and info logs.
32
- tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
33
- tf.get_logger().setLevel(logging.ERROR)
34
-
35
-
36
- # Monkey patching deprecation utils to suppress warnings.
37
- # noinspection PyUnusedLocal
38
def deprecated(
    date, instructions, warn_once=True
):  # pylint: disable=unused-argument
    """Build a decorator that leaves the decorated callable untouched.

    The three arguments exist only so the call signature is accepted; none
    of them is read.

    Args:
        date: Ignored.

        instructions: Ignored.

        warn_once (bool, optional): Ignored. Defaults to True.

    Returns:
        Callable: A one-argument decorator that returns its argument as is.
    """
    # Identity decorator: no wrapping, no warning emitted.
    return lambda func: func
45
-
46
-
47
- deprecation.deprecated = deprecated
48
-
49
-
50
class DisabledCV:
    """Splitter-like object that produces a single split spanning all samples.

    Both members of the yielded pair cover every index, so nothing is held
    out.
    """

    def __init__(self):
        # Exactly one split is ever produced.
        self.n_splits = 1

    def split(self, X, y, groups=None):
        """Yield one pair of index arrays covering all of ``X`` and ``y``.

        Args:
            X: Sized collection of samples.

            y: Sized collection of targets.

            groups: Ignored.

        Yields:
            Tuple[numpy.ndarray, numpy.ndarray]: ``(arange(len(X)), arange(len(y)))``.
        """
        x_indices = np.arange(len(X))
        y_indices = np.arange(len(y))
        yield x_indices, y_indices

    def get_n_splits(self, X, y, groups=None):
        """Return the number of splits; all arguments are ignored."""
        return self.n_splits
59
-
60
-
61
- # For VAE.
62
- # Necessary to initialize outside of class for use with tf.function decorator.
63
- cce = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
64
- cca = tf.keras.metrics.CategoricalAccuracy()
65
- ba = tf.keras.metrics.BinaryAccuracy()
66
- bce = tf.keras.losses.BinaryCrossentropy()
67
-
68
-
69
- class NeuralNetworkMethods:
70
- """Methods common to all neural network imputer classes and loss functions"""
71
-
72
- def __init__(self):
73
- self.data = None
74
-
75
- @staticmethod
76
- def encode_multilab(X, num_classes=4):
77
- """Encode 0-9 integer data in one-hot format.
78
- Args:
79
- X (numpy.ndarray): Input array with 012-encoded data and -9 as the missing data value.
80
-
81
- num_classes (int, optional): Number of multi-label classes to use. Mostly for compatibility with encode_multiclass. Defaults to 4.
82
- Returns:
83
- pandas.DataFrame: One-hot encoded data, ignoring missing values (np.nan). multi-label categories will be encoded as 0.5. Otherwise, it will be 1.0.
84
- """
85
- # return np.where(X >= 0.5, 1.0, 0.0)
86
- try:
87
- Xt = np.zeros(shape=(X.shape[0], X.shape[1], 4))
88
- except IndexError:
89
- Xt = np.zeros(shape=(X.shape[0],))
90
-
91
- mappings = {
92
- 0: [1.0, 0.0, 0.0, 0.0],
93
- 1: [0.0, 1.0, 0.0, 0.0],
94
- 2: [0.0, 0.0, 1.0, 0.0],
95
- 3: [0.0, 0.0, 0.0, 1.0],
96
- 4: [1.0, 1.0, 0.0, 0.0],
97
- 5: [1.0, 0.0, 1.0, 0.0],
98
- 6: [1.0, 0.0, 0.0, 1.0],
99
- 7: [0.0, 1.0, 1.0, 0.0],
100
- 8: [0.0, 1.0, 0.0, 1.0],
101
- 9: [0.0, 0.0, 1.0, 1.0],
102
- -9: [np.nan, np.nan, np.nan, np.nan],
103
- }
104
- try:
105
- for row in np.arange(X.shape[0]):
106
- Xt[row] = [mappings[enc] for enc in X[row]]
107
- except TypeError:
108
- Xt = [mappings[enc] for enc in X]
109
-
110
- if not isinstance(Xt, np.ndarray):
111
- Xt = np.array(Xt)
112
- return Xt
113
-
114
- @staticmethod
115
- def encode_multiclass(X, num_classes=10, missing_value=-9):
116
- """Encode 0-9 integer data in multi-class one-hot format.
117
-
118
- Missing values get encoded as ``[np.nan] * num_classes``
119
- Args:
120
- X (numpy.ndarray): Input array with 012-encoded data and ``missing_value`` as the missing data value.
121
-
122
- num_classes (int, optional): Number of classes to use. Defaults to 10.
123
-
124
- missing_value (int, optional): Missing data value to replace with ``[np.nan] * num_classes``\. Defaults to -9.
125
- Returns:
126
- pandas.DataFrame: Multi-class one-hot encoded data, ignoring missing values (np.nan).
127
- """
128
- int_cats, ohe_arr = np.arange(num_classes), np.eye(num_classes)
129
- mappings = dict(zip(int_cats, ohe_arr))
130
- mappings[missing_value] = np.array([np.nan] * num_classes)
131
-
132
- try:
133
- Xt = np.zeros(shape=(X.shape[0], X.shape[1], num_classes))
134
- except IndexError:
135
- Xt = np.zeros(shape=(X.shape[0],))
136
-
137
- try:
138
- for row in np.arange(X.shape[0]):
139
- Xt[row] = [mappings[enc] for enc in X[row]]
140
- except TypeError:
141
- Xt = [mappings[enc] for enc in X]
142
-
143
- if not isinstance(Xt, np.ndarray):
144
- Xt = np.array(Xt)
145
-
146
- return Xt
147
-
148
- @classmethod
149
- def decode_masked(
150
- cls,
151
- y_true_bin,
152
- y_pred_proba,
153
- is_multiclass=True,
154
- return_proba=False,
155
- return_multilab=False,
156
- return_int=True,
157
- predict_still_missing=True,
158
- threshold_increment=0.01,
159
- multilabel_averaging="macro",
160
- missing_mask=None,
161
- ):
162
- """Evaluate model predictions by decoding from one-hot encoding to integer-encoded format.
163
-
164
- Gets the index of the highest predicted value to obtain the integer encodings, or keeps the one-hot encodings when ``return_int`` is False.
165
-
166
- Calculates highest predicted value for each row vector and each class, setting the most likely class to 1.0.
167
-
168
- Args:
169
- y_true_bin (numpy.ndarray): True multilabel target values of shape (n_samples * n_features, num_classes). Array should be flattened and masked.
170
-
171
- y_pred_proba (numpy.ndarray): Multilabel model predictions of shape (n_samples * n_features, num_classes). Array should be flattened and masked.
172
-
173
- is_multiclass (bool, optional): True if using multiclass data with softmax activation. False if using multilabel data with sigmoid activation. The multiclass branch is also taken whenever the last dimension of ``y_true_bin`` is 10. Defaults to True.
174
-
175
- Note: there is no ``threshold`` argument. For multilabel data the threshold for determining 1 or 0 predictions is obtained from ``get_optimal_threshold``.
176
-
177
- return_proba (bool, optional): If True, returns probabilities for unresolved values where all multilabel probabilities were below the threshold. Defaults to False.
178
-
179
- return_multilab (bool, optional): If True, returns the multilabel encodings instead of integer encodings (if doing multilabel classification). Defaults to False.
180
-
181
- return_int (bool, optional): If True, returns the integer encodings instead of onehot encodings (if doing multiclass classification). Defaults to True.
182
-
183
- predict_still_missing (bool, optional): If True, values that are still missing after decoding are decoded using the maximum probability (i.e., with np.argmax). If False, then it is possible that some missing data might still remain after decoding if none of the multilabel probabilities are above the threshold. Defaults to True.
184
-
185
- threshold_increment (float, optional): How much to increment threshold when searching for optimal threshold. Should be > 0 and < 1. Defaults to 0.01.
186
-
187
- multilabel_averaging (str): Method to use for averaging F1 score among multilabel classes; passed to ``get_optimal_threshold`` as ``average_method``. Supported options are: {"macro", "micro", "weighted", "samples"}. Defaults to "macro".
188
-
189
- missing_mask (numpy.ndarray, optional): Missing mask with missing values encoded as 1's and nonmissing as 0. Not referenced anywhere in this method's body. Defaults to None.
190
-
191
- Returns:
192
- numpy.ndarray: Imputed integer-encoded values.
193
-
194
- numpy.ndarray (optional): Probabilities for each call, with those above the threshold set to 1.0 and those below the threshold between 0 and 1. Only returned when ``return_proba`` is True; it is None unless the multilabel branch found calls with every class below the threshold.
-
- Raises:
- ValueError: If ``return_int`` and ``return_multilab`` are both True.
195
- """
196
-
197
- if return_int and return_multilab:
198
- raise ValueError(
199
- "return_int and return_multilab cannot both be True."
200
- )
201
-
202
- y_unresolved_certainty = None
203
- if is_multiclass or y_true_bin.shape[-1] == 10:
204
- # Softmax predictions.
205
- # If reduce_dim is True, will return integer encodings.
206
- # Otherwise, returns one-hot encodings.
207
- y_pred = cls.decode_multiclass(y_pred_proba, reduce_dim=return_int)
208
- else:
209
- # Onehot encode if not already one-hot encoded.
210
- if y_true_bin.shape[-1] != 4:
211
- if y_true_bin.shape[-1] != 10:
212
- y_true_bin = cls.encode_multilab(y_true_bin)
213
- else:
- # NOTE(review): looks unreachable -- a last dimension of 10 is already routed to the multiclass branch above.
214
- y_true_bin = cls.encode_multiclass(y_true_bin)
215
-
216
- pred_multilab = cls.zero_extra_categories(y_pred_proba)
217
-
218
- # Binary multilabel predictions.
219
- threshold = cls.get_optimal_threshold(
220
- y_true_bin,
221
- pred_multilab,
222
- increment=threshold_increment,
223
- average_method=multilabel_averaging,
224
- )
225
-
226
- # Call 0s and 1s based on threshold.
227
- pred_multilab = np.where(pred_multilab >= threshold, 1.0, 0.0)
228
-
229
- pred_multilab_decoded = cls.decode_binary_multilab(pred_multilab)
230
-
231
- if predict_still_missing:
232
- # Check if there are still any missing values.
233
- still_missing = np.all(pred_multilab == 0, axis=-1)
234
-
235
- if return_multilab:
236
- still_missing_bin = np.all(
237
- pred_multilab == 0, axis=-1, keepdims=True
238
- )
239
-
240
- # Do multiclass prediction with argmax then get the probabilities
241
- # if any unresolved values.
242
- if np.any(still_missing):
243
- # Get the argmax with the highest probability if
244
- # all classes are below threshold.
245
- y_multi = cls.decode_multiclass(y_pred_proba)
246
- y_multi_bin = cls.decode_multiclass(
247
- y_pred_proba, reduce_dim=False
248
- )
249
-
250
- try:
251
- y_pred = np.where(
252
- still_missing, y_multi, pred_multilab_decoded
253
- )
254
- except ValueError:
- # Shapes did not broadcast; retry with the decoded calls reshaped to the mask's shape.
255
- y_pred = np.where(
256
- still_missing,
257
- y_multi,
258
- np.reshape(
259
- pred_multilab_decoded, still_missing.shape
260
- ),
261
- )
262
-
263
- if return_multilab:
264
- y_pred_bin = np.where(
265
- still_missing_bin, y_multi_bin, pred_multilab
266
- )
267
-
268
- y_pred = y_pred_bin
269
-
270
- if return_proba:
271
- # Get max value as base call.
272
- y_pred_proba_max = y_pred_proba.max(axis=-1)
273
-
274
- # Get probability of max value that was < threshold.
275
- y_unresolved_certainty = np.where(
276
- still_missing, y_pred_proba_max, 1.0
277
- )
278
-
279
- else:
280
- if return_multilab:
281
- y_pred = pred_multilab
282
- else:
283
- y_pred = pred_multilab_decoded
284
- else:
285
- if return_multilab:
286
- y_pred = pred_multilab
287
- else:
288
- y_pred = pred_multilab_decoded
289
-
290
- y_pred = y_pred.astype(int)
291
-
292
- if return_proba:
293
- return y_pred, y_unresolved_certainty
294
- else:
295
- return y_pred
296
-
297
@classmethod
def get_optimal_threshold(
    cls,
    y_true_bin,
    y_pred_proba,
    increment=0.01,
    average_method="macro",
):
    """Scan candidate thresholds and return the one with the best F1 score.

    Candidates are ``increment, 2 * increment, ...`` up to (but excluding) 1. For each candidate the probabilities are binarized, decoded with ``decode_binary_multilab`` and scored against the decoded truth.

    Args:
        y_true_bin (numpy.ndarray): True multilabel values of shape (n_samples * n_features, num_classes).

        y_pred_proba (numpy.ndarray): Multilabel prediction probabilities of shape (n_features * n_samples, num_classes).

        increment (float, optional): How much to increment when searching for optimal threshold. Should be > 0 and < 1. Defaults to 0.01.

        average_method (str, optional): Method to use for averaging the F1 score across classes; forwarded to ``f1_score`` as ``average``. Possible options include {"macro", "micro", "weighted", "samples"}. Defaults to "macro".

    Returns:
        float: Optimal decoding threshold. On ties the lowest threshold wins, because ``np.argmax`` returns the first maximum.
    """
    y_true = y_true_bin.copy()
    y_pred = y_pred_proba.copy()

    thresholds = np.arange(increment, 1, increment)

    # This is only supposed to get applied during the final transform,
    # when the original missing data is replaced with predictions.
    # If this isn't done here, it ends up having -1 values in it,
    # which causes the f1_score function to throw an error.
    # NOTE(review): indexing with the tuple returned by np.where selects
    # individual elements, so both arrays become 1-D here -- confirm that
    # element-wise (rather than row-wise) filtering is intended.
    nonmissing_mask = np.where(y_true_bin != -1)

    try:
        y_true = y_true[nonmissing_mask]
        y_pred = y_pred[nonmissing_mask]
    except IndexError:
        # Shapes do not line up with the mask; score the unfiltered copies.
        pass

    # The decoded truth does not depend on the threshold, so decode it once.
    true_multilab_decoded = cls.decode_binary_multilab(y_true)

    scores = []
    for t in thresholds:
        # Call 0s and 1s based on threshold.
        pred_multilab = np.where(y_pred >= t, 1.0, 0.0)
        pred_multilab_decoded = cls.decode_binary_multilab(pred_multilab)

        scores.append(
            f1_score(
                true_multilab_decoded,
                pred_multilab_decoded,
                # Honor the caller's choice; this was previously hard-coded
                # to "weighted", silently ignoring ``average_method``.
                average=average_method,
            )
        )

    return thresholds[np.argmax(scores)]
358
-
359
- @classmethod
360
- def flatten_bin_encodings(cls, y):
361
- """Flatten first two dimensions of binary encodings to (num_samples * num_features, num_classes).
362
-
363
- Args:
364
- y (numpy.ndarray): Numpy array with 3-dimensional shape of (n_samples, num_features, num_classes).
365
-
366
- Returns:
367
- numpy.ndarray: Array of shape (n_samples * num_features, num_classes).
368
-
369
- Raises:
370
- ValueError: Input shape must be 3-dimensional.
371
- """
372
- if len(y.shape) != 3:
373
- raise ValueError("Input array must be 3-dimensional")
374
-
375
- return y.reshape(y.shape[0] * y.shape[1], y.shape[2])
376
-
377
- @staticmethod
378
- def zero_extra_categories(y_pred_proba, threshold=0.5):
379
- """Check if any prediction probabilities have >2 values above threshold.
380
-
381
- If >2, then it sets the two with the lowest probabilities to 0.0.
382
-
383
- Args:
384
- y_pred_proba (numpy.ndarray): Prediction probabilities (sigmoid activation) of shape (n_samples, n_features, num_classes) or (n_samples * n_features, num_classes).
385
-
386
- pred_multilab (numpy.ndarray): Multi-label decodings. Inner arrays should have only 0s and 1s. Should be of shape (n_samples, n_features, num_classes) or (n_samples * n_features, num_classes).
387
-
388
- threshold (float, optional): Threshold to use to set decoded multilabel values to 0s (< threshold) or 1s (>= threshold). Defaults to 0.5.
389
- """
390
- N = 2
391
- y_pred_proba[y_pred_proba.argsort().argsort() < N] = 0.0
392
- return y_pred_proba
393
- # idx = np.argpartition(y_pred_proba.ravel(), k)
394
- # indices = tuple(
395
- # np.array(np.unravel_index(idx, y_pred_proba.shape))[
396
- # :, range(min(k, 0), max(k, 0))
397
- # ]
398
- # )
399
-
400
- # y_pred_proba[indices] = 0.0
401
- # return y_pred_proba
402
-
403
- # return np.where(y_pred_proba >= threshold, 1.0, 0.0)
404
-
405
- @classmethod
406
- def decode_multiclass(cls, y_pred_proba, reduce_dim=True):
407
- """Decode probabilities to either one-hot or integer encodings.
408
-
409
- Args:
410
- y_pred_proba (numpy.ndarray): Probabilities to decode.
411
-
412
- reduce_dim (bool, optional): If True, returns integer encodings of one fewer dimension than ``y_pred_proba``\. Otherwise, returns one-hot encodings where the class with the maximum probability is a 1 and every other class is 0. Defaults to True.
413
-
414
- Returns:
415
- numpy.ndarray: Integer or one-hot-encoded predictions.
416
- """
417
- yt = np.apply_along_axis(cls.mle, axis=-1, arr=y_pred_proba)
418
- if reduce_dim:
419
- return np.argmax(yt, axis=-1)
420
- else:
421
- return yt
422
-
423
- @classmethod
424
- def decode_binary_multilab(cls, y_pred):
425
- """Decode multi-label sigmoid probabilities to integer encodings.
426
-
427
- The predictions should have already undergone sigmoid activation and should be probabilities.
428
-
429
- If sigmoid activation output is >0.5, gets encoded as 1.0; else 0.0. If more than one category is > 0.5, then it is a heterozygote.
430
-
431
- Args:
432
- y_pred (numpy.ndarray): Model predictions of shape (n_samples * n_features, num_classes) or (n_samples, n_features, num_classes). A threshold should already have been applied to set each class to 0 or 1.
433
-
434
- Returns:
435
- numpy.ndarray: Integer-decoded multilabel predictions of shape (n_samples * n_features) or (n_samples, n_features).
436
- """
437
- y_pred_idx = y_pred.astype(int)
438
- y_pred_idx = y_pred_idx.astype(str)
439
-
440
- if len(y_pred_idx.shape) < 3:
441
- y_pred_idx = np.array(
442
- [
443
- "".join(np.atleast_1d(row == "1").nonzero()[0].astype(str))
444
- for row in y_pred_idx
445
- ]
446
- )
447
- else:
448
- y_pred_idx = np.array(
449
- [
450
- "".join(np.atleast_1d(col == "1").nonzero()[0].astype(str))
451
- for row in y_pred_idx
452
- for col in row
453
- ]
454
- )
455
-
456
- try:
457
- Xt = np.zeros(shape=(y_pred.shape[0], y_pred.shape[1], 4))
458
- except IndexError:
459
- Xt = np.zeros(shape=(y_pred.shape[0],))
460
-
461
- mappings = {
462
- "0": 0,
463
- "1": 1,
464
- "2": 2,
465
- "3": 3,
466
- "01": 4,
467
- "02": 5,
468
- "03": 6,
469
- "12": 7,
470
- "13": 8,
471
- "23": 9,
472
- "-9": -9,
473
- "": -9,
474
- }
475
-
476
- Xt = [mappings[enc] for enc in y_pred_idx]
477
-
478
- if not isinstance(Xt, np.ndarray):
479
- Xt = np.array(Xt)
480
- return Xt
481
-
482
- @staticmethod
483
- def encode_categorical(X):
484
- """Encode -9 encoded missing values as np.nan.
485
-
486
- Args:
487
- X (numpy.ndarray): 012-encoded genotypes with -9 as missing values.
488
-
489
- Returns:
490
- pandas.DataFrame: DataFrame with missing values encoded as np.nan.
491
- """
492
- np.nan_to_num(X, copy=False, nan=-9.0)
493
- X = X.astype(str)
494
- X[(X == "-9.0") | (X == "-9")] = "none"
495
-
496
- df = pd.DataFrame(X)
497
- df_incomplete = df.copy()
498
-
499
- # Replace 'none' with np.nan
500
- for row in df.index:
501
- for col in df.columns:
502
- if df_incomplete.iat[row, col] == "none":
503
- df_incomplete.iat[row, col] = np.nan
504
-
505
- return df_incomplete
506
-
507
- @staticmethod
508
- def mle(row):
509
- """Get the Maximum Likelihood Estimation for the best prediction. Basically, it sets the index of the maxiumum value in a vector (row) to 1.0, since it is one-hot encoded.
510
-
511
- Args:
512
- row (numpy.ndarray(float)): Row vector with predicted values as floating points.
513
-
514
- Returns:
515
- numpy.ndarray(float): Row vector with the highest prediction set to 1.0 and the others set to 0.0.
516
- """
517
- res = np.zeros(row.shape[0])
518
- res[np.argmax(row)] = 1
519
- return res
520
-
521
- @classmethod
522
- def predict(cls, X, complete_encoded):
523
- """Evaluate VAE predictions by calculating the highest predicted value.
524
-
525
- Calucalates highest predicted value for each row vector and each class, setting the most likely class to 1.0.
526
-
527
- Args:
528
- X (numpy.ndarray): Input 012-encoded data.
529
-
530
- complete_encoded (numpy.ndarray): Output one-hot encoded data with the maximum predicted values for each class set to 1.0.
531
-
532
- Returns:
533
- numpy.ndarray: Imputed one-hot encoded values.
534
-
535
- pandas.DataFrame: One-hot encoded pandas DataFrame with no missing values.
536
- """
537
-
538
- df = cls.encode_categorical(X)
539
-
540
- # Had to add dropna() to count unique classes while ignoring np.nan
541
- col_classes = [len(df[c].dropna().unique()) for c in df.columns]
542
- df_dummies = pd.get_dummies(df)
543
- mle_complete = None
544
- for i, cnt in enumerate(col_classes):
545
- start_idx = int(sum(col_classes[0:i]))
546
- col_completed = complete_encoded[:, start_idx : start_idx + cnt]
547
- mle_completed = np.apply_along_axis(
548
- cls.mle, axis=1, arr=col_completed
549
- )
550
-
551
- if mle_complete is None:
552
- mle_complete = mle_completed
553
-
554
- else:
555
- mle_complete = np.hstack([mle_complete, mle_completed])
556
- return mle_complete, df_dummies
557
-
558
def validate_hidden_layers(self, hidden_layer_sizes, num_hidden_layers):
    """Validate hidden_layer_sizes and verify that it is in the correct format.

    A single integer or string is repeated ``num_hidden_layers`` times; a list is checked element by element and returned as is.

    Args:
        hidden_layer_sizes (str, int, List[str], or List[int]): Output units for all the hidden layers.

        num_hidden_layers (int): Number of hidden layers to use.

    Returns:
        List[int] or List[str]: List of hidden layer sizes.

    Raises:
        TypeError: If ``hidden_layer_sizes`` is not an integer, a string, or a list containing only integers and strings.

        AssertionError: If ``num_hidden_layers`` is not positive or differs from the length of the resulting list.
    """
    if isinstance(hidden_layer_sizes, (str, int)):
        hidden_layer_sizes = [hidden_layer_sizes] * num_hidden_layers

    elif isinstance(hidden_layer_sizes, list):
        if not all(isinstance(x, (str, int)) for x in hidden_layer_sizes):
            ls = list({type(item) for item in hidden_layer_sizes})
            raise TypeError(
                f"Variable hidden_layer_sizes must either be None, "
                f"an integer or string, or a list of integers or "
                f"strings, but got the following type(s): {ls}"
            )

    else:
        raise TypeError(
            f"Variable hidden_layer_sizes must either be, "
            f"an integer, a string, or a list of integers or strings, "
            f"but got the following type: {type(hidden_layer_sizes)}"
        )

    # Raised explicitly (instead of via an ``assert`` statement) so the check
    # still runs under ``python -O``; the exception type is unchanged.
    if not (
        num_hidden_layers == len(hidden_layer_sizes)
        and num_hidden_layers > 0
    ):
        raise AssertionError(
            "num_hidden_layers must be the length of hidden_layer_sizes."
        )

    return hidden_layer_sizes
597
-
598
def get_hidden_layer_sizes(self, n_dims, n_components, hl_func, vae=False):
    """Get dimensions of hidden layers.

    Starting from ``n_dims``, every entry of ``hl_func`` produces the next layer size from the previous one. The walk stops early, with a printed warning, as soon as a size is less than or equal to ``n_components``.

    Args:
        n_dims (int): The number of feature dimensions (columns) (d).

        n_components (int): The number of reduced dimensions (t).

        hl_func (List[str or int]): One entry per hidden layer. Possible options: "midpoint", "sqrt", "log2", or an integer giving the size directly.

        vae (bool, optional): Whether using the VAE algorithm. If False, then the returned list gets reversed. Defaults to False.

    Returns:
        [int, int, int, ...]: [Number of dimensions in hidden layers].

    Raises:
        TypeError: If ``hl_func`` is not a list.

        ValueError: If an entry of ``hl_func`` is neither an integer nor one of the supported strings.

        AssertionError: If a layer size is not strictly between 0 and ``n_dims``, or if no layer size could be kept.
    """
    layers = []
    if not isinstance(hl_func, list):
        raise TypeError(
            f"hl_func must be of type list, but got {type(hl_func)}."
        )

    units = n_dims
    for func in hl_func:
        if func == "midpoint":
            units = round((units + n_components) / 2)
        elif func == "sqrt":
            units = round(math.sqrt(units))
        elif func == "log2":
            units = round(math.log(units, 2))
        elif isinstance(func, int):
            units = func
        else:
            raise ValueError(
                f"hidden_layer_sizes must be either integers or any of "
                f"the following strings: 'midpoint', "
                f"'sqrt', or 'log2', but got {func} of type {type(func)}"
            )

        if units <= n_components:
            print(
                f"WARNING: hidden_layer_size reduction became less than n_components. Using only {len(layers)} hidden layers."
            )
            break

        # Raised explicitly (instead of via an ``assert`` statement) so the
        # check still runs under ``python -O``; the exception type is
        # unchanged.
        if not (units > 0 and units < n_dims):
            raise AssertionError(
                f"The hidden layer sizes must be > 0 and < the number of "
                f"features (i.e., columns) in the dataset, but size was {units}"
            )

        layers.append(units)

    if not layers:
        raise AssertionError(
            "There was an error setting hidden layer sizes. Size list is empty. It is possible that the first 'sqrt' reduction caused units to be <= n_components."
        )

    if not vae:
        layers.reverse()

    return layers
660
-
661
def validate_model_inputs(self, y, missing_mask, output_shape):
    """Check that none of the required model inputs is None.

    The arguments are checked in the order ``y``, ``missing_mask``, ``output_shape``; the first one that is None triggers the error.

    Args:
        y (numpy.ndarray): Target (but actual input data).

        missing_mask (numpy.ndarray): Missing data mask for y.

        output_shape (int): Output shape for hidden layers.

    Raises:
        TypeError: y, missing_mask, output_shape must not be NoneType.
    """
    checks = (
        (y, "y must not be NoneType."),
        (missing_mask, "missing_mask must not be NoneType."),
        (output_shape, "output_shape must not be NoneType."),
    )
    for value, message in checks:
        if value is None:
            raise TypeError(message)
687
-
688
def prepare_training_batches(
    self,
    V,
    y,
    batch_size,
    batch_idx,
    trainable,
    n_components,
    sample_weight,
    missing_mask,
    ubp=True,
):
    """Slice out the current batch for the custom training loop.

    Args:
        V (numpy.ndarray): Input to batch subset and refine. Only read when
            ``ubp`` is True.

        y (numpy.ndarray): 2D target array; its first axis is the sample axis.

        batch_size (int): Nominal batch size.

        batch_idx (int): Current batch index (0-based).

        trainable (bool): Whether the tensor ``v`` should be trainable.

        n_components (int): Number of components (columns) in ``V``.

        sample_weight (numpy.ndarray, List[float] or None): Per-sample
            weights. Sliced along the first axis only, so both 1D and 2D
            weights are accepted.

        missing_mask (numpy.ndarray): 2D boolean array aligned with ``y``.

        ubp (bool, optional): Whether the model is UBP/NLPCA (True) or VAE
            (False). Defaults to True.

    Returns:
        tuple: If ``ubp`` is True:
            ``(v, y_true, sample_weight_batch, missing_mask_batch,
            batch_start, batch_end)`` where ``v`` is a ``tf.Variable``
            holding the current batch of ``V``. Otherwise:
            ``(y_true, sample_weight_batch, missing_mask_batch)``.
    """
    n_samples = y.shape[0]

    batch_start = batch_idx * batch_size
    batch_end = (batch_idx + 1) * batch_size
    if batch_end > n_samples:
        # Slice ends are exclusive, so the final (short) batch must end at
        # n_samples; ending at n_samples - 1 would drop the last sample.
        batch_end = n_samples
        batch_size = batch_end - batch_start

    rows = slice(batch_start, batch_end)

    y_true = y[rows, :]
    missing_mask_batch = missing_mask[rows, :]
    # Row-only slicing works for 1D and 2D weights alike.
    sample_weight_batch = (
        None if sample_weight is None else sample_weight[rows]
    )

    if not ubp:
        # VAE: the model consumes y directly, so V is not needed.
        return (
            y_true,
            sample_weight_batch,
            missing_mask_batch,
        )

    # UBP/NLPCA refines its input to fit the output, so the current rows of
    # V are copied into a (possibly trainable) variable.
    v_batch = V[rows, :]
    v = tf.Variable(
        tf.zeros([batch_size, n_components]),
        trainable=trainable,
        dtype=tf.float32,
    )
    v.assign(v_batch)

    return (
        v,
        y_true,
        sample_weight_batch,
        missing_mask_batch,
        batch_start,
        batch_end,
    )
787
-
788
def validate_batch_size(self, X, batch_size):
    """Shrink the batch size until it no longer exceeds the sample count.

    While ``batch_size`` is greater than ``X.shape[0]`` it is floor-divided
    by 2, and a notice is printed for each halving.

    Args:
        X (numpy.ndarray): Input data; only ``X.shape[0]`` is read.
        batch_size (int): Requested batch size.

    Returns:
        int: Batch size (halved as many times as necessary).
    """
    n_samples = X.shape[0]
    notice = (
        "Batch size is larger than the number of samples. "
        "Dividing batch_size by 2."
    )

    while batch_size > n_samples:
        print(notice)
        batch_size //= 2

    return batch_size
808
-
809
def set_compile_params(
    self, optimizer, sample_weights=None, vae=False, act_func="softmax"
):
    """Build the keyword arguments used to compile the Keras model.

    Args:
        optimizer (str): Keras optimizer to use (case-insensitive). Possible options include: {"adam", "sgd", "adagrad", "adadelta", "adamax", "ftrl", "nadam", "rmsprop"}.

        sample_weights (numpy.ndarray, optional): Currently unused by this method; kept for interface compatibility. Defaults to None.

        vae (bool, optional): Whether using the VAE model. Defaults to False.

        act_func (str, optional): Activation function to use; only consulted when ``vae`` is True. Should be "softmax" if doing multiclass classification, otherwise "sigmoid".

    Returns:
        Dict[str, Any]: Dictionary with keys "optimizer" (the optimizer class, not an instance), "loss", "metrics", and "run_eagerly".

    Raises:
        ValueError: Unsupported optimizer specified.
        ValueError: Invalid act_func argument supplied.
    """
    # Attribute names on tf.keras.optimizers.legacy; note the class is
    # spelled "RMSprop", not "RMSProp".
    optimizer_class_names = {
        "adam": "Adam",
        "sgd": "SGD",
        "adagrad": "Adagrad",
        "adadelta": "Adadelta",
        "adamax": "Adamax",
        "ftrl": "Ftrl",
        "nadam": "Nadam",
        "rmsprop": "RMSprop",
    }

    optimizer_key = optimizer.lower()
    if optimizer_key not in optimizer_class_names:
        # Fail with the documented exception instead of leaving the
        # optimizer unbound.
        raise ValueError(
            f"Unsupported optimizer specified: {optimizer}. Supported "
            f"options are: {sorted(optimizer_class_names)}"
        )

    opt = getattr(
        tf.keras.optimizers.legacy, optimizer_class_names[optimizer_key]
    )

    if vae:
        if act_func == "softmax":
            loss_func = (
                NeuralNetworkMethods.make_masked_categorical_crossentropy
            )
        elif act_func == "sigmoid":
            loss_func = (
                NeuralNetworkMethods.make_masked_binary_crossentropy
            )
        else:
            raise ValueError(
                f"act_func must be either 'softmax' or 'sigmoid', but got {act_func}"
            )

        loss = loss_func()
        metrics = None

    else:
        # Non-VAE models use the stock Keras loss and accuracy metric.
        loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
        metrics = [tf.keras.metrics.CategoricalAccuracy()]

    return {
        "optimizer": opt,
        "loss": loss,
        "metrics": metrics,
        "run_eagerly": False,
    }
875
-
876
@staticmethod
def init_weights(dim1, dim2, w_mean=0, w_stddev=0.01):
    """Draw a random, normally distributed matrix.

    Args:
        dim1 (int): Size of first dimension.

        dim2 (int): Size of second dimension.

        w_mean (float, optional): Mean of normal distribution. Defaults to 0.

        w_stddev (float, optional): Standard deviation of normal distribution. Defaults to 0.01.

    Returns:
        numpy.ndarray: Array of shape (dim1, dim2) sampled with ``np.random.normal``.
    """
    shape = (dim1, dim2)
    weights = np.random.normal(loc=w_mean, scale=w_stddev, size=shape)
    return weights
891
-
892
def reset_seeds(self):
    """Re-seed the NumPy, ``random``, and TensorFlow generators.

    Three new seeds are drawn from NumPy's global generator (in the order
    NumPy, ``random``, TensorFlow) before any generator is re-seeded.
    """
    np_seed, py_seed, tf_seed = (
        np.random.randint(1, 1e6) for _ in range(3)
    )

    np.random.seed(np_seed)
    random.seed(py_seed)

    # The seeding entry point depends on the TensorFlow major version.
    is_tf2 = tf.__version__[0] == "2"
    if is_tf2:
        tf.random.set_seed(tf_seed)
    else:
        tf.set_random_seed(tf_seed)
903
-
904
@staticmethod
def masked_mse(self, X_true, X_pred, mask):
    """Compute mean squared error over the positions selected by ``mask``.

    NOTE(review): this is declared as a static method yet still takes
    ``self`` as its first positional parameter, so callers must pass a
    placeholder for it. Left unchanged to keep the interface stable.

    Args:
        self: Unused placeholder argument.
        X_true (numpy.ndarray): One-hot encoded input data.
        X_pred (numpy.ndarray): Predicted values.
        mask (numpy.ndarray): Mask used to index both arrays; only the
            selected entries contribute to the error.

    Returns:
        float: Mean squared error calculation.
    """
    selected_true = X_true[mask]
    selected_pred = X_pred[mask]
    squared_error = np.square(np.subtract(selected_true, selected_pred))
    return squared_error.mean()
917
-
918
@staticmethod
def make_masked_binary_accuracy(class_weight=None, is_vae=True):
    """Build a binary accuracy metric function.

    Args:
        class_weight (Dict[int, float], optional): Not used by the returned function. Defaults to None.

        is_vae (bool, optional): Not used by the returned function. Defaults to True.

    Returns:
        callable: ``tf.function`` that calculates binary accuracy.
    """

    @tf.function
    def masked_binary_accuracy(y_true, y_pred, sample_weight=None):
        """Binary accuracy metric for the neural network models.

        Inputs are forwarded unchanged to ``ba`` (defined elsewhere in
        this module; presumably a Keras binary-accuracy metric, confirm
        at its definition). No masking is applied here.

        Args:
            y_true (tensorflow.Tensor): Input multilabel encoded 3D tensor.
            y_pred (tensorflow.Tensor): Predicted values from model.
            sample_weight (numpy.ndarray): 2D matrix of sample weights.

        Returns:
            float: Binary accuracy value.
        """
        return ba(y_true, y_pred, sample_weight=sample_weight)

    return masked_binary_accuracy
952
-
953
@staticmethod
def make_masked_binary_crossentropy(class_weight=None, is_vae=True):
    """Build a binary crossentropy loss function.

    Args:
        class_weight (Dict[int, float], optional): Not used by the returned function. Defaults to None.

        is_vae (bool, optional): Not used by the returned function. Defaults to True.

    Returns:
        callable: ``tf.function`` that calculates binary crossentropy loss.
    """

    @tf.function
    def masked_binary_crossentropy(y_true, y_pred, sample_weight=None):
        """Binary crossentropy loss for the neural network models.

        Inputs are forwarded unchanged to ``bce`` (defined elsewhere in
        this module; presumably a Keras binary-crossentropy loss, confirm
        at its definition). No masking is applied here.

        Args:
            y_true (tensorflow.tensor): Input one-hot encoded 3D tensor.

            y_pred (tensorflow.tensor): Predicted values, should have undergone sigmoid activation.

            sample_weight (numpy.ndarray): 2D matrix of sample weights.

        Returns:
            float: Binary crossentropy loss value.
        """
        return bce(y_true, y_pred, sample_weight=sample_weight)

    return masked_binary_crossentropy
989
-
990
@staticmethod
def make_masked_categorical_accuracy():
    """Build a categorical accuracy metric function.

    Returns:
        callable: ``tf.function`` that calculates categorical accuracy.
    """

    @tf.function
    def masked_categorical_accuracy(y_true, y_pred, sample_weight=None):
        """Categorical accuracy metric for the neural network models.

        Inputs are forwarded unchanged to ``cca`` (defined elsewhere in
        this module; presumably a Keras categorical-accuracy metric,
        confirm at its definition).

        Args:
            y_true (tensorflow.tensor): Input one-hot encoded 3D tensor.
            y_pred (tensorflow.tensor): Predicted values.
            sample_weight (numpy.ndarray): 2D matrix of sample weights.

        Returns:
            float: Categorical accuracy value.
        """
        # NOTE: no boolean masking of missing values (encoded as -1) is
        # applied here; an earlier masking step was disabled, so y_true
        # and y_pred are passed through as-is.
        return cca(y_true, y_pred, sample_weight=sample_weight)

    return masked_categorical_accuracy
1032
-
1033
@staticmethod
def make_masked_categorical_crossentropy():
    """Build a categorical crossentropy loss function.

    Returns:
        callable: ``tf.function`` that calculates categorical crossentropy loss.
    """

    @tf.function
    def masked_categorical_crossentropy(
        y_true, y_pred, sample_weight=None
    ):
        """Categorical crossentropy loss for the neural network models.

        Inputs are forwarded unchanged to ``cce`` (defined elsewhere in
        this module; presumably a Keras categorical-crossentropy loss,
        confirm at its definition).

        Args:
            y_true (tensorflow.tensor): Input one-hot encoded 3D tensor.
            y_pred (tensorflow.tensor): Predicted values.
            sample_weight (numpy.ndarray): 2D matrix of sample weights.

        Returns:
            float: Categorical crossentropy loss value.
        """
        # NOTE: no boolean masking of missing values (encoded as -1) is
        # applied here; an earlier masking step was disabled, so y_true
        # and y_pred are passed through as-is.
        return cce(y_true, y_pred, sample_weight=sample_weight)

    return masked_categorical_crossentropy
1074
-
1075
@staticmethod
def kl_divergence(z_mean, z_log_var, kl_weight=0.5):
    """Compute the batch-averaged KL divergence term.

    The per-element term ``-0.5 * (1 + z_log_var - z_mean**2 -
    exp(z_log_var))`` is summed over the last axis and the result is
    averaged over the remaining axes.

    Args:
        z_mean (tensorflow.Tensor): Mean output of the encoder.
        z_log_var (tensorflow.Tensor): Log-variance output of the encoder.
        kl_weight (float, optional): Not used in the calculation. Defaults to 0.5.

    Returns:
        tensorflow.Tensor: Scalar KL divergence value.
    """
    per_element = -0.5 * (
        1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var)
    )
    per_sample = tf.reduce_sum(per_element, axis=-1)
    return tf.reduce_mean(per_sample)

    # TODO: Test the alternative formulation as well:
    # z_sigma = tf.math.exp(0.5 * z_log_var)
    # return tf.reduce_sum(
    #     tf.math.square(z_mean) + tf.math.square(z_sigma) - z_log_var - 1.0,
    #     axis=-1,
    # )
1089
-
1090
def make_reconstruction_loss(self):
    """Make loss function for use with a keras model.

    Returns:
        callable: Function that calculates loss.
    """

    def reconstruction_loss(input_and_mask, y_pred):
        """Mean squared error restricted to observed (non-missing) values.

        ``input_and_mask`` holds the true values in its first
        ``n_features`` columns and the missing-data mask in the following
        ``n_features`` columns, where ``n_features`` is the size of axis 1
        of ``y_pred``. Both the true values and the predictions are
        multiplied by ``1 - mask`` before the error is computed, so masked
        positions contribute zero error.

        Args:
            input_and_mask (numpy.ndarray or tensorflow.Tensor): True values and missing mask h-stacked along axis 1.

            y_pred (numpy.ndarray or tensorflow.Tensor): Predicted values.

        Returns:
            tensorflow.Tensor: Mean squared error with missing data masked.
        """
        # Read the static shape instead of calling ``y_pred.numpy()``:
        # ``.numpy()`` is only available on eager tensors, whereas
        # ``.shape`` works for eager tensors, graph tensors and ndarrays.
        n_features = y_pred.shape[1]

        true_indices = range(n_features)
        missing_indices = range(n_features, n_features * 2)

        # Split features and missing mask.
        y_true = tf.gather(input_and_mask, true_indices, axis=1)
        missing_mask = tf.gather(input_and_mask, missing_indices, axis=1)

        # Zero out the missing positions on both sides.
        observed_mask = tf.subtract(1.0, missing_mask)
        y_true_observed = tf.multiply(y_true, observed_mask)
        pred_observed = tf.multiply(y_pred, observed_mask)

        return tf.keras.metrics.mean_squared_error(
            y_true=y_true_observed, y_pred=pred_observed
        )

    return reconstruction_loss
1131
-
1132
@staticmethod
def normalize_data(data):
    """Min-max scale ``data`` so that its values lie between 0 and 1.

    Args:
        data (numpy.ndarray): Values to rescale.

    Returns:
        numpy.ndarray: ``(data - min) / (max - min)`` using the global
            minimum and maximum of ``data``.
    """
    lowest = np.min(data)
    value_range = np.max(data) - lowest
    return (data - lowest) / value_range
1136
-
1137
@staticmethod
def normalize_sum_to_1(d, target=1.0):
    """Rescale the values of a dictionary so that they sum to ``target``.

    Args:
        d (Dict[Any, float]): Mapping whose values are rescaled.
        target (float, optional): Desired sum of the values. Defaults to 1.0.

    Returns:
        Dict[Any, float]: New dictionary with the same keys and scaled values.
    """
    scale = target / sum(d.values())
    rescaled = {}
    for key, value in d.items():
        rescaled[key] = value * scale
    return rescaled
1141
-
1142
@staticmethod
def smooth_weights(d, mu=0.15):
    """Compute log-smoothed class weights from class counts.

    Each weight is ``log(mu * total / count)``, where ``total`` is the sum
    of all counts; weights that are not greater than 1.0 are set to 1.0.

    Args:
        d (Dict[Any, float]): Mapping of class label to count.
        mu (float, optional): Scaling factor applied to the total count. Defaults to 0.15.

    Returns:
        Dict[Any, float]: Mapping of class label to smoothed weight.
    """
    total = np.sum(list(d.values()))

    # max(1.0, score) yields score only when score > 1.0, else 1.0.
    return {
        label: max(1.0, math.log(mu * total / float(count)))
        for label, count in d.items()
    }
1153
-
1154
@classmethod
def get_class_weights(
    cls,
    y_true,
    original_missing_mask,
    user_weights=None,
    return_1d=False,
    method="auto",
):
    """Get class weights for each column in a 2D matrix.

    Args:
        y_true (numpy.ndarray): True target values (2D).

        original_missing_mask (numpy.ndarray): Boolean mask with missing values set to True and non-missing to False.

        user_weights (Dict[int, float], optional): Class weights if user-provided. When given, they take precedence over every other option and a 2D matrix is returned.

        return_1d (bool, optional): If True, returns a dictionary of class weights, with integer encodings as keys and the corresponding class weights as values. If False, returns 2D sample_weight matrix. Defaults to False.

        method (str, optional): Weighting scheme used when ``return_1d`` is True: "auto" (balanced weights via ``compute_class_weight``) or "logsmooth" (log-smoothed counts via ``smooth_weights``, with class -9 forced to weight 0.0). Defaults to "auto".

    Returns:
        numpy.ndarray or Dict[int, float]: Sample weights per column of shape (n_samples, n_features) if return_1d is False. Dictionary of class weights if True.

    Raises:
        ValueError: If ``return_1d`` is True and ``method`` is not "auto" or "logsmooth".
    """
    if user_weights is not None:
        # Set user-defined sample weights; unlisted classes stay at 0.
        sample_weight = np.zeros(y_true.shape)
        for label, weight in user_weights.items():
            sample_weight[y_true == label] = weight
        return sample_weight

    if return_1d:
        y_true_1d = y_true.flatten()

        if method == "auto":
            classes = np.unique(y_true_1d)
            balanced = compute_class_weight(
                "balanced",
                classes=classes,
                y=y_true_1d,
            )
            return dict(zip(classes, balanced))

        if method == "logsmooth":
            classes, counts = np.unique(y_true_1d, return_counts=True)
            class_counts = dict(zip(classes, counts))
            # The -9 code is excluded from smoothing; it may be absent, so
            # do not fail when there is nothing to remove.
            class_counts.pop(-9, None)
            smoothed = cls.smooth_weights(class_counts)
            smoothed[-9] = 0.0
            return smoothed

        # Previously an unknown method silently returned an all-zero
        # matrix instead of a weight dictionary.
        raise ValueError(
            f"method must be either 'auto' or 'logsmooth', but got {method}"
        )

    # Automatically get balanced class weights per column, computed from
    # the non-missing rows only.
    sample_weight = np.zeros(y_true.shape)
    for i in range(y_true.shape[1]):
        observed = ~original_missing_mask[:, i]
        classes = np.unique(y_true[observed, i])
        cw = compute_class_weight(
            "balanced",
            classes=classes,
            y=y_true[observed, i],
        )
        column_weights = dict(zip(classes, cw))

        # Only classes 0, 1 and 2 receive a weight; any other value keeps
        # a weight of 0.
        for j in range(3):
            if j in column_weights:
                sample_weight[y_true[:, i] == j, i] = column_weights[j]

    return sample_weight
1227
-
1228
@staticmethod
def write_gt_state_probs(
    y_pred,
    y_pred_1d,
    y_true,
    y_true_1d,
    nn_method,
    sim_missing_mask,
    original_missing_mask,
    prefix="imputer",
):
    """Write per-site genotype probabilities to CSV and plot agreement.

    For every site where missingness was simulated on an originally
    observed value, the probability vector, imputed genotype and expected
    genotype are recorded. All other sites get a zero placeholder. The
    table is written to
    ``{prefix}_output/logs/Unsupervised/{nn_method}/genotype_state_proba.csv``
    and a heatmap of imputed-vs-expected agreement is saved to
    ``{prefix}_output/plots/Unsupervised/{nn_method}/gt_state_proba.png``.
    Both output directories are created if they do not exist.

    Args:
        y_pred (numpy.ndarray): Predicted probability vectors; iterated as rows of per-site vectors.
        y_pred_1d (numpy.ndarray): Imputed genotype per site, flattened.
        y_true (numpy.ndarray): True genotypes; only its shape is used.
        y_true_1d (numpy.ndarray): True genotype per site, flattened.
        nn_method (str): Sub-directory name for the outputs.
        sim_missing_mask (numpy.ndarray): Boolean mask of simulated missing values.
        original_missing_mask (numpy.ndarray): Boolean mask of originally missing values.
        prefix (str, optional): Prefix of the output directory. Defaults to "imputer".
    """
    from matplotlib.colors import ListedColormap
    import matplotlib.patches as mpatches

    bin_mapping = np.array(
        [np.array2string(x) for row in y_pred for x in row]
    )
    bin_mapping = np.reshape(bin_mapping, y_pred_1d.shape)

    y_true_2d = np.reshape(y_true_1d, y_true.shape)
    bin_mapping_2d = np.reshape(bin_mapping, y_true.shape)
    y_pred_2d = np.reshape(y_pred_1d, y_true.shape)

    # Only sites that were simulated as missing AND originally observed
    # have a known expected genotype.
    include = np.logical_and(sim_missing_mask, ~original_missing_mask)

    placeholder = np.array2string(np.array([0.0, 0.0, 0.0]))

    gt_dist = []
    colors = []
    for yt, yp, ypd, mask in zip(
        y_true_2d,
        bin_mapping_2d,
        y_pred_2d,
        include,
    ):
        sites = {}
        row_colors = []
        for i, (yt_site, mask_site) in enumerate(zip(yt, mask)):
            key = f"Site Index {i},Probability Vector,Imputed Genotype,Expected Genotype"
            if mask_site:
                sites[key] = f"{i},{yp[i]},{ypd[i]},{yt_site}"
                row_colors.append(
                    "blue" if ypd[i] == yt_site else "orange"
                )
            else:
                sites[key] = f"{i},{placeholder},0,0"
                row_colors.append("gray")
        gt_dist.append(sites)
        colors.append(row_colors)

    log_dir = os.path.join(
        f"{prefix}_output", "logs", "Unsupervised", nn_method
    )
    plot_dir = os.path.join(
        f"{prefix}_output", "plots", "Unsupervised", nn_method
    )
    # Without this, to_csv/savefig fail when the directories are missing.
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)

    csv_path = os.path.join(log_dir, "genotype_state_proba.csv")

    gt_df = pd.DataFrame.from_records(gt_dist)
    gt_df.to_csv(csv_path, index=False, header=False)

    # Reload the data so each cell is the comma-joined record string.
    data = pd.read_csv(csv_path, header=None)

    # Fields 2 and 3 of each record are the imputed and expected genotypes.
    imputedGT_data = data.applymap(lambda x: int(x.split(",")[2]))
    expectedGT_data = data.applymap(lambda x: int(x.split(",")[3]))

    # True where the imputed and expected genotypes agree.
    mask = imputedGT_data == expectedGT_data

    fig = plt.figure(figsize=(12, 6))
    try:
        rgb_colors = sns.color_palette(
            [color for sublist in colors for color in sublist]
        )
        cmap = ListedColormap(rgb_colors)

        sns.heatmap(mask, cmap=cmap, cbar=False)

        plt.title("Expected Genotypes for Simulated Genotypes")
        plt.xlabel("Column Index")
        plt.ylabel("Row Index")

        # Custom legend matching the colors assigned above.
        agreement_patch = mpatches.Patch(color="blue", label="Agreement")
        disagreement_patch = mpatches.Patch(
            color="orange", label="Disagreement"
        )
        not_simulated_patch = mpatches.Patch(
            color="gray", label="Not Simulated"
        )

        plt.legend(
            handles=[
                agreement_patch,
                disagreement_patch,
                not_simulated_patch,
            ],
            loc="lower right",
        )

        outfile = os.path.join(plot_dir, "gt_state_proba.png")
        plt.savefig(outfile, bbox_inches="tight", facecolor="white")
    finally:
        # Release the figure so repeated calls do not accumulate open
        # figures.
        plt.close(fig)
1353
-
1354
- # @staticmethod
1355
- # def write_gt_state_probs(
1356
- # y_pred,
1357
- # y_pred_1d,
1358
- # y_true,
1359
- # y_true_1d,
1360
- # nn_method,
1361
- # sim_missing_mask,
1362
- # original_missing_mask,
1363
- # prefix="imputer",
1364
- # ):
1365
- # bin_mapping = np.array(
1366
- # [np.array2string(x) for row in y_pred for x in row]
1367
- # )
1368
-
1369
- # bin_mapping = np.reshape(bin_mapping, y_pred_1d.shape)
1370
-
1371
- # y_true_2d = np.reshape(y_true_1d, y_true.shape)
1372
- # bin_mapping_2d = np.reshape(bin_mapping, y_true.shape)
1373
- # y_pred_2d = np.reshape(y_pred_1d, y_true.shape)
1374
-
1375
- # gt_dist = list()
1376
- # for yt, yp, ypd in zip(y_true_2d, bin_mapping_2d, y_pred_2d):
1377
- # sites = dict()
1378
- # for i, yt_site in enumerate(yt):
1379
- # sites[
1380
- # f"Site Index {i},Probability Vector,Imputed Genotype,Expected Genotype"
1381
- # ] = f"{i},{yp[i]},{ypd[i]},{yt_site}"
1382
- # gt_dist.append(sites)
1383
-
1384
- # gt_df = pd.DataFrame.from_records(gt_dist)
1385
- # gt_df.to_csv(
1386
- # os.path.join(
1387
- # f"{prefix}_output",
1388
- # "logs",
1389
- # "Unsupervised",
1390
- # nn_method,
1391
- # "genotype_state_proba.csv",
1392
- # ),
1393
- # index=False,
1394
- # header=False,
1395
- # )