pg-sui 1.0.2.1__py3-none-any.whl → 1.6.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pg-sui might be problematic. Click here for more details.

Files changed (112) hide show
  1. {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info}/METADATA +51 -70
  2. pg_sui-1.6.8.dist-info/RECORD +78 -0
  3. {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info}/WHEEL +1 -1
  4. pg_sui-1.6.8.dist-info/entry_points.txt +4 -0
  5. pg_sui-1.6.8.dist-info/top_level.txt +1 -0
  6. pgsui/__init__.py +35 -54
  7. pgsui/_version.py +34 -0
  8. pgsui/cli.py +635 -0
  9. pgsui/data_processing/config.py +576 -0
  10. pgsui/data_processing/containers.py +1782 -0
  11. pgsui/data_processing/transformers.py +121 -1103
  12. pgsui/electron/app/__main__.py +5 -0
  13. pgsui/electron/app/icons/icons/1024x1024.png +0 -0
  14. pgsui/electron/app/icons/icons/128x128.png +0 -0
  15. pgsui/electron/app/icons/icons/16x16.png +0 -0
  16. pgsui/electron/app/icons/icons/24x24.png +0 -0
  17. pgsui/electron/app/icons/icons/256x256.png +0 -0
  18. pgsui/electron/app/icons/icons/32x32.png +0 -0
  19. pgsui/electron/app/icons/icons/48x48.png +0 -0
  20. pgsui/electron/app/icons/icons/512x512.png +0 -0
  21. pgsui/electron/app/icons/icons/64x64.png +0 -0
  22. pgsui/electron/app/icons/icons/icon.icns +0 -0
  23. pgsui/electron/app/icons/icons/icon.ico +0 -0
  24. pgsui/electron/app/main.js +189 -0
  25. pgsui/electron/app/package-lock.json +6893 -0
  26. pgsui/electron/app/package.json +50 -0
  27. pgsui/electron/app/preload.js +15 -0
  28. pgsui/electron/app/server.py +146 -0
  29. pgsui/electron/app/ui/logo.png +0 -0
  30. pgsui/electron/app/ui/renderer.js +130 -0
  31. pgsui/electron/app/ui/styles.css +59 -0
  32. pgsui/electron/app/ui/ui_shim.js +72 -0
  33. pgsui/electron/bootstrap.py +43 -0
  34. pgsui/electron/launch.py +59 -0
  35. pgsui/electron/package.json +14 -0
  36. pgsui/example_data/popmaps/{test.popmap → phylogen_nomx.popmap} +185 -99
  37. pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz +0 -0
  38. pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz.tbi +0 -0
  39. pgsui/impute/deterministic/imputers/allele_freq.py +691 -0
  40. pgsui/impute/deterministic/imputers/mode.py +679 -0
  41. pgsui/impute/deterministic/imputers/nmf.py +221 -0
  42. pgsui/impute/deterministic/imputers/phylo.py +971 -0
  43. pgsui/impute/deterministic/imputers/ref_allele.py +530 -0
  44. pgsui/impute/supervised/base.py +339 -0
  45. pgsui/impute/supervised/imputers/hist_gradient_boosting.py +293 -0
  46. pgsui/impute/supervised/imputers/random_forest.py +287 -0
  47. pgsui/impute/unsupervised/base.py +924 -0
  48. pgsui/impute/unsupervised/callbacks.py +89 -263
  49. pgsui/impute/unsupervised/imputers/autoencoder.py +972 -0
  50. pgsui/impute/unsupervised/imputers/nlpca.py +1264 -0
  51. pgsui/impute/unsupervised/imputers/ubp.py +1288 -0
  52. pgsui/impute/unsupervised/imputers/vae.py +957 -0
  53. pgsui/impute/unsupervised/loss_functions.py +158 -0
  54. pgsui/impute/unsupervised/models/autoencoder_model.py +208 -558
  55. pgsui/impute/unsupervised/models/nlpca_model.py +149 -468
  56. pgsui/impute/unsupervised/models/ubp_model.py +198 -1317
  57. pgsui/impute/unsupervised/models/vae_model.py +259 -618
  58. pgsui/impute/unsupervised/nn_scorers.py +215 -0
  59. pgsui/utils/classification_viz.py +591 -0
  60. pgsui/utils/misc.py +35 -480
  61. pgsui/utils/plotting.py +514 -824
  62. pgsui/utils/scorers.py +212 -438
  63. pg_sui-1.0.2.1.dist-info/RECORD +0 -75
  64. pg_sui-1.0.2.1.dist-info/top_level.txt +0 -3
  65. pgsui/example_data/phylip_files/test_n10.phy +0 -118
  66. pgsui/example_data/phylip_files/test_n100.phy +0 -118
  67. pgsui/example_data/phylip_files/test_n2.phy +0 -118
  68. pgsui/example_data/phylip_files/test_n500.phy +0 -118
  69. pgsui/example_data/structure_files/test.nopops.1row.10sites.str +0 -117
  70. pgsui/example_data/structure_files/test.nopops.2row.100sites.str +0 -234
  71. pgsui/example_data/structure_files/test.nopops.2row.10sites.str +0 -234
  72. pgsui/example_data/structure_files/test.nopops.2row.30sites.str +0 -234
  73. pgsui/example_data/structure_files/test.nopops.2row.allsites.str +0 -234
  74. pgsui/example_data/structure_files/test.pops.1row.10sites.str +0 -117
  75. pgsui/example_data/structure_files/test.pops.2row.10sites.str +0 -234
  76. pgsui/example_data/trees/test.iqtree +0 -376
  77. pgsui/example_data/trees/test.qmat +0 -5
  78. pgsui/example_data/trees/test.rate +0 -2033
  79. pgsui/example_data/trees/test.tre +0 -1
  80. pgsui/example_data/trees/test_n10.rate +0 -19
  81. pgsui/example_data/trees/test_n100.rate +0 -109
  82. pgsui/example_data/trees/test_n500.rate +0 -509
  83. pgsui/example_data/trees/test_siterates.txt +0 -2024
  84. pgsui/example_data/trees/test_siterates_n10.txt +0 -10
  85. pgsui/example_data/trees/test_siterates_n100.txt +0 -100
  86. pgsui/example_data/trees/test_siterates_n500.txt +0 -500
  87. pgsui/example_data/vcf_files/test.vcf +0 -244
  88. pgsui/example_data/vcf_files/test.vcf.gz +0 -0
  89. pgsui/example_data/vcf_files/test.vcf.gz.tbi +0 -0
  90. pgsui/impute/estimators.py +0 -735
  91. pgsui/impute/impute.py +0 -1486
  92. pgsui/impute/simple_imputers.py +0 -1439
  93. pgsui/impute/supervised/iterative_imputer_fixedparams.py +0 -785
  94. pgsui/impute/supervised/iterative_imputer_gridsearch.py +0 -1027
  95. pgsui/impute/unsupervised/keras_classifiers.py +0 -702
  96. pgsui/impute/unsupervised/models/in_development/cnn_model.py +0 -486
  97. pgsui/impute/unsupervised/neural_network_imputers.py +0 -1424
  98. pgsui/impute/unsupervised/neural_network_methods.py +0 -1549
  99. pgsui/pg_sui.py +0 -261
  100. pgsui/utils/sequence_tools.py +0 -407
  101. simulation/sim_benchmarks.py +0 -333
  102. simulation/sim_treeparams.py +0 -475
  103. test/__init__.py +0 -0
  104. test/pg_sui_simtest.py +0 -215
  105. test/pg_sui_testing.py +0 -523
  106. test/test.py +0 -297
  107. test/test_pgsui.py +0 -374
  108. test/test_tkc.py +0 -214
  109. {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info/licenses}/LICENSE +0 -0
  110. /pgsui/{example_data/trees → electron/app}/__init__.py +0 -0
  111. /pgsui/impute/{unsupervised/models/in_development → supervised/imputers}/__init__.py +0 -0
  112. {simulation → pgsui/impute/unsupervised/imputers}/__init__.py +0 -0
@@ -1,1549 +0,0 @@
1
- import logging
2
- import math
3
- import os
4
- import sys
5
- import random
6
- import warnings
7
-
8
- warnings.simplefilter(action="ignore", category=FutureWarning)
9
-
10
-
11
- from pathlib import Path
12
-
13
- import numpy as np
14
- import pandas as pd
15
- import seaborn as sns
16
- import matplotlib.pyplot as plt
17
-
18
- # Create a custom legend
19
- import matplotlib.patches as mpatches
20
- import matplotlib.colors as mcolors
21
- from matplotlib.colors import ListedColormap
22
-
23
- from sklearn.utils.class_weight import (
24
- compute_class_weight,
25
- )
26
-
27
- from sklearn.metrics import f1_score
28
-
29
- os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
30
- logging.getLogger("tensorflow").disabled = True
31
- warnings.filterwarnings("ignore", category=UserWarning)
32
-
33
- # noinspection PyPackageRequirements
34
- import tensorflow as tf
35
-
36
- # Suppress "can't find CUDA .dll" errors. This also turns off GPU support.
37
- tf.config.set_visible_devices([], "GPU")
38
-
39
- from tensorflow.python.util import deprecation
40
-
41
- # Disable warnings and info logs.
42
- tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
43
- tf.get_logger().setLevel(logging.ERROR)
44
-
45
-
46
- # Monkey-patch the deprecation utils to suppress warnings.
47
- # noinspection PyUnusedLocal
48
def deprecated(
    date, instructions, warn_once=True
):  # pylint: disable=unused-argument
    """Build a decorator that returns the decorated callable unchanged.

    All three arguments are accepted and ignored; the decorator produced here
    adds no wrapper and emits no warning.

    Args:
        date: Ignored.
        instructions: Ignored.
        warn_once (bool, optional): Ignored. Defaults to True.

    Returns:
        Callable: A decorator that hands back the function it receives.
    """

    def _passthrough(func):
        # Identity decorator: no wrapping, no warning.
        return func

    return _passthrough
55
-
56
-
57
- deprecation.deprecated = deprecated
58
-
59
-
60
class DisabledCV:
    """Cross-validation stand-in that performs no real splitting.

    It produces exactly one "split" in which both the train and the test
    indices span the whole input, so code that expects a splitter object can
    run without cross-validation.
    """

    def __init__(self):
        # Exactly one (train, test) pair is ever produced.
        self.n_splits = 1

    def split(self, X, y=None, groups=None):
        """Yield a single (train_indices, test_indices) pair.

        Args:
            X: Sized collection of samples; its length defines the train indices.
            y: Sized collection of targets; its length defines the test
                indices. When None, the length of ``X`` is used instead.
                Defaults to None.
            groups: Ignored. Defaults to None.

        Yields:
            Tuple[numpy.ndarray, numpy.ndarray]: ``np.arange(len(X))`` and
            ``np.arange(len(y))``.
        """
        n_train = len(X)
        # Fall back to X when no targets are supplied instead of failing on len(None).
        n_test = n_train if y is None else len(y)
        yield (np.arange(n_train), np.arange(n_test))

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of splits (always ``self.n_splits``).

        Args:
            X: Ignored. Defaults to None.
            y: Ignored. Defaults to None.
            groups: Ignored. Defaults to None.

        Returns:
            int: The value of ``self.n_splits``.
        """
        return self.n_splits
69
-
70
-
71
- # For VAE.
72
- # Necessary to initialize outside of class for use with tf.function decorator.
73
- cce = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
74
- cca = tf.keras.metrics.CategoricalAccuracy()
75
- ba = tf.keras.metrics.BinaryAccuracy()
76
- bce = tf.keras.losses.BinaryCrossentropy()
77
-
78
-
79
- class NeuralNetworkMethods:
80
- """Methods common to all neural network imputer classes and loss functions"""
81
-
82
- def __init__(self):
83
- self.data = None
84
-
85
- @staticmethod
86
- def encode_multilab(X, num_classes=4):
87
- """Encode 0-9 integer data in one-hot format.
88
- Args:
89
- X (numpy.ndarray): Input array with 012-encoded data and -9 as the missing data value.
90
-
91
- num_classes (int, optional): Number of multi-label classes to use. Mostly for compatibility with encode_multiclass. Defaults to 4.
92
- Returns:
93
- pandas.DataFrame: One-hot encoded data, ignoring missing values (np.nan). multi-label categories will be encoded as 0.5. Otherwise, it will be 1.0.
94
- """
95
- # return np.where(X >= 0.5, 1.0, 0.0)
96
- try:
97
- Xt = np.zeros(shape=(X.shape[0], X.shape[1], 4))
98
- except IndexError:
99
- Xt = np.zeros(shape=(X.shape[0],))
100
-
101
- mappings = {
102
- 0: [1.0, 0.0, 0.0, 0.0],
103
- 1: [0.0, 1.0, 0.0, 0.0],
104
- 2: [0.0, 0.0, 1.0, 0.0],
105
- 3: [0.0, 0.0, 0.0, 1.0],
106
- 4: [1.0, 1.0, 0.0, 0.0],
107
- 5: [1.0, 0.0, 1.0, 0.0],
108
- 6: [1.0, 0.0, 0.0, 1.0],
109
- 7: [0.0, 1.0, 1.0, 0.0],
110
- 8: [0.0, 1.0, 0.0, 1.0],
111
- 9: [0.0, 0.0, 1.0, 1.0],
112
- -9: [np.nan, np.nan, np.nan, np.nan],
113
- }
114
- try:
115
- for row in np.arange(X.shape[0]):
116
- Xt[row] = [mappings[enc] for enc in X[row]]
117
- except TypeError:
118
- Xt = [mappings[enc] for enc in X]
119
-
120
- if not isinstance(Xt, np.ndarray):
121
- Xt = np.array(Xt)
122
- return Xt
123
-
124
- @staticmethod
125
- def encode_multiclass(X, num_classes=10, missing_value=-9):
126
- """Encode 0-9 integer data in multi-class one-hot format.
127
-
128
- Missing values get encoded as ``[np.nan] * num_classes``
129
- Args:
130
- X (numpy.ndarray): Input array with 012-encoded data and ``missing_value`` as the missing data value.
131
-
132
- num_classes (int, optional): Number of classes to use. Defaults to 10.
133
-
134
- missing_value (int, optional): Missing data value to replace with ``[np.nan] * num_classes``\. Defaults to -9.
135
- Returns:
136
- pandas.DataFrame: Multi-class one-hot encoded data, ignoring missing values (np.nan).
137
- """
138
- int_cats, ohe_arr = np.arange(num_classes), np.eye(num_classes)
139
- mappings = dict(zip(int_cats, ohe_arr))
140
- mappings[missing_value] = np.array([np.nan] * num_classes)
141
-
142
- try:
143
- Xt = np.zeros(shape=(X.shape[0], X.shape[1], num_classes))
144
- except IndexError:
145
- Xt = np.zeros(shape=(X.shape[0],))
146
-
147
- try:
148
- for row in np.arange(X.shape[0]):
149
- Xt[row] = [mappings[enc] for enc in X[row]]
150
- except TypeError:
151
- Xt = [mappings[enc] for enc in X]
152
-
153
- if not isinstance(Xt, np.ndarray):
154
- Xt = np.array(Xt)
155
-
156
- return Xt
157
-
158
- @classmethod
159
- def decode_masked(
160
- cls,
161
- y_true_bin,
162
- y_pred_proba,
163
- is_multiclass=True,
164
- return_proba=False,
165
- return_multilab=False,
166
- return_int=True,
167
- predict_still_missing=True,
168
- threshold_increment=0.1,
169
- multilabel_averaging="macro",
170
- missing_mask=None,
171
- ):
172
- """Evaluate model predictions by decoding from one-hot encoding to integer-encoded format.
173
-
174
- Gets the index of the highest predicted value to obtain the integer encodings or integer encodings.
175
-
176
- Calucalates highest predicted value for each row vector and each class, setting the most likely class to 1.0.
177
-
178
- Args:
179
- y_true_bin (numpy.ndarray): True multilabel target values of shape (n_samples * n_features, num_classes). Array should be flattened and masked.
180
-
181
- y_pred_proba (numpy.ndarray): Multilabel model predictions of shape (n_samples * n_features, num_classes). Array should be flattened and masked.
182
-
183
- is_multiclass (bool, optional): True if using multiclass data with softmax activation. False if using multilabel data with sigmoid activation. Defaults to True.
184
-
185
- threshold (float, optional): If using multilabel, then set the threshold for determining 1 or 0 predictions. Defaults to 0.5.
186
-
187
- return_proba (bool, optional): If True, returns probabilities for unresolved values where all multilabel probabilities were below the threshold. Defaults to False.
188
-
189
- return_multilab (bool, optional): If True, returns the multilabel encodings instead of integer encodings (if doing multilabel classification). Defaults to False.
190
-
191
- return_int (bool, optional): If True, returns the integer encodings instead of onehot encodings (if doing multiclass classification). Defaults to False.
192
-
193
- predict_still_missing (bool, optional): If True, values that are still missing after decoding are decoded using the maximum probability (i.e., with np.argmax). If False, then it is possible that some missing data might still remain after decoding if none of the multilabel probabilities are above the threshold. Defaults to True.
194
-
195
- threshold_increment (float, optional): How much to increment threshold when searching for optimal threshold. Should be > 0 and < 1. Defaults to 0.05.
196
-
197
- multilabel_averaging (str): Method to use for averaging F1 score among multilabel classes. Supported options are: {"macro", "micro", "weighted", "samples"}. Defaults to "macro".
198
-
199
- missing_mask (numpy.ndarray, optional): Missing mask with missing values encoded as 1's and nonmissing as 0. Only used if not None. Defaults to None.
200
-
201
- Returns:
202
- numpy.ndarray: Imputed integer-encoded values.
203
-
204
- numpy.ndarray (optional): Probabilities for each call, with those above the threshold set to 1.0 and those below the threshold between 0 and 1.
205
- """
206
-
207
- if return_int and return_multilab:
208
- raise ValueError(
209
- "return_int and return_multilab cannot both be True."
210
- )
211
-
212
- y_unresolved_certainty = None
213
- if is_multiclass or y_true_bin.shape[-1] == 10:
214
- # Softmax predictions.
215
- # If reduce_dim is True, will return integer encodings.
216
- # Otherwise, returns one-hot encodings.
217
- y_pred = cls.decode_multiclass(y_pred_proba, reduce_dim=return_int)
218
- else:
219
- # Onehot encode if not already one-hot encoded.
220
- if y_true_bin.shape[-1] != 4:
221
- if y_true_bin.shape[-1] != 10:
222
- y_true_bin = cls.encode_multilab(y_true_bin)
223
- else:
224
- y_true_bin = cls.encode_multiclass(y_true_bin)
225
-
226
- pred_multilab = cls.zero_extra_categories(y_pred_proba)
227
-
228
- # Binary multilabel predictions.
229
- threshold = cls.get_optimal_threshold(
230
- y_true_bin,
231
- pred_multilab,
232
- increment=threshold_increment,
233
- average_method=multilabel_averaging,
234
- )
235
-
236
- # Call 0s and 1s based on threshold.
237
- pred_multilab = np.where(pred_multilab >= threshold, 1.0, 0.0)
238
-
239
- pred_multilab_decoded = cls.decode_binary_multilab(pred_multilab)
240
-
241
- if predict_still_missing:
242
- # Check if there are still any missing values.
243
- still_missing = np.all(pred_multilab == 0, axis=-1)
244
-
245
- if return_multilab:
246
- still_missing_bin = np.all(
247
- pred_multilab == 0, axis=-1, keepdims=True
248
- )
249
-
250
- # Do multiclass prediction with argmax then get the probabilities
251
- # if any unresolved values.
252
- if np.any(still_missing):
253
- # Get the argmax with the highest probability if
254
- # all classes are below threshold.
255
- y_multi = cls.decode_multiclass(y_pred_proba)
256
- y_multi_bin = cls.decode_multiclass(
257
- y_pred_proba, reduce_dim=False
258
- )
259
-
260
- try:
261
- y_pred = np.where(
262
- still_missing, y_multi, pred_multilab_decoded
263
- )
264
- except ValueError:
265
- y_pred = np.where(
266
- still_missing,
267
- y_multi,
268
- np.reshape(
269
- pred_multilab_decoded, still_missing.shape
270
- ),
271
- )
272
-
273
- if return_multilab:
274
- y_pred_bin = np.where(
275
- still_missing_bin, y_multi_bin, pred_multilab
276
- )
277
-
278
- y_pred = y_pred_bin
279
-
280
- if return_proba:
281
- # Get max value as base call.
282
- y_pred_proba_max = y_pred_proba.max(axis=-1)
283
-
284
- # Get probability of max value that was < threshold.
285
- y_unresolved_certainty = np.where(
286
- still_missing, y_pred_proba_max, 1.0
287
- )
288
-
289
- else:
290
- if return_multilab:
291
- y_pred = pred_multilab
292
- else:
293
- y_pred = pred_multilab_decoded
294
- else:
295
- if return_multilab:
296
- y_pred = pred_multilab
297
- else:
298
- y_pred = pred_multilab_decoded
299
-
300
- y_pred = y_pred.astype(int)
301
-
302
- if return_proba:
303
- return y_pred, y_unresolved_certainty
304
- else:
305
- return y_pred
306
-
307
@classmethod
def get_optimal_threshold(
    cls,
    y_true_bin,
    y_pred_proba,
    increment=0.01,
    average_method="macro",
):
    """Search for the decoding threshold with the best weighted F1 score.

    Every threshold in ``np.arange(increment, 1, increment)`` is applied to
    the predictions; the thresholded calls and the true values are both
    decoded with ``decode_binary_multilab`` and compared with ``f1_score``.

    Args:
        y_true_bin (numpy.ndarray): True multilabel values of shape (n_samples * n_features, num_classes).

        y_pred_proba (numpy.ndarray): Multilabel prediction probabilities of shape (n_features * n_samples, num_classes).

        increment (float, optional): How much to increment when searching for the optimal threshold. Must be > 0 and < 1. Defaults to 0.01.

        average_method (str, optional): Accepted for interface compatibility. NOTE(review): it is not forwarded to ``f1_score``, which is always called with ``average="weighted"``; forwarding it would change the thresholds that get selected, so confirm the intended behavior before wiring it up. Defaults to "macro".

    Returns:
        float: The first threshold that achieves the highest score.

    Raises:
        ValueError: If ``increment`` is not strictly between 0 and 1.
    """
    if not 0 < increment < 1:
        # Fail early with a clear message; otherwise the search grid is
        # empty or cannot be built at all.
        raise ValueError(
            f"increment must be > 0 and < 1, but got {increment}"
        )

    thresholds = np.arange(increment, 1, increment)

    y_true = y_true_bin
    y_pred = y_pred_proba

    # Drop entries flagged with -1 so they do not reach f1_score.
    # NOTE(review): indexing with the tuple returned by np.where selects
    # individual elements, so both arrays come out one-dimensional here;
    # confirm this is intended rather than a per-row mask.
    nonmissing_mask = np.where(y_true_bin != -1)
    try:
        y_true = y_true[nonmissing_mask]
        y_pred = y_pred[nonmissing_mask]
    except IndexError:
        # Shapes that cannot be indexed with the mask are scored unfiltered.
        pass

    # The decoded truth does not depend on the threshold; compute it once.
    true_multilab_decoded = cls.decode_binary_multilab(y_true)

    scores = [
        f1_score(
            true_multilab_decoded,
            cls.decode_binary_multilab(np.where(y_pred >= t, 1.0, 0.0)),
            average="weighted",
        )
        for t in thresholds
    ]

    return thresholds[np.argmax(scores)]
368
-
369
- @classmethod
370
- def flatten_bin_encodings(cls, y):
371
- """Flatten first two dimensions of binary encodings to (num_samples * num_features, num_classes).
372
-
373
- Args:
374
- y (numpy.ndarray): Numpy array with 3-dimensional shape of (n_samples, num_features, num_classes).
375
-
376
- Returns:
377
- numpy.ndarray: Array of shape (n_samples * num_features, num_classes).
378
-
379
- Raises:
380
- ValueError: Input shape must be 3-dimensional.
381
- """
382
- if len(y.shape) != 3:
383
- raise ValueError("Input array must be 3-dimensional")
384
-
385
- return y.reshape(y.shape[0] * y.shape[1], y.shape[2])
386
-
387
- @staticmethod
388
- def zero_extra_categories(y_pred_proba, threshold=0.5):
389
- """Check if any prediction probabilities have >2 values above threshold.
390
-
391
- If >2, then it sets the two with the lowest probabilities to 0.0.
392
-
393
- Args:
394
- y_pred_proba (numpy.ndarray): Prediction probabilities (sigmoid activation) of shape (n_samples, n_features, num_classes) or (n_samples * n_features, num_classes).
395
-
396
- pred_multilab (numpy.ndarray): Multi-label decodings. Inner arrays should have only 0s and 1s. Should be of shape (n_samples, n_features, num_classes) or (n_samples * n_features, num_classes).
397
-
398
- threshold (float, optional): Threshold to use to set decoded multilabel values to 0s (< threshold) or 1s (>= threshold). Defaults to 0.5.
399
- """
400
- N = 2
401
- y_pred_proba[y_pred_proba.argsort().argsort() < N] = 0.0
402
- return y_pred_proba
403
- # idx = np.argpartition(y_pred_proba.ravel(), k)
404
- # indices = tuple(
405
- # np.array(np.unravel_index(idx, y_pred_proba.shape))[
406
- # :, range(min(k, 0), max(k, 0))
407
- # ]
408
- # )
409
-
410
- # y_pred_proba[indices] = 0.0
411
- # return y_pred_proba
412
-
413
- # return np.where(y_pred_proba >= threshold, 1.0, 0.0)
414
-
415
- @classmethod
416
- def decode_multiclass(cls, y_pred_proba, reduce_dim=True):
417
- """Decode probabilities to either one-hot or integer encodings.
418
-
419
- Args:
420
- y_pred_proba (numpy.ndarray): Probabilities to decode.
421
-
422
- reduce_dim (bool, optional): If True, returns integer encodings of one fewer dimension than ``y_pred_proba``\. Otherwise, returns one-hot encodings where the class with the maximum probability is a 1 and every other class is 0. Defaults to True.
423
-
424
- Returns:
425
- numpy.ndarray: Integer or one-hot-encoded predictions.
426
- """
427
- yt = np.apply_along_axis(cls.mle, axis=-1, arr=y_pred_proba)
428
- if reduce_dim:
429
- return np.argmax(yt, axis=-1)
430
- else:
431
- return yt
432
-
433
- @classmethod
434
- def decode_binary_multilab(cls, y_pred):
435
- """Decode multi-label sigmoid probabilities to integer encodings.
436
-
437
- The predictions should have already undergone sigmoid activation and should be probabilities.
438
-
439
- If sigmoid activation output is >0.5, gets encoded as 1.0; else 0.0. If more than one category is > 0.5, then it is a heterozygote.
440
-
441
- Args:
442
- y_pred (numpy.ndarray): Model predictions of shape (n_samples * n_features, num_classes) or (n_samples, n_features, num_classes). A threshold should already have been applied to set each class to 0 or 1.
443
-
444
- Returns:
445
- numpy.ndarray: Integer-decoded multilabel predictions of shape (n_samples * n_features) or (n_samples, n_features).
446
- """
447
- y_pred_idx = y_pred.astype(int)
448
- y_pred_idx = y_pred_idx.astype(str)
449
-
450
- if len(y_pred_idx.shape) < 3:
451
- y_pred_idx = np.array(
452
- [
453
- "".join(np.atleast_1d(row == "1").nonzero()[0].astype(str))
454
- for row in y_pred_idx
455
- ]
456
- )
457
- else:
458
- y_pred_idx = np.array(
459
- [
460
- "".join(np.atleast_1d(col == "1").nonzero()[0].astype(str))
461
- for row in y_pred_idx
462
- for col in row
463
- ]
464
- )
465
-
466
- try:
467
- Xt = np.zeros(shape=(y_pred.shape[0], y_pred.shape[1], 4))
468
- except IndexError:
469
- Xt = np.zeros(shape=(y_pred.shape[0],))
470
-
471
- mappings = {
472
- "0": 0,
473
- "1": 1,
474
- "2": 2,
475
- "3": 3,
476
- "01": 4,
477
- "02": 5,
478
- "03": 6,
479
- "12": 7,
480
- "13": 8,
481
- "23": 9,
482
- "-9": -9,
483
- "": -9,
484
- }
485
-
486
- Xt = [mappings[enc] for enc in y_pred_idx]
487
-
488
- if not isinstance(Xt, np.ndarray):
489
- Xt = np.array(Xt)
490
- return Xt
491
-
492
- @staticmethod
493
- def encode_categorical(X):
494
- """Encode -9 encoded missing values as np.nan.
495
-
496
- Args:
497
- X (numpy.ndarray): 012-encoded genotypes with -9 as missing values.
498
-
499
- Returns:
500
- pandas.DataFrame: DataFrame with missing values encoded as np.nan.
501
- """
502
- np.nan_to_num(X, copy=False, nan=-9.0)
503
- X = X.astype(str)
504
- X[(X == "-9.0") | (X == "-9")] = "none"
505
-
506
- df = pd.DataFrame(X)
507
- df_incomplete = df.copy()
508
-
509
- # Replace 'none' with np.nan
510
- for row in df.index:
511
- for col in df.columns:
512
- if df_incomplete.iat[row, col] == "none":
513
- df_incomplete.iat[row, col] = np.nan
514
-
515
- return df_incomplete
516
-
517
- @staticmethod
518
- def mle(row):
519
- """Get the Maximum Likelihood Estimation for the best prediction. Basically, it sets the index of the maxiumum value in a vector (row) to 1.0, since it is one-hot encoded.
520
-
521
- Args:
522
- row (numpy.ndarray(float)): Row vector with predicted values as floating points.
523
-
524
- Returns:
525
- numpy.ndarray(float): Row vector with the highest prediction set to 1.0 and the others set to 0.0.
526
- """
527
- res = np.zeros(row.shape[0])
528
- res[np.argmax(row)] = 1
529
- return res
530
-
531
- @classmethod
532
- def predict(cls, X, complete_encoded):
533
- """Evaluate VAE predictions by calculating the highest predicted value.
534
-
535
- Calucalates highest predicted value for each row vector and each class, setting the most likely class to 1.0.
536
-
537
- Args:
538
- X (numpy.ndarray): Input 012-encoded data.
539
-
540
- complete_encoded (numpy.ndarray): Output one-hot encoded data with the maximum predicted values for each class set to 1.0.
541
-
542
- Returns:
543
- numpy.ndarray: Imputed one-hot encoded values.
544
-
545
- pandas.DataFrame: One-hot encoded pandas DataFrame with no missing values.
546
- """
547
-
548
- df = cls.encode_categorical(X)
549
-
550
- # Had to add dropna() to count unique classes while ignoring np.nan
551
- col_classes = [len(df[c].dropna().unique()) for c in df.columns]
552
- df_dummies = pd.get_dummies(df)
553
- mle_complete = None
554
- for i, cnt in enumerate(col_classes):
555
- start_idx = int(sum(col_classes[0:i]))
556
- col_completed = complete_encoded[:, start_idx : start_idx + cnt]
557
- mle_completed = np.apply_along_axis(
558
- cls.mle, axis=1, arr=col_completed
559
- )
560
-
561
- if mle_complete is None:
562
- mle_complete = mle_completed
563
-
564
- else:
565
- mle_complete = np.hstack([mle_complete, mle_completed])
566
- return mle_complete, df_dummies
567
-
568
def validate_hidden_layers(self, hidden_layer_sizes, num_hidden_layers):
    """Validate hidden_layer_sizes and verify that it is in the correct format.

    A single string or integer is repeated ``num_hidden_layers`` times. A
    list is accepted as-is, provided every element is a string or integer.

    Args:
        hidden_layer_sizes (str, int, List[str], or List[int]): Output units for all the hidden layers.

        num_hidden_layers (int): Number of hidden layers to use.

    Returns:
        List[int] or List[str]: List of hidden layer sizes.

    Raises:
        TypeError: If ``hidden_layer_sizes`` is not a string, an integer, or a list of strings and integers.

        AssertionError: If ``num_hidden_layers`` is not positive or does not equal the length of the resulting list.
    """
    if isinstance(hidden_layer_sizes, (str, int)):
        hidden_layer_sizes = [hidden_layer_sizes] * num_hidden_layers

    elif isinstance(hidden_layer_sizes, list):
        if not all(isinstance(x, (str, int)) for x in hidden_layer_sizes):
            ls = list({type(item) for item in hidden_layer_sizes})
            raise TypeError(
                f"Variable hidden_layer_sizes must either be None, "
                f"an integer or string, or a list of integers or "
                f"strings, but got the following type(s): {ls}"
            )

    else:
        raise TypeError(
            f"Variable hidden_layer_sizes must either be, "
            f"an integer, a string, or a list of integers or strings, "
            f"but got the following type: {type(hidden_layer_sizes)}"
        )

    # Raised explicitly (rather than with an assert statement) so the check
    # still runs when Python is started with -O; the exception type callers
    # see is unchanged.
    if num_hidden_layers != len(hidden_layer_sizes) or not num_hidden_layers > 0:
        raise AssertionError(
            "num_hidden_layers must be the length of hidden_layer_sizes."
        )

    return hidden_layer_sizes
607
-
608
def get_hidden_layer_sizes(self, n_dims, n_components, hl_func, vae=False):
    """Get dimensions of hidden layers.

    Starting from ``n_dims``, each entry of ``hl_func`` yields the next
    layer size. If a size drops to ``n_components`` or below, a warning is
    printed and the remaining entries are ignored.

    Args:
        n_dims (int): The number of feature dimensions (columns) (d).

        n_components (int): The number of reduced dimensions (t).

        hl_func (list): One entry per hidden layer. Each entry is either an integer (used as the size directly) or one of the strings "midpoint", "sqrt", "log2", which derive the size from the previous one.

        vae (bool, optional): If False, the returned list is reversed. Defaults to False.

    Returns:
        List[int]: Hidden layer sizes.

    Raises:
        TypeError: If ``hl_func`` is not a list.

        ValueError: If an entry of ``hl_func`` is neither an integer nor one of the supported strings.

        AssertionError: If a layer size is not > 0 and < ``n_dims``, or if no layer size could be set.
    """
    if not isinstance(hl_func, list):
        raise TypeError(
            f"hl_func must be of type list, but got {type(hl_func)}."
        )

    layers = []
    units = n_dims
    for func in hl_func:
        if func == "midpoint":
            units = round((units + n_components) / 2)
        elif func == "sqrt":
            units = round(math.sqrt(units))
        elif func == "log2":
            units = round(math.log(units, 2))
        elif isinstance(func, int):
            units = func
        else:
            raise ValueError(
                f"hidden_layer_sizes must be either integers or any of "
                f"the following strings: 'midpoint', "
                f"'sqrt', or 'log2', but got {func} of type {type(func)}"
            )

        if units <= n_components:
            print(
                f"WARNING: hidden_layer_size reduction became less than n_components. Using only {len(layers)} hidden layers."
            )
            break

        # Raised explicitly (rather than with an assert statement) so the
        # check still runs under -O; the exception type is unchanged.
        if not 0 < units < n_dims:
            raise AssertionError(
                f"The hidden layer sizes must be > 0 and < the number of "
                f"features (i.e., columns) in the dataset, but size was {units}"
            )

        layers.append(units)

    if not layers:
        raise AssertionError(
            "There was an error setting hidden layer sizes. Size list is empty. It is possible that the first 'sqrt' reduction caused units to be <= n_components."
        )

    if not vae:
        layers.reverse()

    return layers
670
-
671
def validate_model_inputs(self, y, missing_mask, output_shape):
    """Check that none of the model inputs is None.

    The arguments are checked in the order y, missing_mask, output_shape,
    and the first one found to be None triggers the error.

    Args:
        y: Value checked under the name "y".

        missing_mask: Value checked under the name "missing_mask".

        output_shape: Value checked under the name "output_shape".

    Raises:
        TypeError: y, missing_mask, output_shape must not be NoneType.
    """
    required = (
        ("y", y),
        ("missing_mask", missing_mask),
        ("output_shape", output_shape),
    )
    for arg_name, arg_value in required:
        if arg_value is None:
            raise TypeError(f"{arg_name} must not be NoneType.")
697
-
698
def prepare_training_batches(
    self,
    V,
    y,
    batch_size,
    batch_idx,
    trainable,
    n_components,
    sample_weight,
    missing_mask,
    ubp=True,
):
    """Slice out the current batch for the custom training loop.

    The batch covers rows ``[batch_idx * batch_size, (batch_idx + 1) *
    batch_size)`` of the inputs. The end is clamped to ``n_samples`` so the
    final, possibly shorter, batch still includes the last sample. (The
    previous ``n_samples - 1`` clamp silently dropped it because slice ends
    are exclusive.)

    Args:
        V (numpy.ndarray): Input to subset and refine. Only used when
            ``ubp`` is True, where it is sliced as ``V[start:end, :]``.
        y (numpy.ndarray): 2D target data; rows are samples.
        batch_size (int): Nominal batch size.
        batch_idx (int): Zero-based index of the current batch.
        trainable (bool): Whether the returned ``tf.Variable`` is trainable
            (UBP/NLPCA only).
        n_components (int): Number of columns of the ``tf.Variable`` created
            for the batch (UBP/NLPCA only).
        sample_weight (array-like or None): Per-sample weights, sliced along
            the first axis only, so 1-D and 2-D weights both work.
        missing_mask (numpy.ndarray): 2D mask aligned with ``y``.
        ubp (bool, optional): True for UBP/NLPCA, False for VAE.
            Defaults to True.

    Returns:
        tuple: If ``ubp`` is True, ``(v, y_true, sample_weight_batch,
        missing_mask_batch, batch_start, batch_end)`` where ``v`` is a
        ``tf.Variable`` holding the batch of ``V``. Otherwise
        ``(y_true, sample_weight_batch, missing_mask_batch)``.
    """
    n_samples = y.shape[0]

    batch_start = batch_idx * batch_size
    batch_end = (batch_idx + 1) * batch_size
    if batch_end > n_samples:
        # Slice ends are exclusive: clamp to n_samples, not n_samples - 1.
        batch_end = n_samples
        batch_size = batch_end - batch_start

    y_true = y[batch_start:batch_end, :]
    missing_mask_batch = missing_mask[batch_start:batch_end, :]

    # First-axis slicing works for 1-D and 2-D weights alike.
    sample_weight_batch = (
        None if sample_weight is None else sample_weight[batch_start:batch_end]
    )

    if not ubp:
        # VAE: the model consumes y directly, so V is not needed.
        return (
            y_true,
            sample_weight_batch,
            missing_mask_batch,
        )

    # UBP/NLPCA refines the input itself, so the batch of V is copied into a
    # fresh tf.Variable of matching shape.
    v_batch = V[batch_start:batch_end, :]
    v = tf.Variable(
        tf.zeros([batch_size, n_components]),
        trainable=trainable,
        dtype=tf.float32,
    )
    v.assign(v_batch)

    return (
        v,
        y_true,
        sample_weight_batch,
        missing_mask_batch,
        batch_start,
        batch_end,
    )
797
-
798
def validate_batch_size(self, X, batch_size):
    """Shrink ``batch_size`` until it no longer exceeds the sample count.

    While ``batch_size`` is greater than ``X.shape[0]`` it is halved with
    integer division, and a notice is printed for every halving.

    Args:
        X (numpy.ndarray): Input data; ``X.shape[0]`` is the sample count.
        batch_size (int): Requested batch size.

    Returns:
        int: The requested batch size, or the reduced one.
    """
    n_samples = X.shape[0]
    while batch_size > n_samples:
        print(
            "Batch size is larger than the number of samples. "
            "Dividing batch_size by 2."
        )
        batch_size = batch_size // 2
    return batch_size
818
-
819
def set_compile_params(
    self, optimizer, sample_weights=None, vae=False, act_func="softmax"
):
    """Assemble the keyword arguments for compiling a Keras model.

    Args:
        optimizer (str): Case-insensitive optimizer name. One of "adam",
            "sgd", "adagrad", "adadelta", "adamax", "ftrl", "nadam",
            "rmsprop".
        sample_weights (numpy.ndarray, optional): Accepted for interface
            compatibility; not used by this method. Defaults to None.
        vae (bool, optional): Whether the VAE model is used.
            Defaults to False.
        act_func (str, optional): "softmax" or "sigmoid". Only consulted
            when ``vae`` is True. Defaults to "softmax".

    Returns:
        Dict[str, Any]: Keys "optimizer" (the optimizer *class* from
        ``tf.keras.optimizers.legacy``, not an instance), "loss", "metrics"
        and "run_eagerly" (always False).

    Raises:
        ValueError: Unsupported optimizer specified.
        ValueError: Invalid act_func argument supplied (VAE only).
    """
    # Map user-facing names to attribute names in the legacy namespace.
    # The RMSprop class is spelled "RMSprop", not "RMSProp".
    legacy_optimizers = {
        "adam": "Adam",
        "sgd": "SGD",
        "adagrad": "Adagrad",
        "adadelta": "Adadelta",
        "adamax": "Adamax",
        "ftrl": "Ftrl",
        "nadam": "Nadam",
        "rmsprop": "RMSprop",
    }

    key = optimizer.lower()
    if key not in legacy_optimizers:
        # Previously this fell through and failed later with an
        # UnboundLocalError instead of the documented ValueError.
        raise ValueError(
            f"Unsupported optimizer: {optimizer}. Supported options are: "
            f"{', '.join(sorted(legacy_optimizers))}"
        )
    opt = getattr(tf.keras.optimizers.legacy, legacy_optimizers[key])

    if vae:
        if act_func == "softmax":
            loss_func = (
                NeuralNetworkMethods.make_masked_categorical_crossentropy
            )
        elif act_func == "sigmoid":
            loss_func = NeuralNetworkMethods.make_masked_binary_crossentropy
        else:
            raise ValueError(
                f"act_func must be either 'softmax' or 'sigmoid', but got {act_func}"
            )

        loss = loss_func()
        metrics = None

    else:
        loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
        metrics = [tf.keras.metrics.BinaryAccuracy()]

    return {
        "optimizer": opt,
        "loss": loss,
        "metrics": metrics,
        "run_eagerly": False,
    }
885
-
886
@staticmethod
def init_weights(dim1, dim2, w_mean=0, w_stddev=0.01):
    """Draw a ``(dim1, dim2)`` matrix of normally distributed values.

    Values come from NumPy's global random state, so results depend on the
    current ``np.random`` seed.

    Args:
        dim1 (int): Size of first dimension.
        dim2 (int): Size of second dimension.
        w_mean (float, optional): Mean of the normal distribution.
            Defaults to 0.
        w_stddev (float, optional): Standard deviation of the normal
            distribution. Defaults to 0.01.

    Returns:
        numpy.ndarray: Random array of shape ``(dim1, dim2)``.
    """
    shape = (dim1, dim2)
    return np.random.normal(w_mean, w_stddev, shape)
901
-
902
def reset_seeds(self):
    """Re-seed NumPy, ``random`` and TensorFlow with freshly drawn seeds.

    Three seeds are drawn from NumPy's current global random state before
    any generator is re-seeded. They are applied, in order, to
    ``np.random``, the stdlib ``random`` module and TensorFlow.
    """
    np_seed, py_seed, tf_seed = [
        np.random.randint(1, 1e6) for _ in range(3)
    ]

    np.random.seed(np_seed)
    random.seed(py_seed)

    # TensorFlow 2.x exposes the seeding function under tf.random; other
    # versions use the top-level name.
    if tf.__version__[0] != "2":
        tf.set_random_seed(tf_seed)
    else:
        tf.random.set_seed(tf_seed)
913
-
914
@staticmethod
def masked_mse(self, X_true, X_pred, mask):
    """Return the mean squared error over the entries selected by ``mask``.

    NOTE(review): this is declared ``@staticmethod`` but still lists
    ``self`` as its first parameter, so callers must pass a placeholder
    first argument explicitly. The signature is left unchanged here.

    Args:
        self: Unused placeholder (see note above).
        X_true (numpy.ndarray): One-hot encoded input data.
        X_pred (numpy.ndarray): Predicted values.
        mask (numpy.ndarray): Index mask applied to both arrays; only the
            selected entries contribute. Presumably boolean - TODO confirm
            against callers.

    Returns:
        float: Mean of the squared differences over the selected entries.
    """
    residual = np.subtract(X_true[mask], X_pred[mask])
    return np.square(residual).mean()
927
-
928
@staticmethod
def make_masked_binary_accuracy(class_weight=None, is_vae=True):
    """Build the binary accuracy metric callable.

    The returned callable forwards its arguments unchanged to the
    module-level ``ba`` function and is wrapped with ``tf.function``. No
    masking is applied in this wrapper itself.

    Args:
        class_weight (Dict[int, float], optional): Accepted for interface
            compatibility; not used here. Defaults to None.
        is_vae (bool, optional): Accepted for interface compatibility; not
            used here. Defaults to True.

    Returns:
        callable: ``f(y_true, y_pred, sample_weight=None)`` computing
        binary accuracy via ``ba``.
    """

    def masked_binary_accuracy(y_true, y_pred, sample_weight=None):
        """Compute binary accuracy for one batch.

        Args:
            y_true (tensorflow.Tensor): Input multilabel encoded 3D tensor.
            y_pred (tensorflow.Tensor): Predicted values from model.
            sample_weight (numpy.ndarray): 2D matrix of sample weights.

        Returns:
            Whatever ``ba`` returns for these inputs.
        """
        return ba(y_true, y_pred, sample_weight=sample_weight)

    return tf.function(masked_binary_accuracy)
962
-
963
@staticmethod
def make_masked_binary_crossentropy(class_weight=None, is_vae=True):
    """Build the binary crossentropy loss callable.

    The returned callable forwards its arguments unchanged to the
    module-level ``bce`` function and is wrapped with ``tf.function``. No
    masking is applied in this wrapper itself.

    Args:
        class_weight (Dict[int, float], optional): Accepted for interface
            compatibility; not used here. Defaults to None.
        is_vae (bool, optional): Accepted for interface compatibility; not
            used here. Defaults to True.

    Returns:
        callable: ``f(y_true, y_pred, sample_weight=None)`` computing
        binary crossentropy via ``bce``.
    """

    def masked_binary_crossentropy(y_true, y_pred, sample_weight=None):
        """Compute binary crossentropy for one batch.

        Args:
            y_true (tensorflow.tensor): Input one-hot encoded 3D tensor.
            y_pred (tensorflow.tensor): Predicted values, should have
                undergone sigmoid activation.
            sample_weight (numpy.ndarray): 2D matrix of sample weights.

        Returns:
            Whatever ``bce`` returns for these inputs.
        """
        return bce(y_true, y_pred, sample_weight=sample_weight)

    return tf.function(masked_binary_crossentropy)
999
-
1000
@staticmethod
def make_masked_categorical_accuracy():
    """Build the categorical accuracy metric callable.

    The returned callable forwards its arguments unchanged to the
    module-level ``cca`` function and is wrapped with ``tf.function``.

    Returns:
        callable: ``f(y_true, y_pred, sample_weight=None)`` computing
        categorical accuracy via ``cca``.
    """

    def masked_categorical_accuracy(y_true, y_pred, sample_weight=None):
        """Compute categorical accuracy for one batch.

        Args:
            y_true (tensorflow.tensor): Input one-hot encoded 3D tensor.
            y_pred (tensorflow.tensor): Predicted values.
            sample_weight (numpy.ndarray): 2D matrix of sample weights.

        Returns:
            Whatever ``cca`` returns for these inputs.
        """
        # An earlier variant dropped entries whose y_true was all -1 with
        # tf.boolean_mask before calling cca; that masking is disabled and
        # the inputs are passed through as-is.
        return cca(y_true, y_pred, sample_weight=sample_weight)

    return tf.function(masked_categorical_accuracy)
1042
-
1043
@staticmethod
def make_masked_categorical_crossentropy():
    """Build the categorical crossentropy loss callable.

    The returned callable forwards its arguments unchanged to the
    module-level ``cce`` function and is wrapped with ``tf.function``.

    Returns:
        callable: ``f(y_true, y_pred, sample_weight=None)`` computing
        categorical crossentropy via ``cce``.
    """

    def masked_categorical_crossentropy(y_true, y_pred, sample_weight=None):
        """Compute categorical crossentropy for one batch.

        Args:
            y_true (tensorflow.tensor): Input one-hot encoded 3D tensor.
            y_pred (tensorflow.tensor): Predicted values.
            sample_weight (numpy.ndarray): 2D matrix of sample weights.

        Returns:
            Whatever ``cce`` returns for these inputs.
        """
        # An earlier variant dropped entries whose y_true was all -1 with
        # tf.boolean_mask before calling cce; that masking is disabled and
        # the inputs are passed through as-is.
        return cce(y_true, y_pred, sample_weight=sample_weight)

    return tf.function(masked_categorical_crossentropy)
1084
-
1085
@staticmethod
def kl_divergence(z_mean, z_log_var, kl_weight=0.5):
    """Compute the KL divergence term from a mean and log-variance.

    Evaluates ``-0.5 * (1 + z_log_var - z_mean**2 - exp(z_log_var))``
    element-wise, sums over the last axis and averages the result.

    Args:
        z_mean: Tensor of latent means.
        z_log_var: Tensor of latent log-variances.
        kl_weight (float, optional): Currently unused; the result is not
            scaled by it. Defaults to 0.5.

    Returns:
        Scalar tensor: mean over the remaining axes of the per-row sum.
    """
    inner = 1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var)
    per_element = -0.5 * inner
    summed = tf.reduce_sum(per_element, axis=-1)
    return tf.reduce_mean(summed)

    # TODO: Test the alternative formulation as well:
    # z_sigma = tf.math.exp(0.5 * z_log_var)
    # return tf.reduce_sum(
    #     tf.math.square(z_mean) + tf.math.square(z_sigma) - z_log_var - 1.0,
    #     axis=-1,
    # )
1099
-
1100
def make_reconstruction_loss(self):
    """Build a reconstruction loss that ignores missing entries.

    Returns:
        callable: ``f(input_and_mask, y_pred)`` returning the result of
        ``tf.keras.metrics.mean_squared_error`` on the observed entries.
    """

    def reconstruction_loss(input_and_mask, y_pred):
        """Mean squared error with missing positions zeroed out.

        ``input_and_mask`` holds the true values in its first
        ``n_features`` columns and the missing mask in the next
        ``n_features`` columns, where ``n_features`` is read from
        ``y_pred``. Both targets and predictions are multiplied by
        ``1 - mask`` before the error is computed.

        Calling ``y_pred.numpy()`` requires eager execution.

        Args:
            input_and_mask (numpy.ndarray): Input one-hot encoded array
                with the missing mask h-stacked after it.
            y_pred (numpy.ndarray): Predicted values.

        Returns:
            Mean squared error over the masked targets and predictions.
        """
        width = y_pred.numpy().shape[1]

        # Column indices of the two halves of the stacked input.
        value_cols = range(width)
        mask_cols = range(width, width * 2)

        targets = tf.gather(input_and_mask, value_cols, axis=1)
        missing = tf.gather(input_and_mask, mask_cols, axis=1)

        # 1.0 where observed, 0.0 where missing.
        keep = tf.subtract(1.0, missing)

        return tf.keras.metrics.mean_squared_error(
            y_true=tf.multiply(targets, keep),
            y_pred=tf.multiply(y_pred, keep),
        )

    return reconstruction_loss
1141
-
1142
@staticmethod
def normalize_data(data):
    """Min-max scale ``data`` so its values span 0 to 1.

    The global minimum and maximum of ``data`` are used. If every value is
    identical the range is zero and the division is by zero.

    Args:
        data: Numeric array-like to rescale.

    Returns:
        ``(data - min) / (max - min)``.
    """
    lowest = np.min(data)
    span = np.max(data) - lowest
    return (data - lowest) / span
1146
-
1147
@staticmethod
def normalize_sum_to_1(d, target=1.0):
    """Rescale the values of ``d`` so that they sum to ``target``.

    Args:
        d (dict): Mapping with numeric values.
        target (float, optional): Desired sum of the values.
            Defaults to 1.0.

    Returns:
        dict: New mapping with the same keys and rescaled values.

    Raises:
        ZeroDivisionError: If the values of ``d`` sum to a plain numeric
            zero.
    """
    scale = target / sum(d.values())
    rescaled = {}
    for key, value in d.items():
        rescaled[key] = value * scale
    return rescaled
1151
-
1152
@staticmethod
def smooth_weights(d, mu=0.15):
    """Turn class counts into log-smoothed class weights.

    For each class the score ``log(mu * total / count)`` is computed, where
    ``total`` is the sum of all counts. Scores that are not greater than
    1.0 are replaced by 1.0.

    Args:
        d (dict): Mapping of class label to count.
        mu (float, optional): Smoothing factor applied to the total count.
            Defaults to 0.15.

    Returns:
        dict: Mapping of class label to weight (each at least 1.0).
    """
    total = np.sum(list(d.values()))
    scores = {
        label: math.log(mu * total / float(d[label])) for label in d.keys()
    }
    # max(1.0, s) keeps s only when it is strictly greater than 1.0.
    return {label: max(1.0, score) for label, score in scores.items()}
1163
-
1164
@classmethod
def get_class_weights(
    cls,
    y_true,
    original_missing_mask,
    user_weights=None,
    return_1d=False,
    method="auto",
):
    """Get class weights for a 2D target matrix.

    Three modes, checked in this order:

    1. ``user_weights`` given: every cell of ``y_true`` equal to a key is
       assigned that key's weight. ``return_1d`` is ignored.
    2. ``return_1d`` True: a dictionary of class weights over the flattened
       ``y_true`` is returned, computed according to ``method``.
    3. Otherwise: balanced class weights are computed per column from the
       non-missing rows and written into a matrix for classes 0, 1 and 2.

    Args:
        y_true (numpy.ndarray): True target values (2D).
        original_missing_mask (numpy.ndarray): Boolean mask with missing
            values set to True and non-missing to False.
        user_weights (Dict[int, float], optional): User-provided class
            weights. Defaults to None.
        return_1d (bool, optional): If True, return a dictionary mapping
            class labels to weights instead of a 2D matrix.
            Defaults to False.
        method (str, optional): Used only when ``return_1d`` is True.
            "auto" uses balanced class weights. "logsmooth" uses
            ``smooth_weights`` on the class counts, excluding the ``-9``
            label, which is given weight 0.0. Any other value leaves the
            all-zeros matrix as the result. Defaults to "auto".

    Returns:
        numpy.ndarray or Dict[int, float]: Weight matrix with the shape of
        ``y_true``, or a dictionary of class weights if ``return_1d`` is
        True and ``user_weights`` is None.
    """
    sample_weight = np.zeros(y_true.shape)

    if user_weights is not None:
        for label, weight in user_weights.items():
            sample_weight[y_true == label] = weight
        return sample_weight

    if return_1d:
        y_true_1d = y_true.flatten()

        if method == "auto":
            classes = np.unique(y_true_1d)
            sample_weight = dict(
                zip(
                    classes,
                    compute_class_weight(
                        "balanced", classes=classes, y=y_true_1d
                    ),
                )
            )

        elif method == "logsmooth":
            labels, counts = np.unique(y_true_1d, return_counts=True)
            sample_weight = dict(zip(labels, counts))
            # The -9 label may be absent; a bare pop(-9) raised KeyError.
            sample_weight.pop(-9, None)
            sample_weight = cls.smooth_weights(sample_weight)
            sample_weight[-9] = 0.0

        return sample_weight

    # Per-column balanced weights from the originally observed rows only.
    for i in range(y_true.shape[1]):
        observed = ~original_missing_mask[:, i]
        classes = np.unique(y_true[observed, i])
        weights = compute_class_weight(
            "balanced",
            classes=classes,
            y=y_true[observed, i],
        )
        column_weights = dict(zip(classes, weights))

        # Only classes 0, 1 and 2 are written into the matrix.
        for j in range(3):
            if j in column_weights:
                sample_weight[y_true[:, i] == j, i] = column_weights[j]

    return sample_weight
1237
-
1238
@staticmethod
def write_gt_state_probs(
    y_pred,
    y_pred_1d,
    y_true,
    y_true_1d,
    nn_method,
    sim_missing_mask,
    original_missing_mask,
    prefix="imputer",
):
    """Tabulate imputed versus expected genotypes for simulated sites.

    Builds one record per sample whose cells are comma-joined strings
    ``"<site index>,<probability vector>,<imputed>,<expected>"``. Only
    sites that were simulated as missing and were not originally missing
    carry real values; every other cell gets a zero probability vector and
    genotype 0 for both fields.

    Side effects: creates ``<prefix>_output/plots/Unsupervised/<nn_method>``
    if needed and calls ``NeuralNetworkMethods.plot_heatmap``, whose return
    value is discarded.

    Args:
        y_pred (numpy.ndarray): Predicted probabilities. It is iterated two
            levels deep and each inner item is stringified; presumably
            shaped (n_samples, n_features, n_classes) - TODO confirm.
        y_pred_1d (numpy.ndarray): 1D array of predicted classes.
        y_true (numpy.ndarray): 2D array of true labels; only its shape is
            used.
        y_true_1d (numpy.ndarray): 1D array of true classes.
        nn_method (str): Method name used as the output sub-directory.
        sim_missing_mask (numpy.ndarray): Mask for simulated missing values.
        original_missing_mask (numpy.ndarray): Mask for original missing
            values.
        prefix (str, optional): Prefix of the output directory.
            Defaults to "imputer".

    Returns:
        pd.DataFrame: One row per sample, one column per site.
        list: Per-row lists of "blue" (match), "red" (mismatch) or "gray"
        (site not evaluated).
    """
    grid_shape = y_true.shape

    proba_strings = np.array(
        [np.array2string(item) for row in y_pred for item in row]
    )
    proba_strings = np.reshape(proba_strings, y_pred_1d.shape)

    expected_grid = np.reshape(y_true_1d, grid_shape)
    proba_grid = np.reshape(proba_strings, grid_shape)
    imputed_grid = np.reshape(y_pred_1d, grid_shape)

    # Evaluate only sites that were masked by the simulation and had a
    # known genotype to begin with.
    evaluated = np.logical_and(sim_missing_mask, ~original_missing_mask)

    # Filler probability vector for sites that are not evaluated.
    blank_proba = np.array2string(np.array([0.0, 0.0, 0.0]))

    records = []
    colors = []

    for expected_row, proba_row, imputed_row, evaluated_row in zip(
        expected_grid, proba_grid, imputed_grid, evaluated
    ):
        record = {}
        row_colors = []

        for i, (expected, is_evaluated) in enumerate(
            zip(expected_row, evaluated_row)
        ):
            column = f"Site Index {i},Probability Vector,Imputed Genotype,Expected Genotype"

            if not is_evaluated:
                record[column] = f"{i},{blank_proba},0,0"
                row_colors.append("gray")
                continue

            record[column] = f"{i},{proba_row[i]},{imputed_row[i]},{expected}"
            if imputed_row[i] == expected:
                row_colors.append("blue")
            else:
                row_colors.append("red")

        records.append(record)
        colors.append(row_colors)

    gt_df = pd.DataFrame.from_records(records)

    outdir = os.path.join(
        f"{prefix}_output", "plots", "Unsupervised", f"{nn_method}"
    )
    Path(outdir).mkdir(parents=True, exist_ok=True)
    output_path = os.path.join(outdir, "simulated_heatmap.png")

    NeuralNetworkMethods.plot_heatmap(
        gt_df,
        colors,
        "Expected Genotypes for Simulated Missing Values",
        "Column Index",
        "Row Index",
        output_path,
    )

    return gt_df, colors
1337
-
1338
@staticmethod
def plot_heatmap(data, colors, title, xlabel, ylabel, output_path):
    """Derive the agreement mask and cell colors for the genotype heatmap.

    Despite its name, this function does not draw or save anything. It
    parses the imputed and expected genotypes out of each cell and returns
    the comparison. ``colors``, ``title``, ``xlabel``, ``ylabel`` and
    ``output_path`` are accepted for interface compatibility but are not
    used; the incoming ``colors`` is recomputed from ``data``.

    Cells are addressed by position, so ``data`` may carry any index. (The
    previous implementation used ``iterrows()`` labels as positional
    ``iloc`` indices and only worked with a default ``RangeIndex``.)

    Args:
        data (pd.DataFrame): Cells are comma-separated strings whose third
            and fourth fields are the imputed and expected genotypes as
            integers.
        colors (list): Ignored.
        title (str): Ignored.
        xlabel (str): Ignored.
        ylabel (str): Ignored.
        output_path (str): Ignored.

    Returns:
        pd.DataFrame: Boolean frame, True where imputed equals expected.
        list: Per-row lists of "gray" (both genotypes are 0), "blue"
        (match) or "red" (mismatch).
    """
    # DataFrame.applymap is deprecated in newer pandas in favour of
    # DataFrame.map; use whichever this pandas version provides.
    if hasattr(type(data), "map"):
        cellwise = data.map
    else:
        cellwise = data.applymap

    imputed = cellwise(lambda cell: int(cell.split(",")[2]))
    expected = cellwise(lambda cell: int(cell.split(",")[3]))

    mask = imputed == expected

    agree = mask.to_numpy()
    # Cells with 0 for both genotypes are the placeholder for sites that
    # were not evaluated.
    both_zero = ((imputed == 0) & (expected == 0)).to_numpy()

    colors = [
        [
            "gray" if is_blank else ("blue" if is_match else "red")
            for is_blank, is_match in zip(blank_row, match_row)
        ]
        for blank_row, match_row in zip(both_zero, agree)
    ]

    return mask, colors
1381
-
1382
- # @staticmethod
1383
- # def write_gt_state_probs(
1384
- # y_pred,
1385
- # y_pred_1d,
1386
- # y_true,
1387
- # y_true_1d,
1388
- # nn_method,
1389
- # sim_missing_mask,
1390
- # original_missing_mask,
1391
- # prefix="imputer",
1392
- # ):
1393
- # bin_mapping = np.array(
1394
- # [np.array2string(x) for row in y_pred for x in row]
1395
- # )
1396
-
1397
- # bin_mapping = np.reshape(bin_mapping, y_pred_1d.shape)
1398
-
1399
- # y_true_2d = np.reshape(y_true_1d, y_true.shape)
1400
- # bin_mapping_2d = np.reshape(bin_mapping, y_true.shape)
1401
- # y_pred_2d = np.reshape(y_pred_1d, y_true.shape)
1402
-
1403
- # include = np.logical_and(sim_missing_mask, ~original_missing_mask)
1404
-
1405
- # gt_dist = list()
1406
- # colors = []
1407
- # for yt, yp, ypd, mask in zip(
1408
- # y_true_2d,
1409
- # bin_mapping_2d,
1410
- # y_pred_2d,
1411
- # include,
1412
- # ):
1413
- # sites = dict()
1414
- # row_colors = []
1415
- # for i, (yt_site, mask_site) in enumerate(zip(yt, mask)):
1416
- # if mask_site:
1417
- # sites[
1418
- # f"Site Index {i},Probability Vector,Imputed Genotype,Expected Genotype"
1419
- # ] = f"{i},{yp[i]},{ypd[i]},{yt_site}"
1420
- # if ypd[i] == yt_site:
1421
- # row_colors.append("blue")
1422
- # else:
1423
- # sites[
1424
- # f"Site Index {i},Probability Vector,Imputed Genotype,Expected Genotype"
1425
- # ] = f"{i},{yp[i]},{ypd[i]},{yt_site}"
1426
- # row_colors.append("orange")
1427
- # else:
1428
- # sites[
1429
- # f"Site Index {i},Probability Vector,Imputed Genotype,Expected Genotype"
1430
- # ] = f"{i},{np.array2string(np.array([0.0, 0.0, 0.0]))},0,0"
1431
- # row_colors.append("gray")
1432
- # gt_dist.append(sites)
1433
- # colors.append(row_colors)
1434
-
1435
- # gt_df = pd.DataFrame.from_records(gt_dist)
1436
- # gt_df.to_csv(
1437
- # os.path.join(
1438
- # f"{prefix}_output",
1439
- # "logs",
1440
- # "Unsupervised",
1441
- # nn_method,
1442
- # "genotype_state_proba.csv",
1443
- # ),
1444
- # index=False,
1445
- # header=False,
1446
- # )
1447
-
1448
- # # Reload the data
1449
-
1450
- # data = pd.read_csv(
1451
- # os.path.join(
1452
- # f"{prefix}_output",
1453
- # "logs",
1454
- # "Unsupervised",
1455
- # nn_method,
1456
- # "genotype_state_proba.csv",
1457
- # ),
1458
- # header=None,
1459
- # )
1460
-
1461
- # # Parse the original data into separate dataframes for imputedGT and expectedGT
1462
- # imputedGT_data = data.applymap(lambda x: int(x.split(",")[2]))
1463
- # expectedGT_data = data.applymap(lambda x: int(x.split(",")[3]))
1464
-
1465
- # # Determine the binary mask based on whether imputedGT and expectedGT are the same
1466
-
1467
- # mask = imputedGT_data == expectedGT_data
1468
-
1469
- # # Create a new figure and set its size
1470
- # plt.figure(figsize=(12, 6))
1471
-
1472
- # from matplotlib.colors import ListedColormap
1473
-
1474
- # rgb_colors = sns.color_palette(
1475
- # [color for sublist in colors for color in sublist]
1476
- # )
1477
- # cmap = ListedColormap(rgb_colors)
1478
-
1479
- # # Create a heatmap
1480
- # sns.heatmap(mask, cmap=cmap, cbar=False)
1481
-
1482
- # # Set the title and labels
1483
- # plt.title("Expected Genotypes for Simulated Genotypes")
1484
- # plt.xlabel("Column Index")
1485
- # plt.ylabel("Row Index")
1486
-
1487
- # # Create a custom legend
1488
- # import matplotlib.patches as mpatches
1489
-
1490
- # green_patch = mpatches.Patch(color="blue", label="Agreement")
1491
- # orange_patch = mpatches.Patch(color="orange", label="Disagreement")
1492
- # gray_patch = mpatches.Patch(color="gray", label="Not Simulated")
1493
-
1494
- # plt.legend(
1495
- # handles=[green_patch, orange_patch, gray_patch], loc="lower right"
1496
- # )
1497
-
1498
- # outfile = os.path.join(
1499
- # f"{prefix}_output",
1500
- # "plots",
1501
- # "Unsupervised",
1502
- # nn_method,
1503
- # "gt_state_proba.png",
1504
- # )
1505
-
1506
- # plt.savefig(outfile, bbox_inches="tight", facecolor="white")
1507
-
1508
- # @staticmethod
1509
- # def write_gt_state_probs(
1510
- # y_pred,
1511
- # y_pred_1d,
1512
- # y_true,
1513
- # y_true_1d,
1514
- # nn_method,
1515
- # sim_missing_mask,
1516
- # original_missing_mask,
1517
- # prefix="imputer",
1518
- # ):
1519
- # bin_mapping = np.array(
1520
- # [np.array2string(x) for row in y_pred for x in row]
1521
- # )
1522
-
1523
- # bin_mapping = np.reshape(bin_mapping, y_pred_1d.shape)
1524
-
1525
- # y_true_2d = np.reshape(y_true_1d, y_true.shape)
1526
- # bin_mapping_2d = np.reshape(bin_mapping, y_true.shape)
1527
- # y_pred_2d = np.reshape(y_pred_1d, y_true.shape)
1528
-
1529
- # gt_dist = list()
1530
- # for yt, yp, ypd in zip(y_true_2d, bin_mapping_2d, y_pred_2d):
1531
- # sites = dict()
1532
- # for i, yt_site in enumerate(yt):
1533
- # sites[
1534
- # f"Site Index {i},Probability Vector,Imputed Genotype,Expected Genotype"
1535
- # ] = f"{i},{yp[i]},{ypd[i]},{yt_site}"
1536
- # gt_dist.append(sites)
1537
-
1538
- # gt_df = pd.DataFrame.from_records(gt_dist)
1539
- # gt_df.to_csv(
1540
- # os.path.join(
1541
- # f"{prefix}_output",
1542
- # "logs",
1543
- # "Unsupervised",
1544
- # nn_method,
1545
- # "genotype_state_proba.csv",
1546
- # ),
1547
- # index=False,
1548
- # header=False,
1549
- # )