pg-sui 1.0.2.1__py3-none-any.whl → 1.6.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pg-sui might be problematic. Click here for more details.
- {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info}/METADATA +51 -70
- pg_sui-1.6.8.dist-info/RECORD +78 -0
- {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info}/WHEEL +1 -1
- pg_sui-1.6.8.dist-info/entry_points.txt +4 -0
- pg_sui-1.6.8.dist-info/top_level.txt +1 -0
- pgsui/__init__.py +35 -54
- pgsui/_version.py +34 -0
- pgsui/cli.py +635 -0
- pgsui/data_processing/config.py +576 -0
- pgsui/data_processing/containers.py +1782 -0
- pgsui/data_processing/transformers.py +121 -1103
- pgsui/electron/app/__main__.py +5 -0
- pgsui/electron/app/icons/icons/1024x1024.png +0 -0
- pgsui/electron/app/icons/icons/128x128.png +0 -0
- pgsui/electron/app/icons/icons/16x16.png +0 -0
- pgsui/electron/app/icons/icons/24x24.png +0 -0
- pgsui/electron/app/icons/icons/256x256.png +0 -0
- pgsui/electron/app/icons/icons/32x32.png +0 -0
- pgsui/electron/app/icons/icons/48x48.png +0 -0
- pgsui/electron/app/icons/icons/512x512.png +0 -0
- pgsui/electron/app/icons/icons/64x64.png +0 -0
- pgsui/electron/app/icons/icons/icon.icns +0 -0
- pgsui/electron/app/icons/icons/icon.ico +0 -0
- pgsui/electron/app/main.js +189 -0
- pgsui/electron/app/package-lock.json +6893 -0
- pgsui/electron/app/package.json +50 -0
- pgsui/electron/app/preload.js +15 -0
- pgsui/electron/app/server.py +146 -0
- pgsui/electron/app/ui/logo.png +0 -0
- pgsui/electron/app/ui/renderer.js +130 -0
- pgsui/electron/app/ui/styles.css +59 -0
- pgsui/electron/app/ui/ui_shim.js +72 -0
- pgsui/electron/bootstrap.py +43 -0
- pgsui/electron/launch.py +59 -0
- pgsui/electron/package.json +14 -0
- pgsui/example_data/popmaps/{test.popmap → phylogen_nomx.popmap} +185 -99
- pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz +0 -0
- pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz.tbi +0 -0
- pgsui/impute/deterministic/imputers/allele_freq.py +691 -0
- pgsui/impute/deterministic/imputers/mode.py +679 -0
- pgsui/impute/deterministic/imputers/nmf.py +221 -0
- pgsui/impute/deterministic/imputers/phylo.py +971 -0
- pgsui/impute/deterministic/imputers/ref_allele.py +530 -0
- pgsui/impute/supervised/base.py +339 -0
- pgsui/impute/supervised/imputers/hist_gradient_boosting.py +293 -0
- pgsui/impute/supervised/imputers/random_forest.py +287 -0
- pgsui/impute/unsupervised/base.py +924 -0
- pgsui/impute/unsupervised/callbacks.py +89 -263
- pgsui/impute/unsupervised/imputers/autoencoder.py +972 -0
- pgsui/impute/unsupervised/imputers/nlpca.py +1264 -0
- pgsui/impute/unsupervised/imputers/ubp.py +1288 -0
- pgsui/impute/unsupervised/imputers/vae.py +957 -0
- pgsui/impute/unsupervised/loss_functions.py +158 -0
- pgsui/impute/unsupervised/models/autoencoder_model.py +208 -558
- pgsui/impute/unsupervised/models/nlpca_model.py +149 -468
- pgsui/impute/unsupervised/models/ubp_model.py +198 -1317
- pgsui/impute/unsupervised/models/vae_model.py +259 -618
- pgsui/impute/unsupervised/nn_scorers.py +215 -0
- pgsui/utils/classification_viz.py +591 -0
- pgsui/utils/misc.py +35 -480
- pgsui/utils/plotting.py +514 -824
- pgsui/utils/scorers.py +212 -438
- pg_sui-1.0.2.1.dist-info/RECORD +0 -75
- pg_sui-1.0.2.1.dist-info/top_level.txt +0 -3
- pgsui/example_data/phylip_files/test_n10.phy +0 -118
- pgsui/example_data/phylip_files/test_n100.phy +0 -118
- pgsui/example_data/phylip_files/test_n2.phy +0 -118
- pgsui/example_data/phylip_files/test_n500.phy +0 -118
- pgsui/example_data/structure_files/test.nopops.1row.10sites.str +0 -117
- pgsui/example_data/structure_files/test.nopops.2row.100sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.10sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.30sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.allsites.str +0 -234
- pgsui/example_data/structure_files/test.pops.1row.10sites.str +0 -117
- pgsui/example_data/structure_files/test.pops.2row.10sites.str +0 -234
- pgsui/example_data/trees/test.iqtree +0 -376
- pgsui/example_data/trees/test.qmat +0 -5
- pgsui/example_data/trees/test.rate +0 -2033
- pgsui/example_data/trees/test.tre +0 -1
- pgsui/example_data/trees/test_n10.rate +0 -19
- pgsui/example_data/trees/test_n100.rate +0 -109
- pgsui/example_data/trees/test_n500.rate +0 -509
- pgsui/example_data/trees/test_siterates.txt +0 -2024
- pgsui/example_data/trees/test_siterates_n10.txt +0 -10
- pgsui/example_data/trees/test_siterates_n100.txt +0 -100
- pgsui/example_data/trees/test_siterates_n500.txt +0 -500
- pgsui/example_data/vcf_files/test.vcf +0 -244
- pgsui/example_data/vcf_files/test.vcf.gz +0 -0
- pgsui/example_data/vcf_files/test.vcf.gz.tbi +0 -0
- pgsui/impute/estimators.py +0 -735
- pgsui/impute/impute.py +0 -1486
- pgsui/impute/simple_imputers.py +0 -1439
- pgsui/impute/supervised/iterative_imputer_fixedparams.py +0 -785
- pgsui/impute/supervised/iterative_imputer_gridsearch.py +0 -1027
- pgsui/impute/unsupervised/keras_classifiers.py +0 -702
- pgsui/impute/unsupervised/models/in_development/cnn_model.py +0 -486
- pgsui/impute/unsupervised/neural_network_imputers.py +0 -1424
- pgsui/impute/unsupervised/neural_network_methods.py +0 -1549
- pgsui/pg_sui.py +0 -261
- pgsui/utils/sequence_tools.py +0 -407
- simulation/sim_benchmarks.py +0 -333
- simulation/sim_treeparams.py +0 -475
- test/__init__.py +0 -0
- test/pg_sui_simtest.py +0 -215
- test/pg_sui_testing.py +0 -523
- test/test.py +0 -297
- test/test_pgsui.py +0 -374
- test/test_tkc.py +0 -214
- {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info/licenses}/LICENSE +0 -0
- /pgsui/{example_data/trees → electron/app}/__init__.py +0 -0
- /pgsui/impute/{unsupervised/models/in_development → supervised/imputers}/__init__.py +0 -0
- {simulation → pgsui/impute/unsupervised/imputers}/__init__.py +0 -0
|
@@ -1,634 +1,284 @@
|
|
|
1
|
-
import
|
|
2
|
-
import os
|
|
3
|
-
import sys
|
|
4
|
-
import warnings
|
|
1
|
+
from typing import List, Literal
|
|
5
2
|
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
3
|
+
import numpy as np
|
|
4
|
+
import torch
|
|
5
|
+
import torch.nn as nn
|
|
6
|
+
from snpio.utils.logging import LoggerManager
|
|
10
7
|
|
|
11
|
-
|
|
8
|
+
from pgsui.impute.unsupervised.loss_functions import MaskedFocalLoss
|
|
12
9
|
|
|
13
|
-
# Disable can't find cuda .dll errors. Also turns of GPU support.
|
|
14
|
-
tf.config.set_visible_devices([], "GPU")
|
|
15
10
|
|
|
16
|
-
|
|
11
|
+
class Encoder(nn.Module):
|
|
12
|
+
"""The Encoder module of a standard Autoencoder.
|
|
17
13
|
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
tf.get_logger().setLevel(logging.ERROR)
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
# Monkey patching deprecation utils to supress warnings.
|
|
24
|
-
# noinspection PyUnusedLocal
|
|
25
|
-
def deprecated(
|
|
26
|
-
date, instructions, warn_once=True
|
|
27
|
-
): # pylint: disable=unused-argument
|
|
28
|
-
def deprecated_wrapper(func):
|
|
29
|
-
return func
|
|
30
|
-
|
|
31
|
-
return deprecated_wrapper
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
deprecation.deprecated = deprecated
|
|
35
|
-
|
|
36
|
-
from tensorflow.keras.layers import (
|
|
37
|
-
Dropout,
|
|
38
|
-
Dense,
|
|
39
|
-
Reshape,
|
|
40
|
-
Flatten,
|
|
41
|
-
LeakyReLU,
|
|
42
|
-
PReLU,
|
|
43
|
-
Activation,
|
|
44
|
-
)
|
|
45
|
-
|
|
46
|
-
from tensorflow.keras.regularizers import l1_l2
|
|
47
|
-
from tensorflow.keras import backend as K
|
|
48
|
-
|
|
49
|
-
# Custom Modules
|
|
50
|
-
try:
|
|
51
|
-
from ..neural_network_methods import NeuralNetworkMethods
|
|
52
|
-
except (ModuleNotFoundError, ValueError, ImportError):
|
|
53
|
-
from impute.unsupervised.neural_network_methods import NeuralNetworkMethods
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
class Encoder(tf.keras.layers.Layer):
|
|
57
|
-
"""VAE encoder to Encode genotypes to (z_mean, z_log_var, z)."""
|
|
14
|
+
This module defines the encoder network, which takes high-dimensional input data and maps it to a deterministic, low-dimensional latent representation. The architecture consists of a series of fully-connected hidden layers that progressively compress the flattened input data into a single latent vector, `z`.
|
|
15
|
+
"""
|
|
58
16
|
|
|
59
17
|
def __init__(
|
|
60
18
|
self,
|
|
61
|
-
n_features,
|
|
62
|
-
num_classes,
|
|
63
|
-
latent_dim,
|
|
64
|
-
hidden_layer_sizes,
|
|
65
|
-
dropout_rate,
|
|
66
|
-
activation,
|
|
67
|
-
kernel_initializer,
|
|
68
|
-
kernel_regularizer,
|
|
69
|
-
beta=K.variable(0.0),
|
|
70
|
-
name="Encoder",
|
|
71
|
-
**kwargs,
|
|
19
|
+
n_features: int,
|
|
20
|
+
num_classes: int,
|
|
21
|
+
latent_dim: int,
|
|
22
|
+
hidden_layer_sizes: List[int],
|
|
23
|
+
dropout_rate: float,
|
|
24
|
+
activation: torch.nn.Module,
|
|
72
25
|
):
|
|
73
|
-
|
|
26
|
+
"""Initializes the Encoder module.
|
|
74
27
|
|
|
75
|
-
|
|
28
|
+
This class defines the encoder network, which takes high-dimensional input data and maps it to a deterministic, low-dimensional latent representation. The architecture consists of a series of fully-connected hidden layers that progressively compress the flattened input data into a single latent vector, `z`.
|
|
76
29
|
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
30
|
+
Args:
|
|
31
|
+
n_features (int): The number of features in the input data (e.g., SNPs).
|
|
32
|
+
num_classes (int): The number of possible classes for each input element (e.g., 4 alleles).
|
|
33
|
+
latent_dim (int): The dimensionality of the output latent space.
|
|
34
|
+
hidden_layer_sizes (List[int]): A list of integers specifying the size of each hidden layer.
|
|
35
|
+
dropout_rate (float): The dropout rate for regularization in the hidden layers.
|
|
36
|
+
activation (torch.nn.Module): An instantiated activation function module (e.g., `nn.ReLU()`) for the hidden layers.
|
|
37
|
+
"""
|
|
38
|
+
super(Encoder, self).__init__()
|
|
39
|
+
self.flatten = nn.Flatten()
|
|
81
40
|
|
|
82
|
-
|
|
83
|
-
|
|
41
|
+
layers = []
|
|
42
|
+
input_dim = n_features * num_classes
|
|
43
|
+
for hidden_size in hidden_layer_sizes:
|
|
44
|
+
layers.append(nn.Linear(input_dim, hidden_size))
|
|
45
|
+
layers.append(nn.BatchNorm1d(hidden_size))
|
|
46
|
+
layers.append(nn.Dropout(dropout_rate))
|
|
47
|
+
layers.append(activation)
|
|
48
|
+
input_dim = hidden_size
|
|
84
49
|
|
|
85
|
-
self.
|
|
86
|
-
|
|
87
|
-
input_shape=(n_features * num_classes,),
|
|
88
|
-
activation=activation,
|
|
89
|
-
kernel_initializer=kernel_initializer,
|
|
90
|
-
kernel_regularizer=kernel_regularizer,
|
|
91
|
-
name="Encoder1",
|
|
92
|
-
)
|
|
50
|
+
self.hidden_layers = nn.Sequential(*layers)
|
|
51
|
+
self.dense_z = nn.Linear(input_dim, latent_dim)
|
|
93
52
|
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
hidden_layer_sizes[1],
|
|
97
|
-
activation=activation,
|
|
98
|
-
kernel_initializer=kernel_initializer,
|
|
99
|
-
kernel_regularizer=kernel_regularizer,
|
|
100
|
-
name="Encoder2",
|
|
101
|
-
)
|
|
102
|
-
|
|
103
|
-
if len(hidden_layer_sizes) >= 3:
|
|
104
|
-
self.dense3 = Dense(
|
|
105
|
-
hidden_layer_sizes[2],
|
|
106
|
-
activation=activation,
|
|
107
|
-
kernel_initializer=kernel_initializer,
|
|
108
|
-
kernel_regularizer=kernel_regularizer,
|
|
109
|
-
name="Encoder3",
|
|
110
|
-
)
|
|
111
|
-
|
|
112
|
-
if len(hidden_layer_sizes) >= 4:
|
|
113
|
-
self.dense4 = Dense(
|
|
114
|
-
hidden_layer_sizes[3],
|
|
115
|
-
activation=activation,
|
|
116
|
-
kernel_initializer=kernel_initializer,
|
|
117
|
-
kernel_regularizer=kernel_regularizer,
|
|
118
|
-
name="Encoder4",
|
|
119
|
-
)
|
|
120
|
-
|
|
121
|
-
if len(hidden_layer_sizes) == 5:
|
|
122
|
-
self.dense5 = Dense(
|
|
123
|
-
hidden_layer_sizes[4],
|
|
124
|
-
activation=activation,
|
|
125
|
-
kernel_initializer=kernel_initializer,
|
|
126
|
-
kernel_regularizer=kernel_regularizer,
|
|
127
|
-
name="Encoder5",
|
|
128
|
-
)
|
|
129
|
-
|
|
130
|
-
self.dense_latent = Dense(
|
|
131
|
-
latent_dim,
|
|
132
|
-
activation=activation,
|
|
133
|
-
kernel_initializer=kernel_initializer,
|
|
134
|
-
kernel_regularizer=kernel_regularizer,
|
|
135
|
-
name="Encoder5",
|
|
136
|
-
)
|
|
53
|
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
|
54
|
+
"""Performs the forward pass through the encoder.
|
|
137
55
|
|
|
138
|
-
|
|
56
|
+
Args:
|
|
57
|
+
x (torch.Tensor): The input data tensor of shape `(batch_size, n_features, num_classes)`.
|
|
139
58
|
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
x = self.
|
|
144
|
-
x = self.
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
x = self.dropout_layer(x, training=training)
|
|
148
|
-
if self.dense3 is not None:
|
|
149
|
-
x = self.dense3(x)
|
|
150
|
-
x = self.dropout_layer(x, training=training)
|
|
151
|
-
if self.dense4 is not None:
|
|
152
|
-
x = self.dense4(x)
|
|
153
|
-
x = self.dropout_layer(x, training=training)
|
|
154
|
-
if self.dense5 is not None:
|
|
155
|
-
x = self.dense5(x)
|
|
156
|
-
x = self.dropout_layer(x, training=training)
|
|
59
|
+
Returns:
|
|
60
|
+
torch.Tensor: The latent representation `z` of shape `(batch_size, latent_dim)`.
|
|
61
|
+
"""
|
|
62
|
+
x = self.flatten(x)
|
|
63
|
+
x = self.hidden_layers(x)
|
|
64
|
+
z = self.dense_z(x)
|
|
65
|
+
return z
|
|
157
66
|
|
|
158
|
-
return self.dense_latent(x)
|
|
159
67
|
|
|
68
|
+
class Decoder(nn.Module):
|
|
69
|
+
"""The Decoder module of a standard Autoencoder.
|
|
160
70
|
|
|
161
|
-
|
|
162
|
-
"""
|
|
71
|
+
This module defines the decoder network, which takes a deterministic latent vector and maps it back to the high-dimensional data space, aiming to reconstruct the original input. The architecture typically mirrors the encoder, consisting of a series of fully-connected hidden layers that progressively expand the representation, followed by a final linear layer to produce the reconstructed data.
|
|
72
|
+
"""
|
|
163
73
|
|
|
164
74
|
def __init__(
|
|
165
75
|
self,
|
|
166
|
-
n_features,
|
|
167
|
-
num_classes,
|
|
168
|
-
latent_dim,
|
|
169
|
-
hidden_layer_sizes,
|
|
170
|
-
dropout_rate,
|
|
171
|
-
activation,
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
name="Decoder",
|
|
175
|
-
**kwargs,
|
|
176
|
-
):
|
|
177
|
-
super(Decoder, self).__init__(name=name, **kwargs)
|
|
178
|
-
|
|
179
|
-
self.dense2 = None
|
|
180
|
-
self.dense3 = None
|
|
181
|
-
self.dense4 = None
|
|
182
|
-
self.dense5 = None
|
|
183
|
-
|
|
184
|
-
self.dense1 = Dense(
|
|
185
|
-
hidden_layer_sizes[0],
|
|
186
|
-
input_shape=(latent_dim,),
|
|
187
|
-
activation=activation,
|
|
188
|
-
kernel_initializer=kernel_initializer,
|
|
189
|
-
kernel_regularizer=kernel_regularizer,
|
|
190
|
-
name="Decoder1",
|
|
191
|
-
)
|
|
192
|
-
|
|
193
|
-
if len(hidden_layer_sizes) >= 2:
|
|
194
|
-
self.dense2 = Dense(
|
|
195
|
-
hidden_layer_sizes[1],
|
|
196
|
-
activation=activation,
|
|
197
|
-
kernel_initializer=kernel_initializer,
|
|
198
|
-
kernel_regularizer=kernel_regularizer,
|
|
199
|
-
name="Decoder2",
|
|
200
|
-
)
|
|
201
|
-
|
|
202
|
-
if len(hidden_layer_sizes) >= 3:
|
|
203
|
-
self.dense3 = Dense(
|
|
204
|
-
hidden_layer_sizes[2],
|
|
205
|
-
activation=activation,
|
|
206
|
-
kernel_initializer=kernel_initializer,
|
|
207
|
-
kernel_regularizer=kernel_regularizer,
|
|
208
|
-
name="Decoder3",
|
|
209
|
-
)
|
|
210
|
-
|
|
211
|
-
if len(hidden_layer_sizes) >= 4:
|
|
212
|
-
self.dense4 = Dense(
|
|
213
|
-
hidden_layer_sizes[3],
|
|
214
|
-
activation=activation,
|
|
215
|
-
kernel_initializer=kernel_initializer,
|
|
216
|
-
kernel_regularizer=kernel_regularizer,
|
|
217
|
-
name="Decoder4",
|
|
218
|
-
)
|
|
219
|
-
|
|
220
|
-
if len(hidden_layer_sizes) == 5:
|
|
221
|
-
self.dense5 = Dense(
|
|
222
|
-
hidden_layer_sizes[4],
|
|
223
|
-
activation=activation,
|
|
224
|
-
kernel_initializer=kernel_initializer,
|
|
225
|
-
kernel_regularizer=kernel_regularizer,
|
|
226
|
-
name="Decoder5",
|
|
227
|
-
)
|
|
228
|
-
|
|
229
|
-
# No activation for final layer.
|
|
230
|
-
self.dense_output = Dense(
|
|
231
|
-
n_features * num_classes,
|
|
232
|
-
kernel_initializer=kernel_initializer,
|
|
233
|
-
kernel_regularizer=kernel_regularizer,
|
|
234
|
-
activation=None,
|
|
235
|
-
name="Decoder6",
|
|
236
|
-
)
|
|
237
|
-
|
|
238
|
-
self.rshp = Reshape((n_features, num_classes))
|
|
239
|
-
self.dropout_layer = Dropout(dropout_rate)
|
|
240
|
-
|
|
241
|
-
def call(self, inputs, training=None):
|
|
242
|
-
"""Forward pass through model."""
|
|
243
|
-
x = self.dense1(inputs)
|
|
244
|
-
x = self.dropout_layer(x, training=training)
|
|
245
|
-
if self.dense2 is not None:
|
|
246
|
-
x = self.dense2(x)
|
|
247
|
-
x = self.dropout_layer(x, training=training)
|
|
248
|
-
if self.dense3 is not None:
|
|
249
|
-
x = self.dense3(x)
|
|
250
|
-
x = self.dropout_layer(x, training=training)
|
|
251
|
-
if self.dense4 is not None:
|
|
252
|
-
x = self.dense4(x)
|
|
253
|
-
x = self.dropout_layer(x, training=training)
|
|
254
|
-
if self.dense5 is not None:
|
|
255
|
-
x = self.dense5(x)
|
|
256
|
-
x = self.dropout_layer(x, training=training)
|
|
76
|
+
n_features: int,
|
|
77
|
+
num_classes: int,
|
|
78
|
+
latent_dim: int,
|
|
79
|
+
hidden_layer_sizes: List[int],
|
|
80
|
+
dropout_rate: float,
|
|
81
|
+
activation: torch.nn.Module,
|
|
82
|
+
) -> None:
|
|
83
|
+
"""Initializes the Decoder module.
|
|
257
84
|
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
batch_size (int, optional): Batch size to use with model. Defaults to 32.
|
|
269
|
-
|
|
270
|
-
output_shape (int, optional): Number of features in output. Defaults to None.
|
|
85
|
+
Args:
|
|
86
|
+
n_features (int): The number of features in the output data (e.g., SNPs).
|
|
87
|
+
num_classes (int): The number of possible classes for each output element (e.g., 4 alleles).
|
|
88
|
+
latent_dim (int): The dimensionality of the input latent space.
|
|
89
|
+
hidden_layer_sizes (List[int]): A list of integers specifying the size of each hidden layer (typically the reverse of the encoder's).
|
|
90
|
+
dropout_rate (float): The dropout rate for regularization in the hidden layers.
|
|
91
|
+
activation (torch.nn.Module): An instantiated activation function module (e.g., `nn.ReLU()`) for the hidden layers.
|
|
92
|
+
"""
|
|
93
|
+
super(Decoder, self).__init__()
|
|
271
94
|
|
|
272
|
-
|
|
95
|
+
layers = []
|
|
96
|
+
input_dim = latent_dim
|
|
97
|
+
for hidden_size in hidden_layer_sizes:
|
|
98
|
+
layers.append(nn.Linear(input_dim, hidden_size))
|
|
99
|
+
layers.append(nn.BatchNorm1d(hidden_size))
|
|
100
|
+
layers.append(nn.Dropout(dropout_rate))
|
|
101
|
+
layers.append(activation)
|
|
102
|
+
input_dim = hidden_size
|
|
273
103
|
|
|
274
|
-
|
|
104
|
+
self.hidden_layers = nn.Sequential(*layers)
|
|
105
|
+
output_dim = n_features * num_classes
|
|
106
|
+
self.dense_output = nn.Linear(input_dim, output_dim)
|
|
107
|
+
self.reshape = (n_features, num_classes)
|
|
275
108
|
|
|
276
|
-
|
|
109
|
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
|
110
|
+
"""Performs the forward pass through the decoder.
|
|
277
111
|
|
|
278
|
-
|
|
112
|
+
Args:
|
|
113
|
+
x (torch.Tensor): The input latent tensor of shape `(batch_size, latent_dim)`.
|
|
279
114
|
|
|
280
|
-
|
|
115
|
+
Returns:
|
|
116
|
+
torch.Tensor: The reconstructed output data of shape `(batch_size, n_features, num_classes)`.
|
|
117
|
+
"""
|
|
118
|
+
x = self.hidden_layers(x)
|
|
119
|
+
x = self.dense_output(x)
|
|
120
|
+
return x.view(-1, *self.reshape)
|
|
281
121
|
|
|
282
|
-
l1_penalty (float, optional): l1_penalty to use for regularization. Defaults to 1e-6.
|
|
283
122
|
|
|
284
|
-
|
|
123
|
+
class AutoencoderModel(nn.Module):
|
|
124
|
+
"""A standard Autoencoder (AE) model for imputation.
|
|
285
125
|
|
|
286
|
-
|
|
126
|
+
This class combines an `Encoder` and a `Decoder` to form a standard autoencoder. The model is trained to learn a compressed, low-dimensional representation of the input data and then reconstruct it as accurately as possible. It is particularly useful for unsupervised dimensionality reduction and data imputation.
|
|
287
127
|
|
|
288
|
-
|
|
128
|
+
**Model Architecture and Objective:**
|
|
289
129
|
|
|
290
|
-
|
|
130
|
+
The autoencoder consists of two parts: an encoder, $f_{\theta}$, and a decoder, $g_{\phi}$.
|
|
131
|
+
1. The **encoder** maps the input data $x$ to a latent representation $z$:
|
|
132
|
+
$$
|
|
133
|
+
z = f_{\theta}(x)
|
|
134
|
+
$$
|
|
135
|
+
2. The **decoder** reconstructs the data $\hat{x}$ from the latent representation:
|
|
136
|
+
$$
|
|
137
|
+
\hat{x} = g_{\phi}(z)
|
|
138
|
+
$$
|
|
291
139
|
|
|
292
|
-
|
|
293
|
-
ValueError: Maximum number of hidden layers (5) was exceeded.
|
|
140
|
+
The model is trained by minimizing a reconstruction loss, $L(x, \hat{x})$, which measures the dissimilarity between the original input and the reconstructed output. This implementation uses a `MaskedFocalLoss` to handle missing values and class imbalance effectively.
|
|
294
141
|
"""
|
|
295
142
|
|
|
296
143
|
def __init__(
|
|
297
144
|
self,
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
missing_mask=None,
|
|
311
|
-
num_classes=3,
|
|
145
|
+
n_features: int,
|
|
146
|
+
prefix: str,
|
|
147
|
+
*,
|
|
148
|
+
num_classes: int = 4,
|
|
149
|
+
hidden_layer_sizes: List[int] | np.ndarray = [128, 64],
|
|
150
|
+
latent_dim: int = 2,
|
|
151
|
+
dropout_rate: float = 0.2,
|
|
152
|
+
activation: Literal["relu", "elu", "selu", "leaky_relu"] = "relu",
|
|
153
|
+
gamma: float = 2.0,
|
|
154
|
+
device: Literal["cpu", "gpu", "mps"] = "cpu",
|
|
155
|
+
verbose: bool = False,
|
|
156
|
+
debug: bool = False,
|
|
312
157
|
):
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
self.total_loss_tracker = tf.keras.metrics.Mean(name="total_loss")
|
|
316
|
-
self.binary_accuracy_tracker = tf.keras.metrics.Mean(
|
|
317
|
-
name="binary_accuracy"
|
|
318
|
-
)
|
|
158
|
+
"""Initializes the AutoencoderModel.
|
|
319
159
|
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
self.
|
|
334
|
-
self.hidden_layer_sizes = hidden_layer_sizes
|
|
335
|
-
self.num_hidden_layers = num_hidden_layers
|
|
336
|
-
self.hidden_activation = hidden_activation
|
|
337
|
-
self.l1_penalty = l1_penalty
|
|
338
|
-
self.l2_penalty = l2_penalty
|
|
339
|
-
self.dropout_rate = dropout_rate
|
|
340
|
-
self.sample_weight = sample_weight
|
|
160
|
+
Args:
|
|
161
|
+
n_features (int): The number of features in the input data (e.g., SNPs).
|
|
162
|
+
prefix (str): A prefix used for logging.
|
|
163
|
+
num_classes (int): The number of possible classes for each input element. Defaults to 4.
|
|
164
|
+
hidden_layer_sizes (List[int] | np.ndarray): A list of integers specifying the size of each hidden layer in the encoder. The decoder will use the reverse of this structure. Defaults to [128, 64].
|
|
165
|
+
latent_dim (int): The dimensionality of the latent space (bottleneck). Defaults to 2.
|
|
166
|
+
dropout_rate (float): The dropout rate for regularization in hidden layers. Defaults to 0.2.
|
|
167
|
+
activation (Literal["relu", "elu", "selu", "leaky_relu"]): The name of the activation function for hidden layers. Defaults to "relu".
|
|
168
|
+
gamma (float): The focusing parameter for the focal loss function. Defaults to 2.0.
|
|
169
|
+
device (Literal["cpu", "gpu", "mps"]): The device to run the model on.
|
|
170
|
+
verbose (bool): If True, enables detailed logging.
|
|
171
|
+
debug (bool): If True, enables debug mode.
|
|
172
|
+
"""
|
|
173
|
+
super(AutoencoderModel, self).__init__()
|
|
341
174
|
self.num_classes = num_classes
|
|
175
|
+
self.gamma = gamma
|
|
176
|
+
self.device = device
|
|
342
177
|
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
hidden_layer_sizes = nn.validate_hidden_layers(
|
|
346
|
-
self.hidden_layer_sizes, self.num_hidden_layers
|
|
178
|
+
logman = LoggerManager(
|
|
179
|
+
name=__name__, prefix=prefix, verbose=verbose, debug=debug
|
|
347
180
|
)
|
|
181
|
+
self.logger = logman.get_logger()
|
|
348
182
|
|
|
349
|
-
|
|
350
|
-
n_features, self.n_components, hidden_layer_sizes, vae=True
|
|
351
|
-
)
|
|
352
|
-
|
|
353
|
-
hidden_layer_sizes = [h * self.num_classes for h in hidden_layer_sizes]
|
|
354
|
-
|
|
355
|
-
if self.l1_penalty == 0.0 and self.l2_penalty == 0.0:
|
|
356
|
-
kernel_regularizer = None
|
|
357
|
-
else:
|
|
358
|
-
kernel_regularizer = l1_l2(self.l1_penalty, self.l2_penalty)
|
|
359
|
-
|
|
360
|
-
kernel_initializer = self.weights_initializer
|
|
361
|
-
|
|
362
|
-
if self.hidden_activation.lower() == "leaky_relu":
|
|
363
|
-
activation = LeakyReLU(alpha=0.01)
|
|
364
|
-
|
|
365
|
-
elif self.hidden_activation.lower() == "prelu":
|
|
366
|
-
activation = PReLU()
|
|
367
|
-
|
|
368
|
-
elif self.hidden_activation.lower() == "selu":
|
|
369
|
-
activation = "selu"
|
|
370
|
-
kernel_initializer = "lecun_normal"
|
|
371
|
-
|
|
372
|
-
else:
|
|
373
|
-
activation = self.hidden_activation
|
|
374
|
-
|
|
375
|
-
if num_hidden_layers > 5:
|
|
376
|
-
raise ValueError(
|
|
377
|
-
f"The maximum number of hidden layers is 5, but got "
|
|
378
|
-
f"{num_hidden_layers}"
|
|
379
|
-
)
|
|
183
|
+
activation_module = self._resolve_activation(activation)
|
|
380
184
|
|
|
381
185
|
self.encoder = Encoder(
|
|
382
186
|
n_features,
|
|
383
187
|
self.num_classes,
|
|
384
|
-
|
|
188
|
+
latent_dim,
|
|
385
189
|
hidden_layer_sizes,
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
kernel_initializer,
|
|
389
|
-
kernel_regularizer,
|
|
190
|
+
dropout_rate,
|
|
191
|
+
activation_module,
|
|
390
192
|
)
|
|
391
193
|
|
|
392
|
-
hidden_layer_sizes
|
|
393
|
-
|
|
194
|
+
decoder_layer_sizes = list(reversed(hidden_layer_sizes))
|
|
394
195
|
self.decoder = Decoder(
|
|
395
196
|
n_features,
|
|
396
197
|
self.num_classes,
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
kernel_initializer,
|
|
402
|
-
kernel_regularizer,
|
|
403
|
-
)
|
|
404
|
-
|
|
405
|
-
self.activation = Activation("sigmoid")
|
|
406
|
-
|
|
407
|
-
def call(self, inputs, training=None):
|
|
408
|
-
"""Forward pass through model."""
|
|
409
|
-
x = self.encoder(inputs)
|
|
410
|
-
x = self.decoder(x)
|
|
411
|
-
return self.activation(x)
|
|
412
|
-
|
|
413
|
-
def model(self):
|
|
414
|
-
"""To allow model.summary().summar() to be called."""
|
|
415
|
-
x = tf.keras.Input(shape=(self.n_features, self.num_classes))
|
|
416
|
-
return tf.keras.Model(inputs=[x], outputs=self.call(x))
|
|
417
|
-
|
|
418
|
-
def set_model_outputs(self):
|
|
419
|
-
"""Set expected model outputs."""
|
|
420
|
-
x = tf.keras.Input(shape=(self.n_features, self.num_classes))
|
|
421
|
-
model = tf.keras.Model(inputs=[x], outputs=self.call(x))
|
|
422
|
-
self.outputs = model.outputs
|
|
423
|
-
|
|
424
|
-
@property
|
|
425
|
-
def metrics(self):
|
|
426
|
-
return [
|
|
427
|
-
self.total_loss_tracker,
|
|
428
|
-
self.binary_accuracy_tracker,
|
|
429
|
-
]
|
|
430
|
-
|
|
431
|
-
@tf.function
|
|
432
|
-
def train_step(self, data):
|
|
433
|
-
y = self._y
|
|
434
|
-
|
|
435
|
-
(
|
|
436
|
-
y_true,
|
|
437
|
-
sample_weight,
|
|
438
|
-
missing_mask,
|
|
439
|
-
) = self.nn_.prepare_training_batches(
|
|
440
|
-
y,
|
|
441
|
-
y,
|
|
442
|
-
self._batch_size,
|
|
443
|
-
self._batch_idx,
|
|
444
|
-
True,
|
|
445
|
-
self.n_components,
|
|
446
|
-
self._sample_weight,
|
|
447
|
-
self._missing_mask,
|
|
448
|
-
ubp=False,
|
|
449
|
-
)
|
|
450
|
-
|
|
451
|
-
if sample_weight is not None:
|
|
452
|
-
sample_weight_masked = tf.convert_to_tensor(
|
|
453
|
-
sample_weight[~missing_mask], dtype=tf.float32
|
|
454
|
-
)
|
|
455
|
-
else:
|
|
456
|
-
sample_weight_masked = None
|
|
457
|
-
|
|
458
|
-
y_true_masked = tf.boolean_mask(
|
|
459
|
-
tf.convert_to_tensor(y_true, dtype=tf.float32),
|
|
460
|
-
tf.reduce_any(tf.not_equal(y_true, -1), axis=2),
|
|
461
|
-
)
|
|
462
|
-
|
|
463
|
-
with tf.GradientTape() as tape:
|
|
464
|
-
reconstruction = self(y_true, training=True)
|
|
465
|
-
|
|
466
|
-
y_pred_masked = tf.boolean_mask(
|
|
467
|
-
reconstruction, tf.reduce_any(tf.not_equal(y_true, -1), axis=2)
|
|
468
|
-
)
|
|
469
|
-
|
|
470
|
-
# Returns binary crossentropy loss.
|
|
471
|
-
reconstruction_loss = self.compiled_loss(
|
|
472
|
-
y_true_masked,
|
|
473
|
-
y_pred_masked,
|
|
474
|
-
sample_weight=sample_weight_masked,
|
|
475
|
-
)
|
|
476
|
-
|
|
477
|
-
regularization_loss = sum(self.losses)
|
|
478
|
-
|
|
479
|
-
total_loss = reconstruction_loss + regularization_loss
|
|
480
|
-
|
|
481
|
-
grads = tape.gradient(total_loss, self.trainable_weights)
|
|
482
|
-
self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
|
|
483
|
-
|
|
484
|
-
### NOTE: If you get the error, "'tuple' object has no attribute
|
|
485
|
-
### 'rank', then convert y_true to a tensor object."
|
|
486
|
-
self.total_loss_tracker.update_state(total_loss)
|
|
487
|
-
self.binary_accuracy_tracker.update_state(
|
|
488
|
-
tf.keras.metrics.binary_accuracy(y_true_masked, y_pred_masked)
|
|
198
|
+
latent_dim,
|
|
199
|
+
decoder_layer_sizes,
|
|
200
|
+
dropout_rate,
|
|
201
|
+
activation_module,
|
|
489
202
|
)
|
|
490
203
|
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
"binary_accuracy": self.binary_accuracy_tracker.result(),
|
|
494
|
-
}
|
|
495
|
-
|
|
496
|
-
@tf.function
|
|
497
|
-
def test_step(self, data):
|
|
498
|
-
"""Custom evaluation loop for one step (=batch) in a single epoch.
|
|
499
|
-
|
|
500
|
-
This function will evaluate on a batch of samples (rows), which can be adjusted with the ``batch_size`` parameter from the estimator.
|
|
204
|
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
|
205
|
+
"""Performs the forward pass through the full Autoencoder model.
|
|
501
206
|
|
|
502
207
|
Args:
|
|
503
|
-
|
|
208
|
+
x (torch.Tensor): The input data tensor of shape `(batch_size, n_features, num_classes)`.
|
|
504
209
|
|
|
505
210
|
Returns:
|
|
506
|
-
|
|
211
|
+
torch.Tensor: The reconstructed data tensor.
|
|
507
212
|
"""
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
y_true,
|
|
512
|
-
sample_weight,
|
|
513
|
-
missing_mask,
|
|
514
|
-
) = self.nn_.prepare_training_batches(
|
|
515
|
-
y,
|
|
516
|
-
y,
|
|
517
|
-
self._batch_size,
|
|
518
|
-
self._batch_idx,
|
|
519
|
-
True,
|
|
520
|
-
self.n_components,
|
|
521
|
-
self._sample_weight,
|
|
522
|
-
self._missing_mask,
|
|
523
|
-
ubp=False,
|
|
524
|
-
)
|
|
525
|
-
|
|
526
|
-
if sample_weight is not None:
|
|
527
|
-
sample_weight_masked = tf.convert_to_tensor(
|
|
528
|
-
sample_weight[~missing_mask], dtype=tf.float32
|
|
529
|
-
)
|
|
530
|
-
else:
|
|
531
|
-
sample_weight_masked = None
|
|
213
|
+
z = self.encoder(x)
|
|
214
|
+
reconstruction = self.decoder(z)
|
|
215
|
+
return reconstruction
|
|
532
216
|
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
reconstruction, tf.reduce_any(tf.not_equal(y_true, -1), axis=2)
|
|
542
|
-
)
|
|
543
|
-
|
|
544
|
-
reconstruction_loss = self.compiled_loss(
|
|
545
|
-
y_true_masked,
|
|
546
|
-
y_pred_masked,
|
|
547
|
-
sample_weight=sample_weight_masked,
|
|
548
|
-
)
|
|
549
|
-
|
|
550
|
-
regularization_loss = sum(self.losses)
|
|
551
|
-
|
|
552
|
-
total_loss = reconstruction_loss + regularization_loss
|
|
217
|
+
def compute_loss(
|
|
218
|
+
self,
|
|
219
|
+
reconstruction: torch.Tensor,
|
|
220
|
+
y: torch.Tensor,
|
|
221
|
+
mask: torch.Tensor | None = None,
|
|
222
|
+
class_weights: torch.Tensor | None = None,
|
|
223
|
+
) -> torch.Tensor:
|
|
224
|
+
"""Computes the reconstruction loss for the Autoencoder model.
|
|
553
225
|
|
|
554
|
-
|
|
555
|
-
### 'rank', then convert y_true to a tensor object."
|
|
556
|
-
self.total_loss_tracker.update_state(total_loss)
|
|
557
|
-
self.binary_accuracy_tracker.update_state(
|
|
558
|
-
tf.keras.metrics.binary_accuracy(y_true_masked, y_pred_masked)
|
|
559
|
-
)
|
|
226
|
+
This method calculates the reconstruction loss using a masked focal loss, which is suitable for categorical data with missing values and class imbalance.
|
|
560
227
|
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
228
|
+
Args:
|
|
229
|
+
reconstruction (torch.Tensor): The reconstructed output (logits) from the model's forward pass.
|
|
230
|
+
y (torch.Tensor): The target data tensor, expected to be one-hot encoded. It is converted to class indices internally for the loss calculation.
|
|
231
|
+
mask (torch.Tensor | None): A boolean mask to exclude missing values from the loss calculation.
|
|
232
|
+
class_weights (torch.Tensor | None): Weights to apply to each class in the loss to handle imbalance.
|
|
565
233
|
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
"""Batch (=step) size per epoch.
|
|
569
|
-
:noindex:
|
|
234
|
+
Returns:
|
|
235
|
+
torch.Tensor: The computed scalar loss value.
|
|
570
236
|
"""
|
|
571
|
-
|
|
237
|
+
if class_weights is None:
|
|
238
|
+
class_weights = torch.ones(self.num_classes, device=y.device)
|
|
572
239
|
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
"""Current batch (=step) index.
|
|
576
|
-
:noindex:
|
|
577
|
-
"""
|
|
578
|
-
return self._batch_idx
|
|
240
|
+
logits_flat = reconstruction.view(-1, self.num_classes)
|
|
241
|
+
targets_flat = torch.argmax(y, dim=-1).view(-1)
|
|
579
242
|
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
"""
|
|
585
|
-
return self._y
|
|
243
|
+
if mask is None:
|
|
244
|
+
mask_flat = torch.ones_like(targets_flat, dtype=torch.bool)
|
|
245
|
+
else:
|
|
246
|
+
mask_flat = mask.view(-1)
|
|
586
247
|
|
|
587
|
-
|
|
588
|
-
def missing_mask(self):
|
|
589
|
-
"""Missing mask of shape (y.shape[0], y.shape[1])
|
|
590
|
-
:noindex:
|
|
591
|
-
"""
|
|
592
|
-
return self._missing_mask
|
|
248
|
+
criterion = MaskedFocalLoss(alpha=class_weights, gamma=self.gamma)
|
|
593
249
|
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
return self._sample_weight
|
|
250
|
+
reconstruction_loss = criterion(
|
|
251
|
+
logits_flat.to(self.device),
|
|
252
|
+
targets_flat.to(self.device),
|
|
253
|
+
valid_mask=mask_flat.to(self.device),
|
|
254
|
+
)
|
|
600
255
|
|
|
601
|
-
|
|
602
|
-
def batch_size(self, value):
|
|
603
|
-
"""Set batch_size parameter.
|
|
604
|
-
:noindex:
|
|
605
|
-
"""
|
|
606
|
-
self._batch_size = int(value)
|
|
256
|
+
return reconstruction_loss
|
|
607
257
|
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
"""
|
|
613
|
-
self._batch_idx = int(value)
|
|
258
|
+
def _resolve_activation(
|
|
259
|
+
self, activation: Literal["relu", "elu", "leaky_relu", "selu"]
|
|
260
|
+
) -> torch.nn.Module:
|
|
261
|
+
"""Resolves an activation function module from a string name.
|
|
614
262
|
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
"""Set y after each epoch.
|
|
618
|
-
:noindex:
|
|
619
|
-
"""
|
|
620
|
-
self._y = value
|
|
263
|
+
Args:
|
|
264
|
+
activation (Literal["relu", "elu", "leaky_relu", "selu"]): The name of the activation function.
|
|
621
265
|
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
"""Set missing_mask after each epoch.
|
|
625
|
-
:noindex:
|
|
626
|
-
"""
|
|
627
|
-
self._missing_mask = value
|
|
266
|
+
Returns:
|
|
267
|
+
torch.nn.Module: The corresponding instantiated PyTorch activation function module.
|
|
628
268
|
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
"""Set sample_weight after each epoch.
|
|
632
|
-
:noindex:
|
|
269
|
+
Raises:
|
|
270
|
+
ValueError: If the provided activation name is not supported.
|
|
633
271
|
"""
|
|
634
|
-
|
|
272
|
+
activation = activation.lower()
|
|
273
|
+
if activation == "relu":
|
|
274
|
+
return nn.ReLU()
|
|
275
|
+
elif activation == "elu":
|
|
276
|
+
return nn.ELU()
|
|
277
|
+
elif activation in ("leaky_relu", "leakyrelu"):
|
|
278
|
+
return nn.LeakyReLU()
|
|
279
|
+
elif activation == "selu":
|
|
280
|
+
return nn.SELU()
|
|
281
|
+
else:
|
|
282
|
+
msg = f"Activation {activation} not supported."
|
|
283
|
+
self.logger.error(msg)
|
|
284
|
+
raise ValueError(msg)
|