pg-sui 1.0.2.1__py3-none-any.whl → 1.6.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pg-sui might be problematic. Click here for more details.
- {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info}/METADATA +51 -70
- pg_sui-1.6.8.dist-info/RECORD +78 -0
- {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info}/WHEEL +1 -1
- pg_sui-1.6.8.dist-info/entry_points.txt +4 -0
- pg_sui-1.6.8.dist-info/top_level.txt +1 -0
- pgsui/__init__.py +35 -54
- pgsui/_version.py +34 -0
- pgsui/cli.py +635 -0
- pgsui/data_processing/config.py +576 -0
- pgsui/data_processing/containers.py +1782 -0
- pgsui/data_processing/transformers.py +121 -1103
- pgsui/electron/app/__main__.py +5 -0
- pgsui/electron/app/icons/icons/1024x1024.png +0 -0
- pgsui/electron/app/icons/icons/128x128.png +0 -0
- pgsui/electron/app/icons/icons/16x16.png +0 -0
- pgsui/electron/app/icons/icons/24x24.png +0 -0
- pgsui/electron/app/icons/icons/256x256.png +0 -0
- pgsui/electron/app/icons/icons/32x32.png +0 -0
- pgsui/electron/app/icons/icons/48x48.png +0 -0
- pgsui/electron/app/icons/icons/512x512.png +0 -0
- pgsui/electron/app/icons/icons/64x64.png +0 -0
- pgsui/electron/app/icons/icons/icon.icns +0 -0
- pgsui/electron/app/icons/icons/icon.ico +0 -0
- pgsui/electron/app/main.js +189 -0
- pgsui/electron/app/package-lock.json +6893 -0
- pgsui/electron/app/package.json +50 -0
- pgsui/electron/app/preload.js +15 -0
- pgsui/electron/app/server.py +146 -0
- pgsui/electron/app/ui/logo.png +0 -0
- pgsui/electron/app/ui/renderer.js +130 -0
- pgsui/electron/app/ui/styles.css +59 -0
- pgsui/electron/app/ui/ui_shim.js +72 -0
- pgsui/electron/bootstrap.py +43 -0
- pgsui/electron/launch.py +59 -0
- pgsui/electron/package.json +14 -0
- pgsui/example_data/popmaps/{test.popmap → phylogen_nomx.popmap} +185 -99
- pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz +0 -0
- pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz.tbi +0 -0
- pgsui/impute/deterministic/imputers/allele_freq.py +691 -0
- pgsui/impute/deterministic/imputers/mode.py +679 -0
- pgsui/impute/deterministic/imputers/nmf.py +221 -0
- pgsui/impute/deterministic/imputers/phylo.py +971 -0
- pgsui/impute/deterministic/imputers/ref_allele.py +530 -0
- pgsui/impute/supervised/base.py +339 -0
- pgsui/impute/supervised/imputers/hist_gradient_boosting.py +293 -0
- pgsui/impute/supervised/imputers/random_forest.py +287 -0
- pgsui/impute/unsupervised/base.py +924 -0
- pgsui/impute/unsupervised/callbacks.py +89 -263
- pgsui/impute/unsupervised/imputers/autoencoder.py +972 -0
- pgsui/impute/unsupervised/imputers/nlpca.py +1264 -0
- pgsui/impute/unsupervised/imputers/ubp.py +1288 -0
- pgsui/impute/unsupervised/imputers/vae.py +957 -0
- pgsui/impute/unsupervised/loss_functions.py +158 -0
- pgsui/impute/unsupervised/models/autoencoder_model.py +208 -558
- pgsui/impute/unsupervised/models/nlpca_model.py +149 -468
- pgsui/impute/unsupervised/models/ubp_model.py +198 -1317
- pgsui/impute/unsupervised/models/vae_model.py +259 -618
- pgsui/impute/unsupervised/nn_scorers.py +215 -0
- pgsui/utils/classification_viz.py +591 -0
- pgsui/utils/misc.py +35 -480
- pgsui/utils/plotting.py +514 -824
- pgsui/utils/scorers.py +212 -438
- pg_sui-1.0.2.1.dist-info/RECORD +0 -75
- pg_sui-1.0.2.1.dist-info/top_level.txt +0 -3
- pgsui/example_data/phylip_files/test_n10.phy +0 -118
- pgsui/example_data/phylip_files/test_n100.phy +0 -118
- pgsui/example_data/phylip_files/test_n2.phy +0 -118
- pgsui/example_data/phylip_files/test_n500.phy +0 -118
- pgsui/example_data/structure_files/test.nopops.1row.10sites.str +0 -117
- pgsui/example_data/structure_files/test.nopops.2row.100sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.10sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.30sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.allsites.str +0 -234
- pgsui/example_data/structure_files/test.pops.1row.10sites.str +0 -117
- pgsui/example_data/structure_files/test.pops.2row.10sites.str +0 -234
- pgsui/example_data/trees/test.iqtree +0 -376
- pgsui/example_data/trees/test.qmat +0 -5
- pgsui/example_data/trees/test.rate +0 -2033
- pgsui/example_data/trees/test.tre +0 -1
- pgsui/example_data/trees/test_n10.rate +0 -19
- pgsui/example_data/trees/test_n100.rate +0 -109
- pgsui/example_data/trees/test_n500.rate +0 -509
- pgsui/example_data/trees/test_siterates.txt +0 -2024
- pgsui/example_data/trees/test_siterates_n10.txt +0 -10
- pgsui/example_data/trees/test_siterates_n100.txt +0 -100
- pgsui/example_data/trees/test_siterates_n500.txt +0 -500
- pgsui/example_data/vcf_files/test.vcf +0 -244
- pgsui/example_data/vcf_files/test.vcf.gz +0 -0
- pgsui/example_data/vcf_files/test.vcf.gz.tbi +0 -0
- pgsui/impute/estimators.py +0 -735
- pgsui/impute/impute.py +0 -1486
- pgsui/impute/simple_imputers.py +0 -1439
- pgsui/impute/supervised/iterative_imputer_fixedparams.py +0 -785
- pgsui/impute/supervised/iterative_imputer_gridsearch.py +0 -1027
- pgsui/impute/unsupervised/keras_classifiers.py +0 -702
- pgsui/impute/unsupervised/models/in_development/cnn_model.py +0 -486
- pgsui/impute/unsupervised/neural_network_imputers.py +0 -1424
- pgsui/impute/unsupervised/neural_network_methods.py +0 -1549
- pgsui/pg_sui.py +0 -261
- pgsui/utils/sequence_tools.py +0 -407
- simulation/sim_benchmarks.py +0 -333
- simulation/sim_treeparams.py +0 -475
- test/__init__.py +0 -0
- test/pg_sui_simtest.py +0 -215
- test/pg_sui_testing.py +0 -523
- test/test.py +0 -297
- test/test_pgsui.py +0 -374
- test/test_tkc.py +0 -214
- {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info/licenses}/LICENSE +0 -0
- /pgsui/{example_data/trees → electron/app}/__init__.py +0 -0
- /pgsui/impute/{unsupervised/models/in_development → supervised/imputers}/__init__.py +0 -0
- {simulation → pgsui/impute/unsupervised/imputers}/__init__.py +0 -0
test/test.py
DELETED
|
@@ -1,297 +0,0 @@
|
|
|
1
|
-
import unittest
|
|
2
|
-
import pprint
|
|
3
|
-
from snpio import GenotypeData
|
|
4
|
-
from pgsui import *
|
|
5
|
-
from pgsui.utils.misc import HiddenPrints
|
|
6
|
-
import matplotlib.pyplot as plt
|
|
7
|
-
import numpy as np
|
|
8
|
-
|
|
9
|
-
from sklearn.metrics import (
|
|
10
|
-
roc_auc_score,
|
|
11
|
-
precision_recall_fscore_support,
|
|
12
|
-
f1_score,
|
|
13
|
-
average_precision_score,
|
|
14
|
-
accuracy_score,
|
|
15
|
-
)
|
|
16
|
-
|
|
17
|
-
from sklearn.preprocessing import label_binarize
|
|
18
|
-
|
|
19
|
-
from sklearn.utils.class_weight import compute_class_weight
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
# Initialize dictionaries to store metrics for all methods
|
|
23
|
-
all_accuracies = {}
|
|
24
|
-
all_auc_rocs = {}
|
|
25
|
-
all_precisions = {}
|
|
26
|
-
all_recalls = {}
|
|
27
|
-
all_avg_precisions = {}
|
|
28
|
-
all_f1s = {}
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
def plot_scoring_metrics():
|
|
32
|
-
"""
|
|
33
|
-
Plot the accumulated scoring metrics for all test methods in separate subplots.
|
|
34
|
-
|
|
35
|
-
Args:
|
|
36
|
-
None
|
|
37
|
-
|
|
38
|
-
Returns:
|
|
39
|
-
None: The function generates a grouped bar chart displaying the scoring metrics.
|
|
40
|
-
"""
|
|
41
|
-
|
|
42
|
-
metrics = [
|
|
43
|
-
"Accuracy",
|
|
44
|
-
"AUC-ROC",
|
|
45
|
-
"Precision",
|
|
46
|
-
"Recall",
|
|
47
|
-
"Average Precision",
|
|
48
|
-
"F1 Score",
|
|
49
|
-
]
|
|
50
|
-
metric_dicts = [
|
|
51
|
-
all_accuracies,
|
|
52
|
-
all_auc_rocs,
|
|
53
|
-
all_precisions,
|
|
54
|
-
all_recalls,
|
|
55
|
-
all_avg_precisions,
|
|
56
|
-
all_f1s,
|
|
57
|
-
]
|
|
58
|
-
|
|
59
|
-
num_metrics = len(metrics)
|
|
60
|
-
fig, axes = plt.subplots(2, num_metrics // 2, figsize=(20, 20))
|
|
61
|
-
|
|
62
|
-
# Loop through each metric and its corresponding dictionary
|
|
63
|
-
colcount = 0
|
|
64
|
-
rowcount = 0
|
|
65
|
-
|
|
66
|
-
for i, (metric, metric_dict) in enumerate(zip(metrics, metric_dicts)):
|
|
67
|
-
if i > 0 and i % num_metrics // 2 == 0:
|
|
68
|
-
rowcount += 1
|
|
69
|
-
colcount = 0
|
|
70
|
-
|
|
71
|
-
methods = list(metric_dict.keys())
|
|
72
|
-
values = list(metric_dict.values())
|
|
73
|
-
|
|
74
|
-
# Find the index of the highest bar
|
|
75
|
-
highest_bar_idx = np.argmax(values)
|
|
76
|
-
|
|
77
|
-
# Create the bar plot on the i-th subplot
|
|
78
|
-
bars = axes[rowcount, colcount].bar(methods, values, color="gray")
|
|
79
|
-
|
|
80
|
-
# Color the highest bar in orange
|
|
81
|
-
bars[highest_bar_idx].set_color("orange")
|
|
82
|
-
|
|
83
|
-
# Rotate x-axis labels
|
|
84
|
-
axes[rowcount, colcount].tick_params(axis="x", rotation=90)
|
|
85
|
-
|
|
86
|
-
# Annotate the bars with the actual values
|
|
87
|
-
for j, v in enumerate(values):
|
|
88
|
-
axes[rowcount, colcount].text(
|
|
89
|
-
j, v, f"{v:.2f}", ha="center", va="bottom"
|
|
90
|
-
)
|
|
91
|
-
|
|
92
|
-
axes[rowcount, colcount].set_title(metric)
|
|
93
|
-
axes[rowcount, colcount].set_ylabel("Score")
|
|
94
|
-
colcount += 1
|
|
95
|
-
|
|
96
|
-
plt.suptitle("Scoring Metrics for All Methods")
|
|
97
|
-
|
|
98
|
-
fig.savefig("scores.png", facecolor="white", bbox_inches="tight")
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
class TestMyClasses(unittest.TestCase):
|
|
102
|
-
def setUp(self):
|
|
103
|
-
with HiddenPrints():
|
|
104
|
-
self.genotype_data = GenotypeData(
|
|
105
|
-
filename="pgsui/example_data/phylip_files/test_n100.phy",
|
|
106
|
-
popmapfile="pgsui/example_data/popmaps/test.popmap",
|
|
107
|
-
guidetree="pgsui/example_data/trees/test.tre",
|
|
108
|
-
qmatrix="pgsui/example_data/trees/test.qmat",
|
|
109
|
-
siterates="pgsui/example_data/trees/test_siterates_n100.txt",
|
|
110
|
-
prefix="test_imputer",
|
|
111
|
-
force_popmap=True,
|
|
112
|
-
plot_format="png",
|
|
113
|
-
)
|
|
114
|
-
|
|
115
|
-
# Create a SimGenotypeDataTransformer instance and use it
|
|
116
|
-
# to simulate missing data
|
|
117
|
-
self.transformer = SimGenotypeDataTransformer(
|
|
118
|
-
genotype_data=self.genotype_data,
|
|
119
|
-
prop_missing=0.2,
|
|
120
|
-
strategy="random",
|
|
121
|
-
)
|
|
122
|
-
self.transformer.fit(self.genotype_data.genotypes_012(fmt="numpy"))
|
|
123
|
-
self.simulated_data = self.genotype_data.copy()
|
|
124
|
-
|
|
125
|
-
self.simulated_data.genotypes_012 = self.transformer.transform(
|
|
126
|
-
self.genotype_data.genotypes_012(fmt="numpy")
|
|
127
|
-
)
|
|
128
|
-
|
|
129
|
-
def _test_class(self, class_instance, do_gridsearch=False):
|
|
130
|
-
print(f"\nMETHOD: {class_instance.__name__}\n")
|
|
131
|
-
|
|
132
|
-
if do_gridsearch:
|
|
133
|
-
# Do a simple test.
|
|
134
|
-
if class_instance in [ImputeRandomForest, ImputeXGBoost]:
|
|
135
|
-
param_grid = {"n_estimators": [50, 100]} # Do a simple test
|
|
136
|
-
elif class_instance in [
|
|
137
|
-
ImputeVAE,
|
|
138
|
-
ImputeStandardAutoEncoder,
|
|
139
|
-
ImputeNLPCA,
|
|
140
|
-
ImputeUBP,
|
|
141
|
-
]:
|
|
142
|
-
param_grid = {"dropout_rate": [0.1, 0.2]}
|
|
143
|
-
elif class_instance == ImputeKNN:
|
|
144
|
-
param_grid = {"n_neighbors": [5, 8]}
|
|
145
|
-
else:
|
|
146
|
-
param_grid = None
|
|
147
|
-
|
|
148
|
-
instance = class_instance(
|
|
149
|
-
self.simulated_data,
|
|
150
|
-
gridparams=param_grid,
|
|
151
|
-
sample_weights=None,
|
|
152
|
-
)
|
|
153
|
-
imputed_data = instance.imputed.genotypes_int
|
|
154
|
-
|
|
155
|
-
# Test that the imputed values are close to the original values
|
|
156
|
-
# accuracy = self.transformer.accuracy(
|
|
157
|
-
# self.genotype_data.genotypes_012(fmt="numpy"), imputed_data
|
|
158
|
-
# )
|
|
159
|
-
|
|
160
|
-
(
|
|
161
|
-
accuracy,
|
|
162
|
-
auc_roc_scores,
|
|
163
|
-
precision_scores,
|
|
164
|
-
recall_scores,
|
|
165
|
-
avg_precision_scores,
|
|
166
|
-
f1,
|
|
167
|
-
) = self._scoring_metrics(
|
|
168
|
-
self.genotype_data.genotypes_int, imputed_data
|
|
169
|
-
)
|
|
170
|
-
|
|
171
|
-
pprint.PrettyPrinter(indent=4, sort_dicts=True).pprint(
|
|
172
|
-
f"ACCURACY: {accuracy}"
|
|
173
|
-
)
|
|
174
|
-
pprint.PrettyPrinter(indent=4, sort_dicts=True).pprint(
|
|
175
|
-
f"AUC-ROC: {auc_roc_scores}"
|
|
176
|
-
)
|
|
177
|
-
pprint.PrettyPrinter(indent=4, sort_dicts=True).pprint(
|
|
178
|
-
f"PRECISION: {precision_scores}"
|
|
179
|
-
)
|
|
180
|
-
pprint.PrettyPrinter(indent=4, sort_dicts=True).pprint(
|
|
181
|
-
f"RECALL: {recall_scores}"
|
|
182
|
-
)
|
|
183
|
-
pprint.PrettyPrinter(indent=4, sort_dicts=True).pprint(
|
|
184
|
-
f"AVERAGE PRECISION: {avg_precision_scores}"
|
|
185
|
-
)
|
|
186
|
-
pprint.PrettyPrinter(indent=4, sort_dicts=True).pprint(
|
|
187
|
-
f"F1 SCORE: {f1}"
|
|
188
|
-
)
|
|
189
|
-
print("\n")
|
|
190
|
-
|
|
191
|
-
# Store metrics
|
|
192
|
-
all_accuracies[class_instance.__name__] = accuracy
|
|
193
|
-
all_auc_rocs[class_instance.__name__] = auc_roc_scores
|
|
194
|
-
all_precisions[class_instance.__name__] = precision_scores
|
|
195
|
-
all_recalls[class_instance.__name__] = recall_scores
|
|
196
|
-
all_avg_precisions[class_instance.__name__] = avg_precision_scores
|
|
197
|
-
all_f1s[class_instance.__name__] = f1
|
|
198
|
-
plot_scoring_metrics()
|
|
199
|
-
|
|
200
|
-
def test_ImputeKNN(self):
|
|
201
|
-
self._test_class(ImputeKNN)
|
|
202
|
-
|
|
203
|
-
def test_ImputeRandomForest(self):
|
|
204
|
-
self._test_class(ImputeRandomForest)
|
|
205
|
-
|
|
206
|
-
def test_ImputeXGBoost(self):
|
|
207
|
-
self._test_class(ImputeXGBoost)
|
|
208
|
-
|
|
209
|
-
def test_ImputeVAE(self):
|
|
210
|
-
self._test_class(ImputeVAE)
|
|
211
|
-
|
|
212
|
-
def test_ImputeStandardAutoEncoder(self):
|
|
213
|
-
self._test_class(ImputeStandardAutoEncoder)
|
|
214
|
-
|
|
215
|
-
def test_ImputeUBP(self):
|
|
216
|
-
self._test_class(ImputeUBP)
|
|
217
|
-
|
|
218
|
-
def test_ImputeNLPCA(self):
|
|
219
|
-
self._test_class(ImputeNLPCA)
|
|
220
|
-
|
|
221
|
-
def test_ImputeKNN_grid(self):
|
|
222
|
-
self._test_class(ImputeKNN, do_gridsearch=True)
|
|
223
|
-
|
|
224
|
-
def test_ImputeRandomForest_grid(self):
|
|
225
|
-
self._test_class(ImputeRandomForest, do_gridsearch=True)
|
|
226
|
-
|
|
227
|
-
def test_ImputeXGBoost_grid(self):
|
|
228
|
-
self._test_class(ImputeXGBoost, do_gridsearch=True)
|
|
229
|
-
|
|
230
|
-
def test_ImputeVAE_grid(self):
|
|
231
|
-
self._test_class(ImputeVAE, do_gridsearch=True)
|
|
232
|
-
|
|
233
|
-
def test_ImputeStandardAutoEncoder_grid(self):
|
|
234
|
-
self._test_class(ImputeStandardAutoEncoder, do_gridsearch=True)
|
|
235
|
-
|
|
236
|
-
def test_ImputeUBP_grid(self):
|
|
237
|
-
self._test_class(ImputeUBP, do_gridsearch=True)
|
|
238
|
-
|
|
239
|
-
def test_ImputeNLPCA_grid(self):
|
|
240
|
-
self._test_class(ImputeNLPCA, do_gridsearch=True)
|
|
241
|
-
|
|
242
|
-
def test_ImputePhylo(self):
|
|
243
|
-
self._test_class(ImputePhylo)
|
|
244
|
-
|
|
245
|
-
def test_ImputeAlleleFreq(self):
|
|
246
|
-
self._test_class(ImputeAlleleFreq)
|
|
247
|
-
|
|
248
|
-
def test_ImputeMF(self):
|
|
249
|
-
self._test_class(ImputeMF)
|
|
250
|
-
|
|
251
|
-
def test_ImputeRefAllele(self):
|
|
252
|
-
self._test_class(ImputeRefAllele)
|
|
253
|
-
|
|
254
|
-
def _scoring_metrics(self, y_true, y_pred):
|
|
255
|
-
"""Calcuate AUC-ROC, Precision-Recall, and Average Precision (AP).
|
|
256
|
-
|
|
257
|
-
Args:
|
|
258
|
-
X_true (np.ndarray): True values.
|
|
259
|
-
|
|
260
|
-
X_pred (np.ndarray): Imputed values.
|
|
261
|
-
|
|
262
|
-
Returns:
|
|
263
|
-
List[float]: List of AUC-ROC scores in order of: 0,1,2.
|
|
264
|
-
List[float]: List of precision scores in order of: 0,1,2.
|
|
265
|
-
List[float]: List of recall scores in order of: 0,1,2.
|
|
266
|
-
List[float]: List of average precision scores in order of 0,1,2.
|
|
267
|
-
|
|
268
|
-
"""
|
|
269
|
-
y_true = y_true[self.transformer.sim_missing_mask_]
|
|
270
|
-
y_pred = y_pred[self.transformer.sim_missing_mask_]
|
|
271
|
-
|
|
272
|
-
# Binarize the output
|
|
273
|
-
y_true_bin = label_binarize(y_true, classes=[0, 1, 2])
|
|
274
|
-
y_pred_bin = label_binarize(y_pred, classes=[0, 1, 2])
|
|
275
|
-
|
|
276
|
-
accuracy = accuracy_score(y_true, y_pred)
|
|
277
|
-
|
|
278
|
-
# AUC-ROC score
|
|
279
|
-
auc_roc = roc_auc_score(y_true_bin, y_pred_bin, average="weighted")
|
|
280
|
-
|
|
281
|
-
# Precision-recall score
|
|
282
|
-
precision, recall, _, _ = precision_recall_fscore_support(
|
|
283
|
-
y_true_bin, y_pred_bin, average="weighted"
|
|
284
|
-
)
|
|
285
|
-
|
|
286
|
-
# Average precision score
|
|
287
|
-
avg_precision = average_precision_score(
|
|
288
|
-
y_true_bin, y_pred_bin, average="weighted"
|
|
289
|
-
)
|
|
290
|
-
|
|
291
|
-
f1 = f1_score(y_true_bin, y_pred_bin, average="weighted")
|
|
292
|
-
|
|
293
|
-
return (accuracy, auc_roc, precision, recall, avg_precision, f1)
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
if __name__ == "__main__":
|
|
297
|
-
unittest.main()
|
test/test_pgsui.py
DELETED
|
@@ -1,374 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python
|
|
2
|
-
|
|
3
|
-
# Standard library imports
|
|
4
|
-
import os
|
|
5
|
-
import sys
|
|
6
|
-
|
|
7
|
-
from contextlib import redirect_stdout
|
|
8
|
-
|
|
9
|
-
try:
|
|
10
|
-
from importlib.resources import files, as_file
|
|
11
|
-
except ImportError:
|
|
12
|
-
# Try backported to PY<37 `importlib_resources`.
|
|
13
|
-
from importlib_resources import files, as_file
|
|
14
|
-
|
|
15
|
-
import numpy as np
|
|
16
|
-
import pandas as pd
|
|
17
|
-
import scipy.stats as stats
|
|
18
|
-
|
|
19
|
-
from sklearn_genetic.space import Continuous, Categorical, Integer
|
|
20
|
-
|
|
21
|
-
from pgsui import GenotypeData
|
|
22
|
-
from pgsui.impute.estimators import *
|
|
23
|
-
from pgsui.impute.simple_imputers import *
|
|
24
|
-
from pgsui.example_data import structure_files, phylip_files, popmaps, trees
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
def main():
|
|
28
|
-
"""Test all PG-SUI Methods.
|
|
29
|
-
|
|
30
|
-
Can be invoked by typing 'pgsuitest' on the command line.
|
|
31
|
-
"""
|
|
32
|
-
|
|
33
|
-
# Redirect stdout to logfile
|
|
34
|
-
with open("pgsuitest.log.txt", "w") as logfile:
|
|
35
|
-
with redirect_stdout(logfile):
|
|
36
|
-
testaln = {
|
|
37
|
-
"phylip": "test_n10.phy",
|
|
38
|
-
"structure2row": "test.nopops.2row.10sites.str",
|
|
39
|
-
"structure2rowPopID": "test.pops.2row.10sites.str",
|
|
40
|
-
"structure1row": "test.nopops.1row.10sites.str",
|
|
41
|
-
"structure1rowPopID": "test.pops.1row.10sites.str",
|
|
42
|
-
}
|
|
43
|
-
|
|
44
|
-
popmap = "test.popmap"
|
|
45
|
-
tre = "test.tre"
|
|
46
|
-
iqtre = "test.iqtree"
|
|
47
|
-
qmat = "test.qmat"
|
|
48
|
-
siterateiqtree = "test_n10.rate"
|
|
49
|
-
siterate = "test_siterates_n10.txt"
|
|
50
|
-
prefix = "setuptest"
|
|
51
|
-
t = ".xxinput_treexx.tre" # temporary treefile
|
|
52
|
-
|
|
53
|
-
strfile = files(structure_files).joinpath(testaln["structure2row"])
|
|
54
|
-
popmapfile = files(popmaps).joinpath(popmap)
|
|
55
|
-
treefile = files(trees).joinpath(tre)
|
|
56
|
-
iqtreeqmatfile = files(trees).joinpath(iqtre)
|
|
57
|
-
qmatfile = files(trees).joinpath(qmat)
|
|
58
|
-
siteratefileiqtree = files(trees).joinpath(siterateiqtree)
|
|
59
|
-
siteratefile = files(trees).joinpath(siterate)
|
|
60
|
-
|
|
61
|
-
with as_file(popmapfile) as m, as_file(
|
|
62
|
-
treefile
|
|
63
|
-
) as guidetree, as_file(iqtreeqmatfile) as i, as_file(
|
|
64
|
-
qmatfile
|
|
65
|
-
) as q, as_file(
|
|
66
|
-
siteratefileiqtree
|
|
67
|
-
) as siq, as_file(
|
|
68
|
-
siteratefile
|
|
69
|
-
) as s:
|
|
70
|
-
|
|
71
|
-
# Added this code block because for some reason toytree won't
|
|
72
|
-
# read the as_file() temporary file using the context manager.
|
|
73
|
-
with open(guidetree, "r") as fin:
|
|
74
|
-
input_tree = fin.read()
|
|
75
|
-
with open(t, "w") as fout:
|
|
76
|
-
fout.write(input_tree)
|
|
77
|
-
|
|
78
|
-
print("############################################")
|
|
79
|
-
print("### TESTING GenotypeData WITH EACH FILETYPE")
|
|
80
|
-
print("############################################")
|
|
81
|
-
print("\n")
|
|
82
|
-
|
|
83
|
-
for ft, aln in testaln.items():
|
|
84
|
-
if ft == "phylip":
|
|
85
|
-
data_dir = phylip_files
|
|
86
|
-
else:
|
|
87
|
-
data_dir = structure_files
|
|
88
|
-
|
|
89
|
-
alnfile = files(data_dir).joinpath(aln)
|
|
90
|
-
|
|
91
|
-
print("--------------------------------------------------")
|
|
92
|
-
print(f"--- Testing GenotypeData with {ft} filetype...")
|
|
93
|
-
print("--------------------------------------------------")
|
|
94
|
-
print("\n")
|
|
95
|
-
|
|
96
|
-
with as_file(alnfile) as a:
|
|
97
|
-
data = GenotypeData(
|
|
98
|
-
filename=a,
|
|
99
|
-
filetype=ft,
|
|
100
|
-
popmapfile=m,
|
|
101
|
-
guidetree=t,
|
|
102
|
-
qmatrix_iqtree=i,
|
|
103
|
-
siterates_iqtree=siq,
|
|
104
|
-
)
|
|
105
|
-
|
|
106
|
-
print("-------------------------------------------------------")
|
|
107
|
-
print("--- Testing GenotypeData with non-iqtree rates files...")
|
|
108
|
-
print("-------------------------------------------------------")
|
|
109
|
-
print("\n")
|
|
110
|
-
|
|
111
|
-
with as_file(strfile) as a:
|
|
112
|
-
data = GenotypeData(
|
|
113
|
-
filename=a,
|
|
114
|
-
filetype="structure2row",
|
|
115
|
-
popmapfile=m,
|
|
116
|
-
guidetree=t,
|
|
117
|
-
qmatrix=q,
|
|
118
|
-
siterates=s,
|
|
119
|
-
)
|
|
120
|
-
|
|
121
|
-
data = GenotypeData(
|
|
122
|
-
filename=a,
|
|
123
|
-
filetype="structure2row",
|
|
124
|
-
popmapfile=m,
|
|
125
|
-
guidetree=t,
|
|
126
|
-
qmatrix_iqtree=i,
|
|
127
|
-
siterates_iqtree=siq,
|
|
128
|
-
)
|
|
129
|
-
|
|
130
|
-
print("++++++++++++++++++++++++++++++++")
|
|
131
|
-
print("+++ SUCCESS!")
|
|
132
|
-
print("++++++++++++++++++++++++++++++++")
|
|
133
|
-
print("\n")
|
|
134
|
-
|
|
135
|
-
print("################################")
|
|
136
|
-
print("### TESTING SIMPLE IMPUTERS...")
|
|
137
|
-
print("################################")
|
|
138
|
-
print("\n")
|
|
139
|
-
|
|
140
|
-
print("-----------------------------------------------------")
|
|
141
|
-
print("--- Testing ImputeAlleleFreq by-population...")
|
|
142
|
-
print("-----------------------------------------------------")
|
|
143
|
-
print("\n")
|
|
144
|
-
|
|
145
|
-
afpops = ImputeAlleleFreq(
|
|
146
|
-
genotype_data=data,
|
|
147
|
-
by_populations=True,
|
|
148
|
-
prefix=prefix,
|
|
149
|
-
write_output=False,
|
|
150
|
-
)
|
|
151
|
-
|
|
152
|
-
print("-----------------------------------------------------")
|
|
153
|
-
print("--- Testing ImputeAlleleFreq global...")
|
|
154
|
-
print("-----------------------------------------------------")
|
|
155
|
-
print("\n")
|
|
156
|
-
|
|
157
|
-
afpops = ImputeAlleleFreq(
|
|
158
|
-
genotype_data=data,
|
|
159
|
-
by_populations=False,
|
|
160
|
-
prefix=prefix,
|
|
161
|
-
write_output=False,
|
|
162
|
-
)
|
|
163
|
-
|
|
164
|
-
print("-----------------------------------------------------")
|
|
165
|
-
print("--- Testing ImputePhylo...")
|
|
166
|
-
print("-----------------------------------------------------")
|
|
167
|
-
print("\n")
|
|
168
|
-
|
|
169
|
-
phylo = ImputePhylo(
|
|
170
|
-
genotype_data=data,
|
|
171
|
-
prefix=prefix,
|
|
172
|
-
disable_progressbar=True,
|
|
173
|
-
write_output=False,
|
|
174
|
-
)
|
|
175
|
-
|
|
176
|
-
print("-----------------------------------------------------")
|
|
177
|
-
print("--- Testing ImputeNMF...")
|
|
178
|
-
print("-----------------------------------------------------")
|
|
179
|
-
print("\n")
|
|
180
|
-
|
|
181
|
-
mf = ImputeNMF(
|
|
182
|
-
genotype_data=data,
|
|
183
|
-
prefix=prefix,
|
|
184
|
-
write_output=False,
|
|
185
|
-
)
|
|
186
|
-
|
|
187
|
-
print("++++++++++++++++++++++++++++++++")
|
|
188
|
-
print("+++ SUCCESS!")
|
|
189
|
-
print("++++++++++++++++++++++++++++++++")
|
|
190
|
-
print("\n")
|
|
191
|
-
|
|
192
|
-
##############################################
|
|
193
|
-
### Make gridparams
|
|
194
|
-
##############################################
|
|
195
|
-
|
|
196
|
-
# For randomizedsearchcv
|
|
197
|
-
# Number of trees in random forest
|
|
198
|
-
n_estimators = [
|
|
199
|
-
int(x) for x in np.linspace(start=100, stop=1000, num=10)
|
|
200
|
-
]
|
|
201
|
-
|
|
202
|
-
# Number of features to consider at every split
|
|
203
|
-
max_features = ["sqrt", "log2"]
|
|
204
|
-
|
|
205
|
-
# Maximum number of levels in the tree
|
|
206
|
-
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
|
|
207
|
-
max_depth.append(None)
|
|
208
|
-
|
|
209
|
-
# Minimmum number of samples required to split a node
|
|
210
|
-
min_samples_split = [int(x) for x in np.linspace(2, 10, num=5)]
|
|
211
|
-
|
|
212
|
-
# Minimum number of samples required at each leaf node
|
|
213
|
-
min_samples_leaf = [int(x) for x in np.linspace(1, 5, num=5)]
|
|
214
|
-
|
|
215
|
-
# Proportion of dataset to use with bootstrapping
|
|
216
|
-
# max_samples = [x for x in np.linspace(0.5, 1.0, num=6)]
|
|
217
|
-
|
|
218
|
-
# Random Forest gridparams - RandomizedSearchCV
|
|
219
|
-
grid_params_random = {
|
|
220
|
-
"max_features": max_features,
|
|
221
|
-
"max_depth": max_depth,
|
|
222
|
-
"min_samples_split": min_samples_split,
|
|
223
|
-
"min_samples_leaf": min_samples_leaf,
|
|
224
|
-
}
|
|
225
|
-
|
|
226
|
-
# Genetic Algorithm grid_params
|
|
227
|
-
grid_params_ga = {
|
|
228
|
-
"max_features": Categorical(["sqrt", "log2"]),
|
|
229
|
-
"min_samples_split": Integer(2, 10),
|
|
230
|
-
"min_samples_leaf": Integer(1, 10),
|
|
231
|
-
"max_depth": Integer(2, 110),
|
|
232
|
-
}
|
|
233
|
-
|
|
234
|
-
print("#################################")
|
|
235
|
-
print("### TESTING IterativeImputer...")
|
|
236
|
-
print("#################################")
|
|
237
|
-
print("\n")
|
|
238
|
-
|
|
239
|
-
print("-----------------------------------------------------")
|
|
240
|
-
print(
|
|
241
|
-
"--- Testing ImputeRandomForest with randomized grid\n"
|
|
242
|
-
"--- search and initial_strategy == 'populations'..."
|
|
243
|
-
)
|
|
244
|
-
print("-----------------------------------------------------")
|
|
245
|
-
print("\n")
|
|
246
|
-
|
|
247
|
-
# Random forest imputation with RandomizedSearchCV grid search
|
|
248
|
-
rf_imp = ImputeRandomForest(
|
|
249
|
-
data,
|
|
250
|
-
prefix=prefix,
|
|
251
|
-
n_estimators=50,
|
|
252
|
-
n_nearest_features=2,
|
|
253
|
-
gridparams=grid_params_random,
|
|
254
|
-
cv=3,
|
|
255
|
-
grid_iter=40,
|
|
256
|
-
n_jobs=-1,
|
|
257
|
-
max_iter=2,
|
|
258
|
-
column_subset=1.0,
|
|
259
|
-
ga=False,
|
|
260
|
-
disable_progressbar=True,
|
|
261
|
-
extratrees=False,
|
|
262
|
-
mutation_probability=0.1,
|
|
263
|
-
chunk_size=1.0,
|
|
264
|
-
initial_strategy="populations",
|
|
265
|
-
)
|
|
266
|
-
|
|
267
|
-
print("-----------------------------------------------------")
|
|
268
|
-
print(
|
|
269
|
-
"--- Testing ImputeRandomForest with GA grid search and\n"
|
|
270
|
-
"--- initial_strategy == 'phylogeny'..."
|
|
271
|
-
)
|
|
272
|
-
print("-----------------------------------------------------")
|
|
273
|
-
print("\n")
|
|
274
|
-
|
|
275
|
-
# Genetic Algorithm grid search Test
|
|
276
|
-
rf_imp2 = ImputeRandomForest(
|
|
277
|
-
data,
|
|
278
|
-
prefix=prefix,
|
|
279
|
-
n_estimators=50,
|
|
280
|
-
n_nearest_features=2,
|
|
281
|
-
gridparams=grid_params_ga,
|
|
282
|
-
cv=3,
|
|
283
|
-
grid_iter=40,
|
|
284
|
-
n_jobs=-1,
|
|
285
|
-
max_iter=2,
|
|
286
|
-
column_subset=1.0,
|
|
287
|
-
ga=True,
|
|
288
|
-
disable_progressbar=True,
|
|
289
|
-
extratrees=False,
|
|
290
|
-
chunk_size=1.0,
|
|
291
|
-
initial_strategy="phylogeny",
|
|
292
|
-
)
|
|
293
|
-
|
|
294
|
-
print("++++++++++++++++++++++++++++++++")
|
|
295
|
-
print("+++ SUCCESS!")
|
|
296
|
-
print("++++++++++++++++++++++++++++++++")
|
|
297
|
-
print("\n")
|
|
298
|
-
|
|
299
|
-
print("#################################")
|
|
300
|
-
print("TESTING NEURAL NETWORKS...")
|
|
301
|
-
print("#################################")
|
|
302
|
-
print("\n")
|
|
303
|
-
|
|
304
|
-
print("-----------------------------------------------------")
|
|
305
|
-
print(
|
|
306
|
-
"--- Testing VAE with validation procedure with\n"
|
|
307
|
-
"--- intial_strategy='populations'..."
|
|
308
|
-
)
|
|
309
|
-
print("-----------------------------------------------------")
|
|
310
|
-
print("\n")
|
|
311
|
-
|
|
312
|
-
vae = ImputeVAE(
|
|
313
|
-
genotype_data=data,
|
|
314
|
-
prefix=prefix,
|
|
315
|
-
disable_progressbar=True,
|
|
316
|
-
validation_only=1.0,
|
|
317
|
-
initial_strategy="populations",
|
|
318
|
-
cv=3,
|
|
319
|
-
)
|
|
320
|
-
|
|
321
|
-
print("-----------------------------------------------------")
|
|
322
|
-
print(
|
|
323
|
-
"--- Testing ImputeNLPCA with\n"
|
|
324
|
-
"--- initial_strategy == 'phylogeny'..."
|
|
325
|
-
)
|
|
326
|
-
print("-----------------------------------------------------")
|
|
327
|
-
print("\n")
|
|
328
|
-
|
|
329
|
-
nlpca = ImputeNLPCA(
|
|
330
|
-
data,
|
|
331
|
-
n_components=3,
|
|
332
|
-
initial_strategy="phylogeny",
|
|
333
|
-
disable_progressbar=True,
|
|
334
|
-
cv=3,
|
|
335
|
-
hidden_activation="elu",
|
|
336
|
-
hidden_layer_sizes="midpoint",
|
|
337
|
-
validation_only=None,
|
|
338
|
-
num_hidden_layers=1,
|
|
339
|
-
learning_rate=0.1,
|
|
340
|
-
)
|
|
341
|
-
|
|
342
|
-
print("-------------------------------------------------------")
|
|
343
|
-
print("--- Testing ImputeUBP with initial_strategy == 'nmf'...")
|
|
344
|
-
print("-------------------------------------------------------")
|
|
345
|
-
print("\n")
|
|
346
|
-
|
|
347
|
-
ubp = ImputeUBP(
|
|
348
|
-
genotype_data=data,
|
|
349
|
-
initial_strategy="nmf",
|
|
350
|
-
disable_progressbar=True,
|
|
351
|
-
validation_only=None,
|
|
352
|
-
learning_rate=0.1,
|
|
353
|
-
num_hidden_layers=1,
|
|
354
|
-
hidden_layer_sizes=1,
|
|
355
|
-
hidden_activation="elu",
|
|
356
|
-
cv=3,
|
|
357
|
-
n_components=3,
|
|
358
|
-
)
|
|
359
|
-
|
|
360
|
-
print("++++++++++++++++++++++++++++++++")
|
|
361
|
-
print("+++ SUCCESS!")
|
|
362
|
-
print("++++++++++++++++++++++++++++++++")
|
|
363
|
-
print("\n")
|
|
364
|
-
|
|
365
|
-
# Try to remove temporary treefile.
|
|
366
|
-
try:
|
|
367
|
-
os.remove(t)
|
|
368
|
-
except OSError:
|
|
369
|
-
pass
|
|
370
|
-
|
|
371
|
-
print("######################################")
|
|
372
|
-
print("### ALL TESTS PASSED SUCCESSFULLY!")
|
|
373
|
-
print("######################################")
|
|
374
|
-
print("\n")
|