pg-sui 0.2.3__py3-none-any.whl → 1.6.16a3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pg_sui-1.6.16a3.dist-info/METADATA +292 -0
- pg_sui-1.6.16a3.dist-info/RECORD +81 -0
- {pg_sui-0.2.3.dist-info → pg_sui-1.6.16a3.dist-info}/WHEEL +1 -1
- pg_sui-1.6.16a3.dist-info/entry_points.txt +4 -0
- {pg_sui-0.2.3.dist-info → pg_sui-1.6.16a3.dist-info/licenses}/LICENSE +0 -0
- pg_sui-1.6.16a3.dist-info/top_level.txt +1 -0
- pgsui/__init__.py +35 -54
- pgsui/_version.py +34 -0
- pgsui/cli.py +922 -0
- pgsui/data_processing/__init__.py +0 -0
- pgsui/data_processing/config.py +565 -0
- pgsui/data_processing/containers.py +1436 -0
- pgsui/data_processing/transformers.py +557 -907
- pgsui/{example_data/trees → electron/app}/__init__.py +0 -0
- pgsui/electron/app/__main__.py +5 -0
- pgsui/electron/app/extra-resources/.gitkeep +1 -0
- pgsui/electron/app/icons/icons/1024x1024.png +0 -0
- pgsui/electron/app/icons/icons/128x128.png +0 -0
- pgsui/electron/app/icons/icons/16x16.png +0 -0
- pgsui/electron/app/icons/icons/24x24.png +0 -0
- pgsui/electron/app/icons/icons/256x256.png +0 -0
- pgsui/electron/app/icons/icons/32x32.png +0 -0
- pgsui/electron/app/icons/icons/48x48.png +0 -0
- pgsui/electron/app/icons/icons/512x512.png +0 -0
- pgsui/electron/app/icons/icons/64x64.png +0 -0
- pgsui/electron/app/icons/icons/icon.icns +0 -0
- pgsui/electron/app/icons/icons/icon.ico +0 -0
- pgsui/electron/app/main.js +227 -0
- pgsui/electron/app/package-lock.json +6894 -0
- pgsui/electron/app/package.json +51 -0
- pgsui/electron/app/preload.js +15 -0
- pgsui/electron/app/server.py +157 -0
- pgsui/electron/app/ui/logo.png +0 -0
- pgsui/electron/app/ui/renderer.js +131 -0
- pgsui/electron/app/ui/styles.css +59 -0
- pgsui/electron/app/ui/ui_shim.js +72 -0
- pgsui/electron/bootstrap.py +43 -0
- pgsui/electron/launch.py +57 -0
- pgsui/electron/package.json +14 -0
- pgsui/example_data/__init__.py +0 -0
- pgsui/example_data/phylip_files/__init__.py +0 -0
- pgsui/example_data/phylip_files/test.phy +0 -0
- pgsui/example_data/popmaps/__init__.py +0 -0
- pgsui/example_data/popmaps/{test.popmap → phylogen_nomx.popmap} +185 -99
- pgsui/example_data/structure_files/__init__.py +0 -0
- pgsui/example_data/structure_files/test.pops.2row.allsites.str +0 -0
- pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz +0 -0
- pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz.tbi +0 -0
- pgsui/impute/__init__.py +0 -0
- pgsui/impute/deterministic/imputers/allele_freq.py +725 -0
- pgsui/impute/deterministic/imputers/mode.py +844 -0
- pgsui/impute/deterministic/imputers/nmf.py +221 -0
- pgsui/impute/deterministic/imputers/phylo.py +973 -0
- pgsui/impute/deterministic/imputers/ref_allele.py +669 -0
- pgsui/impute/supervised/__init__.py +0 -0
- pgsui/impute/supervised/base.py +343 -0
- pgsui/impute/{unsupervised/models/in_development → supervised/imputers}/__init__.py +0 -0
- pgsui/impute/supervised/imputers/hist_gradient_boosting.py +317 -0
- pgsui/impute/supervised/imputers/random_forest.py +291 -0
- pgsui/impute/unsupervised/__init__.py +0 -0
- pgsui/impute/unsupervised/base.py +1121 -0
- pgsui/impute/unsupervised/callbacks.py +92 -262
- {simulation → pgsui/impute/unsupervised/imputers}/__init__.py +0 -0
- pgsui/impute/unsupervised/imputers/autoencoder.py +1361 -0
- pgsui/impute/unsupervised/imputers/nlpca.py +1666 -0
- pgsui/impute/unsupervised/imputers/ubp.py +1660 -0
- pgsui/impute/unsupervised/imputers/vae.py +1316 -0
- pgsui/impute/unsupervised/loss_functions.py +261 -0
- pgsui/impute/unsupervised/models/__init__.py +0 -0
- pgsui/impute/unsupervised/models/autoencoder_model.py +215 -567
- pgsui/impute/unsupervised/models/nlpca_model.py +155 -394
- pgsui/impute/unsupervised/models/ubp_model.py +180 -1106
- pgsui/impute/unsupervised/models/vae_model.py +269 -630
- pgsui/impute/unsupervised/nn_scorers.py +255 -0
- pgsui/utils/__init__.py +0 -0
- pgsui/utils/classification_viz.py +608 -0
- pgsui/utils/logging_utils.py +22 -0
- pgsui/utils/misc.py +35 -480
- pgsui/utils/plotting.py +996 -829
- pgsui/utils/pretty_metrics.py +290 -0
- pgsui/utils/scorers.py +213 -666
- pg_sui-0.2.3.dist-info/METADATA +0 -322
- pg_sui-0.2.3.dist-info/RECORD +0 -75
- pg_sui-0.2.3.dist-info/top_level.txt +0 -3
- pgsui/example_data/phylip_files/test_n10.phy +0 -118
- pgsui/example_data/phylip_files/test_n100.phy +0 -118
- pgsui/example_data/phylip_files/test_n2.phy +0 -118
- pgsui/example_data/phylip_files/test_n500.phy +0 -118
- pgsui/example_data/structure_files/test.nopops.1row.10sites.str +0 -117
- pgsui/example_data/structure_files/test.nopops.2row.100sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.10sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.30sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.allsites.str +0 -234
- pgsui/example_data/structure_files/test.pops.1row.10sites.str +0 -117
- pgsui/example_data/structure_files/test.pops.2row.10sites.str +0 -234
- pgsui/example_data/trees/test.iqtree +0 -376
- pgsui/example_data/trees/test.qmat +0 -5
- pgsui/example_data/trees/test.rate +0 -2033
- pgsui/example_data/trees/test.tre +0 -1
- pgsui/example_data/trees/test_n10.rate +0 -19
- pgsui/example_data/trees/test_n100.rate +0 -109
- pgsui/example_data/trees/test_n500.rate +0 -509
- pgsui/example_data/trees/test_siterates.txt +0 -2024
- pgsui/example_data/trees/test_siterates_n10.txt +0 -10
- pgsui/example_data/trees/test_siterates_n100.txt +0 -100
- pgsui/example_data/trees/test_siterates_n500.txt +0 -500
- pgsui/example_data/vcf_files/test.vcf +0 -244
- pgsui/example_data/vcf_files/test.vcf.gz +0 -0
- pgsui/example_data/vcf_files/test.vcf.gz.tbi +0 -0
- pgsui/impute/estimators.py +0 -1268
- pgsui/impute/impute.py +0 -1463
- pgsui/impute/simple_imputers.py +0 -1431
- pgsui/impute/supervised/iterative_imputer_fixedparams.py +0 -782
- pgsui/impute/supervised/iterative_imputer_gridsearch.py +0 -1024
- pgsui/impute/unsupervised/keras_classifiers.py +0 -697
- pgsui/impute/unsupervised/models/in_development/cnn_model.py +0 -486
- pgsui/impute/unsupervised/neural_network_imputers.py +0 -1440
- pgsui/impute/unsupervised/neural_network_methods.py +0 -1395
- pgsui/pg_sui.py +0 -261
- pgsui/utils/sequence_tools.py +0 -407
- simulation/sim_benchmarks.py +0 -333
- simulation/sim_treeparams.py +0 -475
- test/__init__.py +0 -0
- test/pg_sui_simtest.py +0 -215
- test/pg_sui_testing.py +0 -523
- test/test.py +0 -151
- test/test_pgsui.py +0 -374
- test/test_tkc.py +0 -185
test/pg_sui_testing.py
DELETED
|
@@ -1,523 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python
|
|
2
|
-
|
|
3
|
-
# Standard library imports
|
|
4
|
-
import argparse
|
|
5
|
-
import sys
|
|
6
|
-
|
|
7
|
-
import numpy as np
|
|
8
|
-
import pandas as pd
|
|
9
|
-
import scipy.stats as stats
|
|
10
|
-
|
|
11
|
-
from sklearn_genetic.space import Continuous, Categorical, Integer
|
|
12
|
-
|
|
13
|
-
# from pgsui import GenotypeData
|
|
14
|
-
from snpio import GenotypeData
|
|
15
|
-
from impute.estimators import (
|
|
16
|
-
ImputeNLPCA,
|
|
17
|
-
ImputeUBP,
|
|
18
|
-
ImputeRandomForest,
|
|
19
|
-
ImputeVAE,
|
|
20
|
-
)
|
|
21
|
-
from impute.simple_imputers import ImputePhylo
|
|
22
|
-
|
|
23
|
-
# from snpio import GenotypeData
|
|
24
|
-
# from impute.estimators import *
|
|
25
|
-
# from impute.simple_imputers import ImputeAlleleFreq, ImputePhylo
|
|
26
|
-
|
|
27
|
-
# from read_input import GenotypeData
|
|
28
|
-
# from estimators import *
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
def main():
|
|
32
|
-
"""Class instantiations and main package body"""
|
|
33
|
-
|
|
34
|
-
args = get_arguments()
|
|
35
|
-
|
|
36
|
-
if args.str and args.phylip:
|
|
37
|
-
sys.exit("Error: Only one file type can be specified")
|
|
38
|
-
|
|
39
|
-
# If VCF file is specified.
|
|
40
|
-
if args.str:
|
|
41
|
-
if not args.pop_ids and args.popmap is None:
|
|
42
|
-
raise TypeError("Either --pop_ids or --popmap must be specified\n")
|
|
43
|
-
|
|
44
|
-
if args.pop_ids:
|
|
45
|
-
print("\n--pop_ids was specified as column 2\n")
|
|
46
|
-
else:
|
|
47
|
-
print(
|
|
48
|
-
"\n--pop_ids was not specified; "
|
|
49
|
-
"using popmap file to get population IDs\n"
|
|
50
|
-
)
|
|
51
|
-
|
|
52
|
-
if args.onerow_perind:
|
|
53
|
-
print("\nUsing one row per individual...\n")
|
|
54
|
-
else:
|
|
55
|
-
print("\nUsing two rows per individual...\n")
|
|
56
|
-
|
|
57
|
-
if args.onerow_perind:
|
|
58
|
-
data = GenotypeData(
|
|
59
|
-
filename=args.str,
|
|
60
|
-
filetype="structure1row",
|
|
61
|
-
popmapfile=args.popmap,
|
|
62
|
-
guidetree=args.treefile,
|
|
63
|
-
qmatrix_iqtree=args.iqtree,
|
|
64
|
-
)
|
|
65
|
-
else:
|
|
66
|
-
data = GenotypeData(
|
|
67
|
-
filename=args.str,
|
|
68
|
-
filetype="structure2row",
|
|
69
|
-
popmapfile=args.popmap,
|
|
70
|
-
guidetree=args.treefile,
|
|
71
|
-
qmatrix_iqtree=args.iqtree,
|
|
72
|
-
)
|
|
73
|
-
|
|
74
|
-
if args.phylip:
|
|
75
|
-
if args.pop_ids or args.onerow_perind:
|
|
76
|
-
print(
|
|
77
|
-
"\nPhylip file was used with structure arguments; ignoring "
|
|
78
|
-
"structure file arguments\n"
|
|
79
|
-
)
|
|
80
|
-
|
|
81
|
-
if args.popmap is None:
|
|
82
|
-
raise TypeError("No popmap file supplied with PHYLIP file\n")
|
|
83
|
-
|
|
84
|
-
data = GenotypeData(
|
|
85
|
-
filename=args.phylip,
|
|
86
|
-
filetype="phylip",
|
|
87
|
-
popmapfile=args.popmap,
|
|
88
|
-
guidetree=args.treefile,
|
|
89
|
-
qmatrix_iqtree=args.iqtree,
|
|
90
|
-
siterates_iqtree="pgsui/example_data/trees/test_n10.rate",
|
|
91
|
-
)
|
|
92
|
-
|
|
93
|
-
if args.resume_imputed:
|
|
94
|
-
pass
|
|
95
|
-
# data.read_imputed(args.resume_imputed, impute_methods="rf")
|
|
96
|
-
# data.write_imputed(data.imputed_rf_df, args.prefix)
|
|
97
|
-
|
|
98
|
-
else:
|
|
99
|
-
# For randomizedsearchcv
|
|
100
|
-
# Number of trees in random forest
|
|
101
|
-
n_estimators = [
|
|
102
|
-
int(x) for x in np.linspace(start=100, stop=1000, num=10)
|
|
103
|
-
]
|
|
104
|
-
|
|
105
|
-
# Number of features to consider at every split
|
|
106
|
-
max_features = ["sqrt", "log2"]
|
|
107
|
-
|
|
108
|
-
# Maximum number of levels in the tree
|
|
109
|
-
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
|
|
110
|
-
max_depth.append(None)
|
|
111
|
-
|
|
112
|
-
# Minimmum number of samples required to split a node
|
|
113
|
-
min_samples_split = [int(x) for x in np.linspace(2, 10, num=5)]
|
|
114
|
-
|
|
115
|
-
# Minimum number of samples required at each leaf node
|
|
116
|
-
min_samples_leaf = [int(x) for x in np.linspace(1, 5, num=5)]
|
|
117
|
-
|
|
118
|
-
# Proportion of dataset to use with bootstrapping
|
|
119
|
-
# max_samples = [x for x in np.linspace(0.5, 1.0, num=6)]
|
|
120
|
-
|
|
121
|
-
# # Random Forest gridparams - RandomizedSearchCV
|
|
122
|
-
# grid_params = {
|
|
123
|
-
# "max_features": max_features,
|
|
124
|
-
# "max_depth": max_depth,
|
|
125
|
-
# "min_samples_split": min_samples_split,
|
|
126
|
-
# "min_samples_leaf": min_samples_leaf,
|
|
127
|
-
# }
|
|
128
|
-
|
|
129
|
-
# Random Forest gridparams - Genetic Algorithms
|
|
130
|
-
# grid_params = {
|
|
131
|
-
# "n_estimators": Integer(100, 500),
|
|
132
|
-
# "max_features": max_features,
|
|
133
|
-
# "max_depth": max_depth,
|
|
134
|
-
# "min_samples_split": min_samples_split,
|
|
135
|
-
# "min_samples_leaf": min_samples_leaf,
|
|
136
|
-
# "max_samples": max_samples
|
|
137
|
-
# }
|
|
138
|
-
|
|
139
|
-
# # Genetic Algorithm grid_params
|
|
140
|
-
# grid_params = {
|
|
141
|
-
# "max_features": Categorical(["sqrt", "log2"]),
|
|
142
|
-
# "min_samples_split": Integer(2, 10),
|
|
143
|
-
# "min_samples_leaf": Integer(1, 10),
|
|
144
|
-
# "max_depth": Integer(2, 110),
|
|
145
|
-
# }
|
|
146
|
-
|
|
147
|
-
# Bayesian Ridge gridparams - RandomizedSearchCV
|
|
148
|
-
# grid_params = {
|
|
149
|
-
# "alpha_1": stats.loguniform(1e-6, 0.01),
|
|
150
|
-
# "alpha_2": stats.loguniform(1e-6, 0.01),
|
|
151
|
-
# "lambda_1": stats.loguniform(1e-6, 0.01),
|
|
152
|
-
# "lambda_2": stats.loguniform(1e-6, 0.01),
|
|
153
|
-
# }
|
|
154
|
-
|
|
155
|
-
# # Bayesian Ridge gridparams - Genetic algorithm
|
|
156
|
-
# grid_params = {
|
|
157
|
-
# "alpha_1": Continuous(1e-6, 1e-3, distribution="log-uniform"),
|
|
158
|
-
# "alpha_2": Continuous(1e-6, 1e-3, distribution="log-uniform"),
|
|
159
|
-
# "lambda_1": Continuous(1e-6, 1e-3, distribution="log-uniform"),
|
|
160
|
-
# "lambda_2": Continuous(1e-6, 1e-3, distribution="log-uniform")
|
|
161
|
-
# }
|
|
162
|
-
|
|
163
|
-
# # Random forest imputation with genetic algorithm grid search
|
|
164
|
-
# rf_imp = ImputeRandomForest(
|
|
165
|
-
# data,
|
|
166
|
-
# prefix=args.prefix,
|
|
167
|
-
# n_estimators=50,
|
|
168
|
-
# n_nearest_features=1,
|
|
169
|
-
# gridparams=grid_params,
|
|
170
|
-
# cv=3,
|
|
171
|
-
# grid_iter=40,
|
|
172
|
-
# n_jobs=4,
|
|
173
|
-
# max_iter=2,
|
|
174
|
-
# column_subset=1.0,
|
|
175
|
-
# ga=False,
|
|
176
|
-
# disable_progressbar=True,
|
|
177
|
-
# extratrees=False,
|
|
178
|
-
# mutation_probability=0.1,
|
|
179
|
-
# progress_update_percent=20,
|
|
180
|
-
# chunk_size=1.0,
|
|
181
|
-
# initial_strategy="phylogeny",
|
|
182
|
-
# )
|
|
183
|
-
|
|
184
|
-
# # Genetic Algorithm grid search Test
|
|
185
|
-
# rf_imp2 = ImputeRandomForest(
|
|
186
|
-
# data,
|
|
187
|
-
# prefix=args.prefix,
|
|
188
|
-
# n_estimators=50,
|
|
189
|
-
# n_nearest_features=2,
|
|
190
|
-
# gridparams=grid_params,
|
|
191
|
-
# cv=3,
|
|
192
|
-
# grid_iter=40,
|
|
193
|
-
# n_jobs=-1,
|
|
194
|
-
# max_iter=2,
|
|
195
|
-
# column_subset=1.0,
|
|
196
|
-
# ga=True,
|
|
197
|
-
# disable_progressbar=True,
|
|
198
|
-
# extratrees=False,
|
|
199
|
-
# chunk_size=1.0,
|
|
200
|
-
# initial_strategy="phylogeny",
|
|
201
|
-
# )
|
|
202
|
-
|
|
203
|
-
# rfdata = rf_imp.imputed
|
|
204
|
-
# print(rfdata.genotypes012_df)
|
|
205
|
-
|
|
206
|
-
# rf_data = rf_imp.imputed
|
|
207
|
-
# print(data.genotypes012_df)
|
|
208
|
-
# print(rf_data.genotypes012_df)
|
|
209
|
-
|
|
210
|
-
# imp_decoded = data.decode_imputed(rf_imp.imputed)
|
|
211
|
-
# print(imp_decoded)
|
|
212
|
-
|
|
213
|
-
# # RandomizedSearchCV Test
|
|
214
|
-
# rf_imp = ImputeRandomForest(
|
|
215
|
-
# data,
|
|
216
|
-
# prefix=args.prefix,
|
|
217
|
-
# n_estimators=50,
|
|
218
|
-
# n_nearest_features=3,
|
|
219
|
-
# gridparams=grid_params,
|
|
220
|
-
# cv=3,
|
|
221
|
-
# grid_iter=40,
|
|
222
|
-
# n_jobs=4,
|
|
223
|
-
# max_iter=2,
|
|
224
|
-
# column_subset=5,
|
|
225
|
-
# ga=False,
|
|
226
|
-
# disable_progressbar=False,
|
|
227
|
-
# extratrees=False,
|
|
228
|
-
# progress_update_percent=20,
|
|
229
|
-
# chunk_size=0.2,
|
|
230
|
-
# initial_strategy="phylogeny",
|
|
231
|
-
# )
|
|
232
|
-
|
|
233
|
-
# lgbm = ImputeLightGBM(
|
|
234
|
-
# data,
|
|
235
|
-
# prefix=args.prefix,
|
|
236
|
-
# cv=3,
|
|
237
|
-
# n_jobs=4,
|
|
238
|
-
# n_estimators=50,
|
|
239
|
-
# disable_progressbar=True,
|
|
240
|
-
# chunk_size=0.2,
|
|
241
|
-
# validation_only=0.1,
|
|
242
|
-
# n_nearest_features=3,
|
|
243
|
-
# max_iter=2,
|
|
244
|
-
# initial_strategy="populations",
|
|
245
|
-
# )
|
|
246
|
-
|
|
247
|
-
# vae = ImputeVAE(
|
|
248
|
-
# genotype_data=data,
|
|
249
|
-
# prefix=args.prefix,
|
|
250
|
-
# disable_progressbar=True,
|
|
251
|
-
# validation_only=None,
|
|
252
|
-
# initial_strategy="populations",
|
|
253
|
-
# )
|
|
254
|
-
|
|
255
|
-
# vae_gtdata = vae.imputed
|
|
256
|
-
# print(vae_gtdata.genotypes012_df)
|
|
257
|
-
|
|
258
|
-
# complete_encoded = imputer.train(train_epochs=300, batch_size=256)
|
|
259
|
-
# print(complete_encoded)
|
|
260
|
-
|
|
261
|
-
# rf_imp = ImputeRandomForest(
|
|
262
|
-
# data,
|
|
263
|
-
# prefix=args.prefix,
|
|
264
|
-
# n_estimators=50,
|
|
265
|
-
# n_nearest_features=3,
|
|
266
|
-
# n_jobs=4,
|
|
267
|
-
# max_iter=2,
|
|
268
|
-
# disable_progressbar=True,
|
|
269
|
-
# extratrees=False,
|
|
270
|
-
# max_features="sqrt",
|
|
271
|
-
# min_samples_split=5,
|
|
272
|
-
# min_samples_leaf=2,
|
|
273
|
-
# max_depth=30,
|
|
274
|
-
# cv=3,
|
|
275
|
-
# validation_only=0.3,
|
|
276
|
-
# chunk_size=1.0,
|
|
277
|
-
# initial_strategy="populations",
|
|
278
|
-
# )
|
|
279
|
-
|
|
280
|
-
# afpops = ImputeAlleleFreq(
|
|
281
|
-
# genotype_data=data,
|
|
282
|
-
# by_populations=True,
|
|
283
|
-
# prefix=args.prefix,
|
|
284
|
-
# )
|
|
285
|
-
|
|
286
|
-
# print(data.genotypes012_df)
|
|
287
|
-
# print(afpops.genotypes012_df)
|
|
288
|
-
|
|
289
|
-
# br_imp = ImputeBayesianRidge(data, prefix=args.prefix, n_iter=100, gridparams=grid_params, grid_iter=3, cv=3, n_jobs=4, max_iter=5, n_nearest_features=3, column_subset=4, ga=False, disable_progressbar=True, progress_update_percent=20, chunk_size=1.0)
|
|
290
|
-
|
|
291
|
-
# aftestpops = ImputeAlleleFreq(
|
|
292
|
-
# genotype_data=data, by_populations=True, prefix=args.prefix
|
|
293
|
-
# )
|
|
294
|
-
|
|
295
|
-
# aftestpops_data = aftestpops.imputed
|
|
296
|
-
|
|
297
|
-
# print(data.genotypes012_df)
|
|
298
|
-
# print(aftestpops_data.genotypes012_df)
|
|
299
|
-
|
|
300
|
-
# vae = ImputeVAE(
|
|
301
|
-
# gt=np.array([[0, 1], [-9, 1], [2, -9]]),
|
|
302
|
-
# initial_strategy="most_frequent",
|
|
303
|
-
# cv=3,
|
|
304
|
-
# validation_only=None,
|
|
305
|
-
# )
|
|
306
|
-
|
|
307
|
-
# vae_data = vae.imputed
|
|
308
|
-
|
|
309
|
-
# print(data.genotypes012_df)
|
|
310
|
-
# print(vae_data.genotypes012_df)
|
|
311
|
-
|
|
312
|
-
# For GridSearchCV. Generate parameters to sample from.
|
|
313
|
-
learning_rate = [float(10) ** x for x in np.arange(-4, 0)]
|
|
314
|
-
l1_penalty = [float(10) ** x for x in np.arange(-6, -1)]
|
|
315
|
-
l1_penalty.append(0.0)
|
|
316
|
-
l2_penalty = [float(10) ** x for x in np.arange(-6, -1)]
|
|
317
|
-
l2_penalty.append(0.0)
|
|
318
|
-
hidden_activation = ["elu", "relu"]
|
|
319
|
-
num_hidden_layers = [1, 2, 3, 4, 5]
|
|
320
|
-
hidden_layer_sizes = ["sqrt", "midpoint"]
|
|
321
|
-
n_components = [2, 3]
|
|
322
|
-
dropout_rate = [round(x, 1) for x in np.arange(0.0, 1.0, 0.1)]
|
|
323
|
-
batch_size = [16, 32, 48, 64]
|
|
324
|
-
optimizer = ["adam", "sgd", "adagrad"]
|
|
325
|
-
|
|
326
|
-
# grid_params = {
|
|
327
|
-
# "learning_rate": Continuous(1e-6, 0.1, distribution="log-uniform"),
|
|
328
|
-
# "l2_penalty": Continuous(1e-6, 0.01, distribution="uniform"),
|
|
329
|
-
# "n_components": Integer(2, 3),
|
|
330
|
-
# # "hidden_activation": Categorical(["elu", "relu"]),
|
|
331
|
-
# }
|
|
332
|
-
|
|
333
|
-
grid_params = {
|
|
334
|
-
# "learning_rate": learning_rate,
|
|
335
|
-
# "l1_penalty": l1_penalty,
|
|
336
|
-
"l2_penalty": l2_penalty,
|
|
337
|
-
# "hidden_activation": hidden_activation,
|
|
338
|
-
# "hidden_layer_sizes": hidden_layer_sizes,
|
|
339
|
-
"n_components": n_components,
|
|
340
|
-
# "dropout_rate": dropout_rate,
|
|
341
|
-
# "batch_size": batch_size,
|
|
342
|
-
# "optimizer": optimizer,
|
|
343
|
-
}
|
|
344
|
-
|
|
345
|
-
ubp = ImputeUBP(
|
|
346
|
-
data,
|
|
347
|
-
disable_progressbar=False,
|
|
348
|
-
cv=3,
|
|
349
|
-
column_subset=1.0,
|
|
350
|
-
validation_split=0.0,
|
|
351
|
-
learning_rate=0.1,
|
|
352
|
-
num_hidden_layers=1,
|
|
353
|
-
verbose=1,
|
|
354
|
-
dropout_rate=0.2,
|
|
355
|
-
hidden_activation="elu",
|
|
356
|
-
batch_size=64,
|
|
357
|
-
l1_penalty=1e-6,
|
|
358
|
-
l2_penalty=1e-6,
|
|
359
|
-
gridparams=grid_params,
|
|
360
|
-
n_jobs=4,
|
|
361
|
-
grid_iter=5,
|
|
362
|
-
sim_strategy="nonrandom_weighted",
|
|
363
|
-
sim_prop_missing=0.4,
|
|
364
|
-
scoring_metric="precision_recall_macro",
|
|
365
|
-
gridsearch_method="randomized_gridsearch",
|
|
366
|
-
early_stop_gen=5,
|
|
367
|
-
# sample_weights={0: 1.0, 1: 0.0, 2: 1.0},
|
|
368
|
-
# sample_weights="auto",
|
|
369
|
-
)
|
|
370
|
-
|
|
371
|
-
# ubp = ImputeVAE(
|
|
372
|
-
# data,
|
|
373
|
-
# # gridparams=grid_params,
|
|
374
|
-
# # initial_strategy="populations",
|
|
375
|
-
# # disable_progressbar=True,
|
|
376
|
-
# # cv=3,
|
|
377
|
-
# # column_subset=1.0,
|
|
378
|
-
# # validation_size=0.3,
|
|
379
|
-
# # learning_rate=0.1,
|
|
380
|
-
# # num_hidden_layers=1,
|
|
381
|
-
# # verbose=1,
|
|
382
|
-
# # gridparams=grid_params,
|
|
383
|
-
# )
|
|
384
|
-
|
|
385
|
-
# nlpca_data = nlpca.imputed
|
|
386
|
-
# print(nlpca_data.genotypes012_df)
|
|
387
|
-
|
|
388
|
-
# print(data.genotypes012_df)
|
|
389
|
-
# print(nlpca_data.genotypes012_df)
|
|
390
|
-
|
|
391
|
-
# ubp = ImputeUBP(
|
|
392
|
-
# genotype_data=data,
|
|
393
|
-
# test_categorical=np.array([[0, 1], [-9, 1], [2, -9]]),
|
|
394
|
-
# )
|
|
395
|
-
|
|
396
|
-
# ubp = ImputeVAE(
|
|
397
|
-
# gt=np.array([[0, 1], [-9, 1], [2, -9]]),
|
|
398
|
-
# initial_strategy="most_frequent",
|
|
399
|
-
# )
|
|
400
|
-
|
|
401
|
-
# br_imp = ImputeBayesianRidge(
|
|
402
|
-
# data,
|
|
403
|
-
# prefix=args.prefix,
|
|
404
|
-
# alpha_1=0.0002689638465560243,
|
|
405
|
-
# alpha_2=0.0001473822173361299,
|
|
406
|
-
# lambda_1=0.0003281735206234651,
|
|
407
|
-
# lambda_2=0.00020767920087590963,
|
|
408
|
-
# n_iter=100,
|
|
409
|
-
# n_nearest_features=3,
|
|
410
|
-
# progress_update_percent=20,
|
|
411
|
-
# disable_progressbar=True,
|
|
412
|
-
# max_iter=2,
|
|
413
|
-
# cv=3,
|
|
414
|
-
# initial_strategy="group_mode",
|
|
415
|
-
# )
|
|
416
|
-
|
|
417
|
-
# phylo = ImputePhylo(
|
|
418
|
-
# genotype_data=data, save_plots=False, disable_progressbar=True
|
|
419
|
-
# )
|
|
420
|
-
|
|
421
|
-
# phylodata = phylo.imputed
|
|
422
|
-
# print(phylodata.genotypes012_df)
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
def get_arguments():
|
|
426
|
-
"""[Parse command-line arguments. Imported with argparse]
|
|
427
|
-
|
|
428
|
-
Returns:
|
|
429
|
-
[argparse object]: [contains command-line arguments; accessed as method]
|
|
430
|
-
"""
|
|
431
|
-
|
|
432
|
-
parser = argparse.ArgumentParser(
|
|
433
|
-
description="Machine learning missing data imputation and species delimitation",
|
|
434
|
-
add_help=False,
|
|
435
|
-
)
|
|
436
|
-
|
|
437
|
-
required_args = parser.add_argument_group("Required arguments")
|
|
438
|
-
filetype_args = parser.add_argument_group(
|
|
439
|
-
"File type arguments (choose only one)"
|
|
440
|
-
)
|
|
441
|
-
structure_args = parser.add_argument_group("Structure file arguments")
|
|
442
|
-
optional_args = parser.add_argument_group("Optional arguments")
|
|
443
|
-
|
|
444
|
-
# File Type arguments
|
|
445
|
-
filetype_args.add_argument(
|
|
446
|
-
"-s", "--str", type=str, required=False, help="Input structure file"
|
|
447
|
-
)
|
|
448
|
-
filetype_args.add_argument(
|
|
449
|
-
"-p", "--phylip", type=str, required=False, help="Input phylip file"
|
|
450
|
-
)
|
|
451
|
-
|
|
452
|
-
filetype_args.add_argument(
|
|
453
|
-
"-t",
|
|
454
|
-
"--treefile",
|
|
455
|
-
type=str,
|
|
456
|
-
required=False,
|
|
457
|
-
default=None,
|
|
458
|
-
help="Newick-formatted treefile",
|
|
459
|
-
)
|
|
460
|
-
|
|
461
|
-
filetype_args.add_argument(
|
|
462
|
-
"-i",
|
|
463
|
-
"--iqtree",
|
|
464
|
-
type=str,
|
|
465
|
-
required=False,
|
|
466
|
-
help=".iqtree output file containing Rate Matrix Q",
|
|
467
|
-
)
|
|
468
|
-
|
|
469
|
-
# Structure Arguments
|
|
470
|
-
structure_args.add_argument(
|
|
471
|
-
"--onerow_perind",
|
|
472
|
-
default=False,
|
|
473
|
-
action="store_true",
|
|
474
|
-
help="Toggles on one row per individual option in structure file",
|
|
475
|
-
)
|
|
476
|
-
structure_args.add_argument(
|
|
477
|
-
"--pop_ids",
|
|
478
|
-
default=False,
|
|
479
|
-
required=False,
|
|
480
|
-
action="store_true",
|
|
481
|
-
help="Toggles on population ID column (2nd col) in structure file",
|
|
482
|
-
)
|
|
483
|
-
|
|
484
|
-
## Optional Arguments
|
|
485
|
-
optional_args.add_argument(
|
|
486
|
-
"-m",
|
|
487
|
-
"--popmap",
|
|
488
|
-
type=str,
|
|
489
|
-
required=False,
|
|
490
|
-
default=None,
|
|
491
|
-
help="Two-column tab-separated population map file: inds\tpops. No header line",
|
|
492
|
-
)
|
|
493
|
-
optional_args.add_argument(
|
|
494
|
-
"--prefix",
|
|
495
|
-
type=str,
|
|
496
|
-
required=False,
|
|
497
|
-
default="output",
|
|
498
|
-
help="Prefix for output files",
|
|
499
|
-
)
|
|
500
|
-
|
|
501
|
-
optional_args.add_argument(
|
|
502
|
-
"--resume_imputed",
|
|
503
|
-
type=str,
|
|
504
|
-
required=False,
|
|
505
|
-
help="Read in imputed data from a file instead of doing the imputation",
|
|
506
|
-
)
|
|
507
|
-
# Add help menu
|
|
508
|
-
optional_args.add_argument(
|
|
509
|
-
"-h", "--help", action="help", help="Displays this help menu"
|
|
510
|
-
)
|
|
511
|
-
|
|
512
|
-
# If no command-line arguments are called then exit and call help menu.
|
|
513
|
-
if len(sys.argv) == 1:
|
|
514
|
-
print("\nExiting because no command-line options were called.\n")
|
|
515
|
-
parser.print_help(sys.stderr)
|
|
516
|
-
sys.exit(1)
|
|
517
|
-
|
|
518
|
-
args = parser.parse_args()
|
|
519
|
-
return args
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
if __name__ == "__main__":
|
|
523
|
-
main()
|
test/test.py
DELETED
|
@@ -1,151 +0,0 @@
|
|
|
1
|
-
import sys
|
|
2
|
-
import os
|
|
3
|
-
import copy
|
|
4
|
-
import unittest
|
|
5
|
-
import pprint
|
|
6
|
-
from snpio import GenotypeData
|
|
7
|
-
from pgsui import *
|
|
8
|
-
from pgsui.utils.misc import HiddenPrints
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
class TestMyClasses(unittest.TestCase):
|
|
12
|
-
def setUp(self):
|
|
13
|
-
with HiddenPrints():
|
|
14
|
-
self.genotype_data = GenotypeData(
|
|
15
|
-
filename="pgsui/example_data/phylip_files/test_n100.phy",
|
|
16
|
-
popmapfile="pgsui/example_data/popmaps/test.popmap",
|
|
17
|
-
guidetree="pgsui/example_data/trees/test.tre",
|
|
18
|
-
qmatrix="pgsui/example_data/trees/test.qmat",
|
|
19
|
-
siterates="pgsui/example_data/trees/test_siterates_n100.txt",
|
|
20
|
-
prefix="test_imputer",
|
|
21
|
-
force_popmap=True,
|
|
22
|
-
plot_format="png",
|
|
23
|
-
)
|
|
24
|
-
|
|
25
|
-
# Create a SimGenotypeDataTransformer instance and use it
|
|
26
|
-
# to simulate missing data
|
|
27
|
-
self.transformer = SimGenotypeDataTransformer(
|
|
28
|
-
genotype_data=self.genotype_data,
|
|
29
|
-
prop_missing=0.2,
|
|
30
|
-
strategy="random",
|
|
31
|
-
)
|
|
32
|
-
self.transformer.fit(self.genotype_data.genotypes_012(fmt="numpy"))
|
|
33
|
-
self.simulated_data = copy.deepcopy(self.genotype_data)
|
|
34
|
-
|
|
35
|
-
self.simulated_data.genotypes_012 = self.transformer.transform(
|
|
36
|
-
self.genotype_data.genotypes_012(fmt="numpy")
|
|
37
|
-
)
|
|
38
|
-
|
|
39
|
-
def _test_class(self, class_instance, do_gridsearch=False):
|
|
40
|
-
print(f"\nMETHOD: {class_instance.__name__}\n")
|
|
41
|
-
|
|
42
|
-
if do_gridsearch:
|
|
43
|
-
# Do a simple test.
|
|
44
|
-
if class_instance in [ImputeRandomForest, ImputeXGBoost]:
|
|
45
|
-
param_grid = {"n_estimators": [50, 100]} # Do a simple test
|
|
46
|
-
elif class_instance in [
|
|
47
|
-
ImputeVAE,
|
|
48
|
-
ImputeStandardAutoEncoder,
|
|
49
|
-
ImputeNLPCA,
|
|
50
|
-
ImputeUBP,
|
|
51
|
-
]:
|
|
52
|
-
param_grid = {"dropout_rate": [0.1, 0.2]}
|
|
53
|
-
elif class_instance == ImputeKNN:
|
|
54
|
-
param_grid = {"n_neighbors": [5, 8]}
|
|
55
|
-
else:
|
|
56
|
-
param_grid = None
|
|
57
|
-
|
|
58
|
-
instance = class_instance(
|
|
59
|
-
self.simulated_data,
|
|
60
|
-
gridparams=param_grid,
|
|
61
|
-
)
|
|
62
|
-
imputed_data = instance.imputed.genotypes_012(fmt="numpy")
|
|
63
|
-
|
|
64
|
-
# Test that the imputed values are close to the original values
|
|
65
|
-
accuracy = self.transformer.accuracy(
|
|
66
|
-
self.genotype_data.genotypes_012(fmt="numpy"), imputed_data
|
|
67
|
-
)
|
|
68
|
-
|
|
69
|
-
(
|
|
70
|
-
auc_roc_scores,
|
|
71
|
-
precision_scores,
|
|
72
|
-
recall_scores,
|
|
73
|
-
avg_precision_scores,
|
|
74
|
-
) = self.transformer.auc_roc_pr_ap(
|
|
75
|
-
self.genotype_data.genotypes_012(fmt="numpy"), imputed_data
|
|
76
|
-
)
|
|
77
|
-
|
|
78
|
-
pprint.PrettyPrinter(indent=4, sort_dicts=True).pprint(
|
|
79
|
-
f"OVERALL ACCURACY: {accuracy}"
|
|
80
|
-
)
|
|
81
|
-
pprint.PrettyPrinter(indent=4, sort_dicts=True).pprint(
|
|
82
|
-
f"AUC-ROC PER CLASS: {dict(zip(range(3), auc_roc_scores))}"
|
|
83
|
-
)
|
|
84
|
-
pprint.PrettyPrinter(indent=4, sort_dicts=True).pprint(
|
|
85
|
-
f"PRECISION PER CLASS: {dict(zip(range(3), precision_scores))}"
|
|
86
|
-
)
|
|
87
|
-
pprint.PrettyPrinter(indent=4, sort_dicts=True).pprint(
|
|
88
|
-
f"RECALL PER CLASS: {dict(zip(range(3), recall_scores))}"
|
|
89
|
-
)
|
|
90
|
-
pprint.PrettyPrinter(indent=4, sort_dicts=True).pprint(
|
|
91
|
-
f"AVERAGE PRECISION PER CLASS: {dict(zip(range(3), avg_precision_scores))}"
|
|
92
|
-
)
|
|
93
|
-
print("\n")
|
|
94
|
-
|
|
95
|
-
def test_ImputeKNN(self):
|
|
96
|
-
self._test_class(ImputeKNN)
|
|
97
|
-
|
|
98
|
-
def test_ImputeRandomForest(self):
|
|
99
|
-
self._test_class(ImputeRandomForest)
|
|
100
|
-
|
|
101
|
-
def test_ImputeXGBoost(self):
|
|
102
|
-
self._test_class(ImputeXGBoost)
|
|
103
|
-
|
|
104
|
-
def test_ImputeVAE(self):
|
|
105
|
-
self._test_class(ImputeVAE)
|
|
106
|
-
|
|
107
|
-
def test_ImputeStandardAutoEncoder(self):
|
|
108
|
-
self._test_class(ImputeStandardAutoEncoder)
|
|
109
|
-
|
|
110
|
-
def test_ImputeUBP(self):
|
|
111
|
-
self._test_class(ImputeUBP)
|
|
112
|
-
|
|
113
|
-
def test_ImputeNLPCA(self):
|
|
114
|
-
self._test_class(ImputeNLPCA)
|
|
115
|
-
|
|
116
|
-
def test_ImputeKNN_grid(self):
|
|
117
|
-
self._test_class(ImputeKNN, do_gridsearch=True)
|
|
118
|
-
|
|
119
|
-
def test_ImputeRandomForest_grid(self):
|
|
120
|
-
self._test_class(ImputeRandomForest, do_gridsearch=True)
|
|
121
|
-
|
|
122
|
-
def test_ImputeXGBoost_grid(self):
|
|
123
|
-
self._test_class(ImputeXGBoost, do_gridsearch=True)
|
|
124
|
-
|
|
125
|
-
def test_ImputeVAE_grid(self):
|
|
126
|
-
self._test_class(ImputeVAE, do_gridsearch=True)
|
|
127
|
-
|
|
128
|
-
def test_ImputeStandardAutoEncoder_grid(self):
|
|
129
|
-
self._test_class(ImputeStandardAutoEncoder, do_gridsearch=True)
|
|
130
|
-
|
|
131
|
-
def test_ImputeUBP_grid(self):
|
|
132
|
-
self._test_class(ImputeUBP, do_gridsearch=True)
|
|
133
|
-
|
|
134
|
-
def test_ImputeNLPCA_grid(self):
|
|
135
|
-
self._test_class(ImputeNLPCA, do_gridsearch=True)
|
|
136
|
-
|
|
137
|
-
def test_ImputePhylo(self):
|
|
138
|
-
self._test_class(ImputePhylo)
|
|
139
|
-
|
|
140
|
-
def test_ImputeAlleleFreq(self):
|
|
141
|
-
self._test_class(ImputeAlleleFreq)
|
|
142
|
-
|
|
143
|
-
def test_ImputeMF(self):
|
|
144
|
-
self._test_class(ImputeMF)
|
|
145
|
-
|
|
146
|
-
def test_ImputeRefAllele(self):
|
|
147
|
-
self._test_class(ImputeRefAllele)
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
if __name__ == "__main__":
|
|
151
|
-
unittest.main()
|