pg-sui 0.2.3__py3-none-any.whl → 1.6.14.dev9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. {pg_sui-0.2.3.dist-info → pg_sui-1.6.14.dev9.dist-info}/METADATA +99 -77
  2. pg_sui-1.6.14.dev9.dist-info/RECORD +81 -0
  3. {pg_sui-0.2.3.dist-info → pg_sui-1.6.14.dev9.dist-info}/WHEEL +1 -1
  4. pg_sui-1.6.14.dev9.dist-info/entry_points.txt +4 -0
  5. {pg_sui-0.2.3.dist-info → pg_sui-1.6.14.dev9.dist-info/licenses}/LICENSE +0 -0
  6. pg_sui-1.6.14.dev9.dist-info/top_level.txt +1 -0
  7. pgsui/__init__.py +35 -54
  8. pgsui/_version.py +34 -0
  9. pgsui/cli.py +909 -0
  10. pgsui/data_processing/__init__.py +0 -0
  11. pgsui/data_processing/config.py +565 -0
  12. pgsui/data_processing/containers.py +1424 -0
  13. pgsui/data_processing/transformers.py +557 -907
  14. pgsui/{example_data/trees → electron/app}/__init__.py +0 -0
  15. pgsui/electron/app/__main__.py +5 -0
  16. pgsui/electron/app/extra-resources/.gitkeep +1 -0
  17. pgsui/electron/app/icons/icons/1024x1024.png +0 -0
  18. pgsui/electron/app/icons/icons/128x128.png +0 -0
  19. pgsui/electron/app/icons/icons/16x16.png +0 -0
  20. pgsui/electron/app/icons/icons/24x24.png +0 -0
  21. pgsui/electron/app/icons/icons/256x256.png +0 -0
  22. pgsui/electron/app/icons/icons/32x32.png +0 -0
  23. pgsui/electron/app/icons/icons/48x48.png +0 -0
  24. pgsui/electron/app/icons/icons/512x512.png +0 -0
  25. pgsui/electron/app/icons/icons/64x64.png +0 -0
  26. pgsui/electron/app/icons/icons/icon.icns +0 -0
  27. pgsui/electron/app/icons/icons/icon.ico +0 -0
  28. pgsui/electron/app/main.js +227 -0
  29. pgsui/electron/app/package-lock.json +6894 -0
  30. pgsui/electron/app/package.json +51 -0
  31. pgsui/electron/app/preload.js +15 -0
  32. pgsui/electron/app/server.py +157 -0
  33. pgsui/electron/app/ui/logo.png +0 -0
  34. pgsui/electron/app/ui/renderer.js +131 -0
  35. pgsui/electron/app/ui/styles.css +59 -0
  36. pgsui/electron/app/ui/ui_shim.js +72 -0
  37. pgsui/electron/bootstrap.py +43 -0
  38. pgsui/electron/launch.py +57 -0
  39. pgsui/electron/package.json +14 -0
  40. pgsui/example_data/__init__.py +0 -0
  41. pgsui/example_data/phylip_files/__init__.py +0 -0
  42. pgsui/example_data/phylip_files/test.phy +0 -0
  43. pgsui/example_data/popmaps/__init__.py +0 -0
  44. pgsui/example_data/popmaps/{test.popmap → phylogen_nomx.popmap} +185 -99
  45. pgsui/example_data/structure_files/__init__.py +0 -0
  46. pgsui/example_data/structure_files/test.pops.2row.allsites.str +0 -0
  47. pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz +0 -0
  48. pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz.tbi +0 -0
  49. pgsui/impute/__init__.py +0 -0
  50. pgsui/impute/deterministic/imputers/allele_freq.py +725 -0
  51. pgsui/impute/deterministic/imputers/mode.py +844 -0
  52. pgsui/impute/deterministic/imputers/nmf.py +221 -0
  53. pgsui/impute/deterministic/imputers/phylo.py +973 -0
  54. pgsui/impute/deterministic/imputers/ref_allele.py +669 -0
  55. pgsui/impute/supervised/__init__.py +0 -0
  56. pgsui/impute/supervised/base.py +343 -0
  57. pgsui/impute/{unsupervised/models/in_development → supervised/imputers}/__init__.py +0 -0
  58. pgsui/impute/supervised/imputers/hist_gradient_boosting.py +317 -0
  59. pgsui/impute/supervised/imputers/random_forest.py +291 -0
  60. pgsui/impute/unsupervised/__init__.py +0 -0
  61. pgsui/impute/unsupervised/base.py +1118 -0
  62. pgsui/impute/unsupervised/callbacks.py +92 -262
  63. {simulation → pgsui/impute/unsupervised/imputers}/__init__.py +0 -0
  64. pgsui/impute/unsupervised/imputers/autoencoder.py +1285 -0
  65. pgsui/impute/unsupervised/imputers/nlpca.py +1554 -0
  66. pgsui/impute/unsupervised/imputers/ubp.py +1575 -0
  67. pgsui/impute/unsupervised/imputers/vae.py +1228 -0
  68. pgsui/impute/unsupervised/loss_functions.py +261 -0
  69. pgsui/impute/unsupervised/models/__init__.py +0 -0
  70. pgsui/impute/unsupervised/models/autoencoder_model.py +215 -567
  71. pgsui/impute/unsupervised/models/nlpca_model.py +155 -394
  72. pgsui/impute/unsupervised/models/ubp_model.py +180 -1106
  73. pgsui/impute/unsupervised/models/vae_model.py +269 -630
  74. pgsui/impute/unsupervised/nn_scorers.py +255 -0
  75. pgsui/utils/__init__.py +0 -0
  76. pgsui/utils/classification_viz.py +608 -0
  77. pgsui/utils/logging_utils.py +22 -0
  78. pgsui/utils/misc.py +35 -480
  79. pgsui/utils/plotting.py +996 -829
  80. pgsui/utils/pretty_metrics.py +290 -0
  81. pgsui/utils/scorers.py +213 -666
  82. pg_sui-0.2.3.dist-info/RECORD +0 -75
  83. pg_sui-0.2.3.dist-info/top_level.txt +0 -3
  84. pgsui/example_data/phylip_files/test_n10.phy +0 -118
  85. pgsui/example_data/phylip_files/test_n100.phy +0 -118
  86. pgsui/example_data/phylip_files/test_n2.phy +0 -118
  87. pgsui/example_data/phylip_files/test_n500.phy +0 -118
  88. pgsui/example_data/structure_files/test.nopops.1row.10sites.str +0 -117
  89. pgsui/example_data/structure_files/test.nopops.2row.100sites.str +0 -234
  90. pgsui/example_data/structure_files/test.nopops.2row.10sites.str +0 -234
  91. pgsui/example_data/structure_files/test.nopops.2row.30sites.str +0 -234
  92. pgsui/example_data/structure_files/test.nopops.2row.allsites.str +0 -234
  93. pgsui/example_data/structure_files/test.pops.1row.10sites.str +0 -117
  94. pgsui/example_data/structure_files/test.pops.2row.10sites.str +0 -234
  95. pgsui/example_data/trees/test.iqtree +0 -376
  96. pgsui/example_data/trees/test.qmat +0 -5
  97. pgsui/example_data/trees/test.rate +0 -2033
  98. pgsui/example_data/trees/test.tre +0 -1
  99. pgsui/example_data/trees/test_n10.rate +0 -19
  100. pgsui/example_data/trees/test_n100.rate +0 -109
  101. pgsui/example_data/trees/test_n500.rate +0 -509
  102. pgsui/example_data/trees/test_siterates.txt +0 -2024
  103. pgsui/example_data/trees/test_siterates_n10.txt +0 -10
  104. pgsui/example_data/trees/test_siterates_n100.txt +0 -100
  105. pgsui/example_data/trees/test_siterates_n500.txt +0 -500
  106. pgsui/example_data/vcf_files/test.vcf +0 -244
  107. pgsui/example_data/vcf_files/test.vcf.gz +0 -0
  108. pgsui/example_data/vcf_files/test.vcf.gz.tbi +0 -0
  109. pgsui/impute/estimators.py +0 -1268
  110. pgsui/impute/impute.py +0 -1463
  111. pgsui/impute/simple_imputers.py +0 -1431
  112. pgsui/impute/supervised/iterative_imputer_fixedparams.py +0 -782
  113. pgsui/impute/supervised/iterative_imputer_gridsearch.py +0 -1024
  114. pgsui/impute/unsupervised/keras_classifiers.py +0 -697
  115. pgsui/impute/unsupervised/models/in_development/cnn_model.py +0 -486
  116. pgsui/impute/unsupervised/neural_network_imputers.py +0 -1440
  117. pgsui/impute/unsupervised/neural_network_methods.py +0 -1395
  118. pgsui/pg_sui.py +0 -261
  119. pgsui/utils/sequence_tools.py +0 -407
  120. simulation/sim_benchmarks.py +0 -333
  121. simulation/sim_treeparams.py +0 -475
  122. test/__init__.py +0 -0
  123. test/pg_sui_simtest.py +0 -215
  124. test/pg_sui_testing.py +0 -523
  125. test/test.py +0 -151
  126. test/test_pgsui.py +0 -374
  127. test/test_tkc.py +0 -185
test/pg_sui_testing.py DELETED
@@ -1,523 +0,0 @@
1
- #!/usr/bin/env python
2
-
3
- # Standard library imports
4
- import argparse
5
- import sys
6
-
7
- import numpy as np
8
- import pandas as pd
9
- import scipy.stats as stats
10
-
11
- from sklearn_genetic.space import Continuous, Categorical, Integer
12
-
13
- # from pgsui import GenotypeData
14
- from snpio import GenotypeData
15
- from impute.estimators import (
16
- ImputeNLPCA,
17
- ImputeUBP,
18
- ImputeRandomForest,
19
- ImputeVAE,
20
- )
21
- from impute.simple_imputers import ImputePhylo
22
-
23
- # from snpio import GenotypeData
24
- # from impute.estimators import *
25
- # from impute.simple_imputers import ImputeAlleleFreq, ImputePhylo
26
-
27
- # from read_input import GenotypeData
28
- # from estimators import *
29
-
30
-
31
- def main():
32
- """Class instantiations and main package body"""
33
-
34
- args = get_arguments()
35
-
36
- if args.str and args.phylip:
37
- sys.exit("Error: Only one file type can be specified")
38
-
39
- # If VCF file is specified.
40
- if args.str:
41
- if not args.pop_ids and args.popmap is None:
42
- raise TypeError("Either --pop_ids or --popmap must be specified\n")
43
-
44
- if args.pop_ids:
45
- print("\n--pop_ids was specified as column 2\n")
46
- else:
47
- print(
48
- "\n--pop_ids was not specified; "
49
- "using popmap file to get population IDs\n"
50
- )
51
-
52
- if args.onerow_perind:
53
- print("\nUsing one row per individual...\n")
54
- else:
55
- print("\nUsing two rows per individual...\n")
56
-
57
- if args.onerow_perind:
58
- data = GenotypeData(
59
- filename=args.str,
60
- filetype="structure1row",
61
- popmapfile=args.popmap,
62
- guidetree=args.treefile,
63
- qmatrix_iqtree=args.iqtree,
64
- )
65
- else:
66
- data = GenotypeData(
67
- filename=args.str,
68
- filetype="structure2row",
69
- popmapfile=args.popmap,
70
- guidetree=args.treefile,
71
- qmatrix_iqtree=args.iqtree,
72
- )
73
-
74
- if args.phylip:
75
- if args.pop_ids or args.onerow_perind:
76
- print(
77
- "\nPhylip file was used with structure arguments; ignoring "
78
- "structure file arguments\n"
79
- )
80
-
81
- if args.popmap is None:
82
- raise TypeError("No popmap file supplied with PHYLIP file\n")
83
-
84
- data = GenotypeData(
85
- filename=args.phylip,
86
- filetype="phylip",
87
- popmapfile=args.popmap,
88
- guidetree=args.treefile,
89
- qmatrix_iqtree=args.iqtree,
90
- siterates_iqtree="pgsui/example_data/trees/test_n10.rate",
91
- )
92
-
93
- if args.resume_imputed:
94
- pass
95
- # data.read_imputed(args.resume_imputed, impute_methods="rf")
96
- # data.write_imputed(data.imputed_rf_df, args.prefix)
97
-
98
- else:
99
- # For randomizedsearchcv
100
- # Number of trees in random forest
101
- n_estimators = [
102
- int(x) for x in np.linspace(start=100, stop=1000, num=10)
103
- ]
104
-
105
- # Number of features to consider at every split
106
- max_features = ["sqrt", "log2"]
107
-
108
- # Maximum number of levels in the tree
109
- max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
110
- max_depth.append(None)
111
-
112
- # Minimmum number of samples required to split a node
113
- min_samples_split = [int(x) for x in np.linspace(2, 10, num=5)]
114
-
115
- # Minimum number of samples required at each leaf node
116
- min_samples_leaf = [int(x) for x in np.linspace(1, 5, num=5)]
117
-
118
- # Proportion of dataset to use with bootstrapping
119
- # max_samples = [x for x in np.linspace(0.5, 1.0, num=6)]
120
-
121
- # # Random Forest gridparams - RandomizedSearchCV
122
- # grid_params = {
123
- # "max_features": max_features,
124
- # "max_depth": max_depth,
125
- # "min_samples_split": min_samples_split,
126
- # "min_samples_leaf": min_samples_leaf,
127
- # }
128
-
129
- # Random Forest gridparams - Genetic Algorithms
130
- # grid_params = {
131
- # "n_estimators": Integer(100, 500),
132
- # "max_features": max_features,
133
- # "max_depth": max_depth,
134
- # "min_samples_split": min_samples_split,
135
- # "min_samples_leaf": min_samples_leaf,
136
- # "max_samples": max_samples
137
- # }
138
-
139
- # # Genetic Algorithm grid_params
140
- # grid_params = {
141
- # "max_features": Categorical(["sqrt", "log2"]),
142
- # "min_samples_split": Integer(2, 10),
143
- # "min_samples_leaf": Integer(1, 10),
144
- # "max_depth": Integer(2, 110),
145
- # }
146
-
147
- # Bayesian Ridge gridparams - RandomizedSearchCV
148
- # grid_params = {
149
- # "alpha_1": stats.loguniform(1e-6, 0.01),
150
- # "alpha_2": stats.loguniform(1e-6, 0.01),
151
- # "lambda_1": stats.loguniform(1e-6, 0.01),
152
- # "lambda_2": stats.loguniform(1e-6, 0.01),
153
- # }
154
-
155
- # # Bayesian Ridge gridparams - Genetic algorithm
156
- # grid_params = {
157
- # "alpha_1": Continuous(1e-6, 1e-3, distribution="log-uniform"),
158
- # "alpha_2": Continuous(1e-6, 1e-3, distribution="log-uniform"),
159
- # "lambda_1": Continuous(1e-6, 1e-3, distribution="log-uniform"),
160
- # "lambda_2": Continuous(1e-6, 1e-3, distribution="log-uniform")
161
- # }
162
-
163
- # # Random forest imputation with genetic algorithm grid search
164
- # rf_imp = ImputeRandomForest(
165
- # data,
166
- # prefix=args.prefix,
167
- # n_estimators=50,
168
- # n_nearest_features=1,
169
- # gridparams=grid_params,
170
- # cv=3,
171
- # grid_iter=40,
172
- # n_jobs=4,
173
- # max_iter=2,
174
- # column_subset=1.0,
175
- # ga=False,
176
- # disable_progressbar=True,
177
- # extratrees=False,
178
- # mutation_probability=0.1,
179
- # progress_update_percent=20,
180
- # chunk_size=1.0,
181
- # initial_strategy="phylogeny",
182
- # )
183
-
184
- # # Genetic Algorithm grid search Test
185
- # rf_imp2 = ImputeRandomForest(
186
- # data,
187
- # prefix=args.prefix,
188
- # n_estimators=50,
189
- # n_nearest_features=2,
190
- # gridparams=grid_params,
191
- # cv=3,
192
- # grid_iter=40,
193
- # n_jobs=-1,
194
- # max_iter=2,
195
- # column_subset=1.0,
196
- # ga=True,
197
- # disable_progressbar=True,
198
- # extratrees=False,
199
- # chunk_size=1.0,
200
- # initial_strategy="phylogeny",
201
- # )
202
-
203
- # rfdata = rf_imp.imputed
204
- # print(rfdata.genotypes012_df)
205
-
206
- # rf_data = rf_imp.imputed
207
- # print(data.genotypes012_df)
208
- # print(rf_data.genotypes012_df)
209
-
210
- # imp_decoded = data.decode_imputed(rf_imp.imputed)
211
- # print(imp_decoded)
212
-
213
- # # RandomizedSearchCV Test
214
- # rf_imp = ImputeRandomForest(
215
- # data,
216
- # prefix=args.prefix,
217
- # n_estimators=50,
218
- # n_nearest_features=3,
219
- # gridparams=grid_params,
220
- # cv=3,
221
- # grid_iter=40,
222
- # n_jobs=4,
223
- # max_iter=2,
224
- # column_subset=5,
225
- # ga=False,
226
- # disable_progressbar=False,
227
- # extratrees=False,
228
- # progress_update_percent=20,
229
- # chunk_size=0.2,
230
- # initial_strategy="phylogeny",
231
- # )
232
-
233
- # lgbm = ImputeLightGBM(
234
- # data,
235
- # prefix=args.prefix,
236
- # cv=3,
237
- # n_jobs=4,
238
- # n_estimators=50,
239
- # disable_progressbar=True,
240
- # chunk_size=0.2,
241
- # validation_only=0.1,
242
- # n_nearest_features=3,
243
- # max_iter=2,
244
- # initial_strategy="populations",
245
- # )
246
-
247
- # vae = ImputeVAE(
248
- # genotype_data=data,
249
- # prefix=args.prefix,
250
- # disable_progressbar=True,
251
- # validation_only=None,
252
- # initial_strategy="populations",
253
- # )
254
-
255
- # vae_gtdata = vae.imputed
256
- # print(vae_gtdata.genotypes012_df)
257
-
258
- # complete_encoded = imputer.train(train_epochs=300, batch_size=256)
259
- # print(complete_encoded)
260
-
261
- # rf_imp = ImputeRandomForest(
262
- # data,
263
- # prefix=args.prefix,
264
- # n_estimators=50,
265
- # n_nearest_features=3,
266
- # n_jobs=4,
267
- # max_iter=2,
268
- # disable_progressbar=True,
269
- # extratrees=False,
270
- # max_features="sqrt",
271
- # min_samples_split=5,
272
- # min_samples_leaf=2,
273
- # max_depth=30,
274
- # cv=3,
275
- # validation_only=0.3,
276
- # chunk_size=1.0,
277
- # initial_strategy="populations",
278
- # )
279
-
280
- # afpops = ImputeAlleleFreq(
281
- # genotype_data=data,
282
- # by_populations=True,
283
- # prefix=args.prefix,
284
- # )
285
-
286
- # print(data.genotypes012_df)
287
- # print(afpops.genotypes012_df)
288
-
289
- # br_imp = ImputeBayesianRidge(data, prefix=args.prefix, n_iter=100, gridparams=grid_params, grid_iter=3, cv=3, n_jobs=4, max_iter=5, n_nearest_features=3, column_subset=4, ga=False, disable_progressbar=True, progress_update_percent=20, chunk_size=1.0)
290
-
291
- # aftestpops = ImputeAlleleFreq(
292
- # genotype_data=data, by_populations=True, prefix=args.prefix
293
- # )
294
-
295
- # aftestpops_data = aftestpops.imputed
296
-
297
- # print(data.genotypes012_df)
298
- # print(aftestpops_data.genotypes012_df)
299
-
300
- # vae = ImputeVAE(
301
- # gt=np.array([[0, 1], [-9, 1], [2, -9]]),
302
- # initial_strategy="most_frequent",
303
- # cv=3,
304
- # validation_only=None,
305
- # )
306
-
307
- # vae_data = vae.imputed
308
-
309
- # print(data.genotypes012_df)
310
- # print(vae_data.genotypes012_df)
311
-
312
- # For GridSearchCV. Generate parameters to sample from.
313
- learning_rate = [float(10) ** x for x in np.arange(-4, 0)]
314
- l1_penalty = [float(10) ** x for x in np.arange(-6, -1)]
315
- l1_penalty.append(0.0)
316
- l2_penalty = [float(10) ** x for x in np.arange(-6, -1)]
317
- l2_penalty.append(0.0)
318
- hidden_activation = ["elu", "relu"]
319
- num_hidden_layers = [1, 2, 3, 4, 5]
320
- hidden_layer_sizes = ["sqrt", "midpoint"]
321
- n_components = [2, 3]
322
- dropout_rate = [round(x, 1) for x in np.arange(0.0, 1.0, 0.1)]
323
- batch_size = [16, 32, 48, 64]
324
- optimizer = ["adam", "sgd", "adagrad"]
325
-
326
- # grid_params = {
327
- # "learning_rate": Continuous(1e-6, 0.1, distribution="log-uniform"),
328
- # "l2_penalty": Continuous(1e-6, 0.01, distribution="uniform"),
329
- # "n_components": Integer(2, 3),
330
- # # "hidden_activation": Categorical(["elu", "relu"]),
331
- # }
332
-
333
- grid_params = {
334
- # "learning_rate": learning_rate,
335
- # "l1_penalty": l1_penalty,
336
- "l2_penalty": l2_penalty,
337
- # "hidden_activation": hidden_activation,
338
- # "hidden_layer_sizes": hidden_layer_sizes,
339
- "n_components": n_components,
340
- # "dropout_rate": dropout_rate,
341
- # "batch_size": batch_size,
342
- # "optimizer": optimizer,
343
- }
344
-
345
- ubp = ImputeUBP(
346
- data,
347
- disable_progressbar=False,
348
- cv=3,
349
- column_subset=1.0,
350
- validation_split=0.0,
351
- learning_rate=0.1,
352
- num_hidden_layers=1,
353
- verbose=1,
354
- dropout_rate=0.2,
355
- hidden_activation="elu",
356
- batch_size=64,
357
- l1_penalty=1e-6,
358
- l2_penalty=1e-6,
359
- gridparams=grid_params,
360
- n_jobs=4,
361
- grid_iter=5,
362
- sim_strategy="nonrandom_weighted",
363
- sim_prop_missing=0.4,
364
- scoring_metric="precision_recall_macro",
365
- gridsearch_method="randomized_gridsearch",
366
- early_stop_gen=5,
367
- # sample_weights={0: 1.0, 1: 0.0, 2: 1.0},
368
- # sample_weights="auto",
369
- )
370
-
371
- # ubp = ImputeVAE(
372
- # data,
373
- # # gridparams=grid_params,
374
- # # initial_strategy="populations",
375
- # # disable_progressbar=True,
376
- # # cv=3,
377
- # # column_subset=1.0,
378
- # # validation_size=0.3,
379
- # # learning_rate=0.1,
380
- # # num_hidden_layers=1,
381
- # # verbose=1,
382
- # # gridparams=grid_params,
383
- # )
384
-
385
- # nlpca_data = nlpca.imputed
386
- # print(nlpca_data.genotypes012_df)
387
-
388
- # print(data.genotypes012_df)
389
- # print(nlpca_data.genotypes012_df)
390
-
391
- # ubp = ImputeUBP(
392
- # genotype_data=data,
393
- # test_categorical=np.array([[0, 1], [-9, 1], [2, -9]]),
394
- # )
395
-
396
- # ubp = ImputeVAE(
397
- # gt=np.array([[0, 1], [-9, 1], [2, -9]]),
398
- # initial_strategy="most_frequent",
399
- # )
400
-
401
- # br_imp = ImputeBayesianRidge(
402
- # data,
403
- # prefix=args.prefix,
404
- # alpha_1=0.0002689638465560243,
405
- # alpha_2=0.0001473822173361299,
406
- # lambda_1=0.0003281735206234651,
407
- # lambda_2=0.00020767920087590963,
408
- # n_iter=100,
409
- # n_nearest_features=3,
410
- # progress_update_percent=20,
411
- # disable_progressbar=True,
412
- # max_iter=2,
413
- # cv=3,
414
- # initial_strategy="group_mode",
415
- # )
416
-
417
- # phylo = ImputePhylo(
418
- # genotype_data=data, save_plots=False, disable_progressbar=True
419
- # )
420
-
421
- # phylodata = phylo.imputed
422
- # print(phylodata.genotypes012_df)
423
-
424
-
425
- def get_arguments():
426
- """[Parse command-line arguments. Imported with argparse]
427
-
428
- Returns:
429
- [argparse object]: [contains command-line arguments; accessed as method]
430
- """
431
-
432
- parser = argparse.ArgumentParser(
433
- description="Machine learning missing data imputation and species delimitation",
434
- add_help=False,
435
- )
436
-
437
- required_args = parser.add_argument_group("Required arguments")
438
- filetype_args = parser.add_argument_group(
439
- "File type arguments (choose only one)"
440
- )
441
- structure_args = parser.add_argument_group("Structure file arguments")
442
- optional_args = parser.add_argument_group("Optional arguments")
443
-
444
- # File Type arguments
445
- filetype_args.add_argument(
446
- "-s", "--str", type=str, required=False, help="Input structure file"
447
- )
448
- filetype_args.add_argument(
449
- "-p", "--phylip", type=str, required=False, help="Input phylip file"
450
- )
451
-
452
- filetype_args.add_argument(
453
- "-t",
454
- "--treefile",
455
- type=str,
456
- required=False,
457
- default=None,
458
- help="Newick-formatted treefile",
459
- )
460
-
461
- filetype_args.add_argument(
462
- "-i",
463
- "--iqtree",
464
- type=str,
465
- required=False,
466
- help=".iqtree output file containing Rate Matrix Q",
467
- )
468
-
469
- # Structure Arguments
470
- structure_args.add_argument(
471
- "--onerow_perind",
472
- default=False,
473
- action="store_true",
474
- help="Toggles on one row per individual option in structure file",
475
- )
476
- structure_args.add_argument(
477
- "--pop_ids",
478
- default=False,
479
- required=False,
480
- action="store_true",
481
- help="Toggles on population ID column (2nd col) in structure file",
482
- )
483
-
484
- ## Optional Arguments
485
- optional_args.add_argument(
486
- "-m",
487
- "--popmap",
488
- type=str,
489
- required=False,
490
- default=None,
491
- help="Two-column tab-separated population map file: inds\tpops. No header line",
492
- )
493
- optional_args.add_argument(
494
- "--prefix",
495
- type=str,
496
- required=False,
497
- default="output",
498
- help="Prefix for output files",
499
- )
500
-
501
- optional_args.add_argument(
502
- "--resume_imputed",
503
- type=str,
504
- required=False,
505
- help="Read in imputed data from a file instead of doing the imputation",
506
- )
507
- # Add help menu
508
- optional_args.add_argument(
509
- "-h", "--help", action="help", help="Displays this help menu"
510
- )
511
-
512
- # If no command-line arguments are called then exit and call help menu.
513
- if len(sys.argv) == 1:
514
- print("\nExiting because no command-line options were called.\n")
515
- parser.print_help(sys.stderr)
516
- sys.exit(1)
517
-
518
- args = parser.parse_args()
519
- return args
520
-
521
-
522
- if __name__ == "__main__":
523
- main()
test/test.py DELETED
@@ -1,151 +0,0 @@
1
- import sys
2
- import os
3
- import copy
4
- import unittest
5
- import pprint
6
- from snpio import GenotypeData
7
- from pgsui import *
8
- from pgsui.utils.misc import HiddenPrints
9
-
10
-
11
- class TestMyClasses(unittest.TestCase):
12
- def setUp(self):
13
- with HiddenPrints():
14
- self.genotype_data = GenotypeData(
15
- filename="pgsui/example_data/phylip_files/test_n100.phy",
16
- popmapfile="pgsui/example_data/popmaps/test.popmap",
17
- guidetree="pgsui/example_data/trees/test.tre",
18
- qmatrix="pgsui/example_data/trees/test.qmat",
19
- siterates="pgsui/example_data/trees/test_siterates_n100.txt",
20
- prefix="test_imputer",
21
- force_popmap=True,
22
- plot_format="png",
23
- )
24
-
25
- # Create a SimGenotypeDataTransformer instance and use it
26
- # to simulate missing data
27
- self.transformer = SimGenotypeDataTransformer(
28
- genotype_data=self.genotype_data,
29
- prop_missing=0.2,
30
- strategy="random",
31
- )
32
- self.transformer.fit(self.genotype_data.genotypes_012(fmt="numpy"))
33
- self.simulated_data = copy.deepcopy(self.genotype_data)
34
-
35
- self.simulated_data.genotypes_012 = self.transformer.transform(
36
- self.genotype_data.genotypes_012(fmt="numpy")
37
- )
38
-
39
- def _test_class(self, class_instance, do_gridsearch=False):
40
- print(f"\nMETHOD: {class_instance.__name__}\n")
41
-
42
- if do_gridsearch:
43
- # Do a simple test.
44
- if class_instance in [ImputeRandomForest, ImputeXGBoost]:
45
- param_grid = {"n_estimators": [50, 100]} # Do a simple test
46
- elif class_instance in [
47
- ImputeVAE,
48
- ImputeStandardAutoEncoder,
49
- ImputeNLPCA,
50
- ImputeUBP,
51
- ]:
52
- param_grid = {"dropout_rate": [0.1, 0.2]}
53
- elif class_instance == ImputeKNN:
54
- param_grid = {"n_neighbors": [5, 8]}
55
- else:
56
- param_grid = None
57
-
58
- instance = class_instance(
59
- self.simulated_data,
60
- gridparams=param_grid,
61
- )
62
- imputed_data = instance.imputed.genotypes_012(fmt="numpy")
63
-
64
- # Test that the imputed values are close to the original values
65
- accuracy = self.transformer.accuracy(
66
- self.genotype_data.genotypes_012(fmt="numpy"), imputed_data
67
- )
68
-
69
- (
70
- auc_roc_scores,
71
- precision_scores,
72
- recall_scores,
73
- avg_precision_scores,
74
- ) = self.transformer.auc_roc_pr_ap(
75
- self.genotype_data.genotypes_012(fmt="numpy"), imputed_data
76
- )
77
-
78
- pprint.PrettyPrinter(indent=4, sort_dicts=True).pprint(
79
- f"OVERALL ACCURACY: {accuracy}"
80
- )
81
- pprint.PrettyPrinter(indent=4, sort_dicts=True).pprint(
82
- f"AUC-ROC PER CLASS: {dict(zip(range(3), auc_roc_scores))}"
83
- )
84
- pprint.PrettyPrinter(indent=4, sort_dicts=True).pprint(
85
- f"PRECISION PER CLASS: {dict(zip(range(3), precision_scores))}"
86
- )
87
- pprint.PrettyPrinter(indent=4, sort_dicts=True).pprint(
88
- f"RECALL PER CLASS: {dict(zip(range(3), recall_scores))}"
89
- )
90
- pprint.PrettyPrinter(indent=4, sort_dicts=True).pprint(
91
- f"AVERAGE PRECISION PER CLASS: {dict(zip(range(3), avg_precision_scores))}"
92
- )
93
- print("\n")
94
-
95
- def test_ImputeKNN(self):
96
- self._test_class(ImputeKNN)
97
-
98
- def test_ImputeRandomForest(self):
99
- self._test_class(ImputeRandomForest)
100
-
101
- def test_ImputeXGBoost(self):
102
- self._test_class(ImputeXGBoost)
103
-
104
- def test_ImputeVAE(self):
105
- self._test_class(ImputeVAE)
106
-
107
- def test_ImputeStandardAutoEncoder(self):
108
- self._test_class(ImputeStandardAutoEncoder)
109
-
110
- def test_ImputeUBP(self):
111
- self._test_class(ImputeUBP)
112
-
113
- def test_ImputeNLPCA(self):
114
- self._test_class(ImputeNLPCA)
115
-
116
- def test_ImputeKNN_grid(self):
117
- self._test_class(ImputeKNN, do_gridsearch=True)
118
-
119
- def test_ImputeRandomForest_grid(self):
120
- self._test_class(ImputeRandomForest, do_gridsearch=True)
121
-
122
- def test_ImputeXGBoost_grid(self):
123
- self._test_class(ImputeXGBoost, do_gridsearch=True)
124
-
125
- def test_ImputeVAE_grid(self):
126
- self._test_class(ImputeVAE, do_gridsearch=True)
127
-
128
- def test_ImputeStandardAutoEncoder_grid(self):
129
- self._test_class(ImputeStandardAutoEncoder, do_gridsearch=True)
130
-
131
- def test_ImputeUBP_grid(self):
132
- self._test_class(ImputeUBP, do_gridsearch=True)
133
-
134
- def test_ImputeNLPCA_grid(self):
135
- self._test_class(ImputeNLPCA, do_gridsearch=True)
136
-
137
- def test_ImputePhylo(self):
138
- self._test_class(ImputePhylo)
139
-
140
- def test_ImputeAlleleFreq(self):
141
- self._test_class(ImputeAlleleFreq)
142
-
143
- def test_ImputeMF(self):
144
- self._test_class(ImputeMF)
145
-
146
- def test_ImputeRefAllele(self):
147
- self._test_class(ImputeRefAllele)
148
-
149
-
150
- if __name__ == "__main__":
151
- unittest.main()