pg-sui 0.2.3__py3-none-any.whl → 1.6.14.dev9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. {pg_sui-0.2.3.dist-info → pg_sui-1.6.14.dev9.dist-info}/METADATA +99 -77
  2. pg_sui-1.6.14.dev9.dist-info/RECORD +81 -0
  3. {pg_sui-0.2.3.dist-info → pg_sui-1.6.14.dev9.dist-info}/WHEEL +1 -1
  4. pg_sui-1.6.14.dev9.dist-info/entry_points.txt +4 -0
  5. {pg_sui-0.2.3.dist-info → pg_sui-1.6.14.dev9.dist-info/licenses}/LICENSE +0 -0
  6. pg_sui-1.6.14.dev9.dist-info/top_level.txt +1 -0
  7. pgsui/__init__.py +35 -54
  8. pgsui/_version.py +34 -0
  9. pgsui/cli.py +909 -0
  10. pgsui/data_processing/__init__.py +0 -0
  11. pgsui/data_processing/config.py +565 -0
  12. pgsui/data_processing/containers.py +1424 -0
  13. pgsui/data_processing/transformers.py +557 -907
  14. pgsui/{example_data/trees → electron/app}/__init__.py +0 -0
  15. pgsui/electron/app/__main__.py +5 -0
  16. pgsui/electron/app/extra-resources/.gitkeep +1 -0
  17. pgsui/electron/app/icons/icons/1024x1024.png +0 -0
  18. pgsui/electron/app/icons/icons/128x128.png +0 -0
  19. pgsui/electron/app/icons/icons/16x16.png +0 -0
  20. pgsui/electron/app/icons/icons/24x24.png +0 -0
  21. pgsui/electron/app/icons/icons/256x256.png +0 -0
  22. pgsui/electron/app/icons/icons/32x32.png +0 -0
  23. pgsui/electron/app/icons/icons/48x48.png +0 -0
  24. pgsui/electron/app/icons/icons/512x512.png +0 -0
  25. pgsui/electron/app/icons/icons/64x64.png +0 -0
  26. pgsui/electron/app/icons/icons/icon.icns +0 -0
  27. pgsui/electron/app/icons/icons/icon.ico +0 -0
  28. pgsui/electron/app/main.js +227 -0
  29. pgsui/electron/app/package-lock.json +6894 -0
  30. pgsui/electron/app/package.json +51 -0
  31. pgsui/electron/app/preload.js +15 -0
  32. pgsui/electron/app/server.py +157 -0
  33. pgsui/electron/app/ui/logo.png +0 -0
  34. pgsui/electron/app/ui/renderer.js +131 -0
  35. pgsui/electron/app/ui/styles.css +59 -0
  36. pgsui/electron/app/ui/ui_shim.js +72 -0
  37. pgsui/electron/bootstrap.py +43 -0
  38. pgsui/electron/launch.py +57 -0
  39. pgsui/electron/package.json +14 -0
  40. pgsui/example_data/__init__.py +0 -0
  41. pgsui/example_data/phylip_files/__init__.py +0 -0
  42. pgsui/example_data/phylip_files/test.phy +0 -0
  43. pgsui/example_data/popmaps/__init__.py +0 -0
  44. pgsui/example_data/popmaps/{test.popmap → phylogen_nomx.popmap} +185 -99
  45. pgsui/example_data/structure_files/__init__.py +0 -0
  46. pgsui/example_data/structure_files/test.pops.2row.allsites.str +0 -0
  47. pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz +0 -0
  48. pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz.tbi +0 -0
  49. pgsui/impute/__init__.py +0 -0
  50. pgsui/impute/deterministic/imputers/allele_freq.py +725 -0
  51. pgsui/impute/deterministic/imputers/mode.py +844 -0
  52. pgsui/impute/deterministic/imputers/nmf.py +221 -0
  53. pgsui/impute/deterministic/imputers/phylo.py +973 -0
  54. pgsui/impute/deterministic/imputers/ref_allele.py +669 -0
  55. pgsui/impute/supervised/__init__.py +0 -0
  56. pgsui/impute/supervised/base.py +343 -0
  57. pgsui/impute/{unsupervised/models/in_development → supervised/imputers}/__init__.py +0 -0
  58. pgsui/impute/supervised/imputers/hist_gradient_boosting.py +317 -0
  59. pgsui/impute/supervised/imputers/random_forest.py +291 -0
  60. pgsui/impute/unsupervised/__init__.py +0 -0
  61. pgsui/impute/unsupervised/base.py +1118 -0
  62. pgsui/impute/unsupervised/callbacks.py +92 -262
  63. {simulation → pgsui/impute/unsupervised/imputers}/__init__.py +0 -0
  64. pgsui/impute/unsupervised/imputers/autoencoder.py +1285 -0
  65. pgsui/impute/unsupervised/imputers/nlpca.py +1554 -0
  66. pgsui/impute/unsupervised/imputers/ubp.py +1575 -0
  67. pgsui/impute/unsupervised/imputers/vae.py +1228 -0
  68. pgsui/impute/unsupervised/loss_functions.py +261 -0
  69. pgsui/impute/unsupervised/models/__init__.py +0 -0
  70. pgsui/impute/unsupervised/models/autoencoder_model.py +215 -567
  71. pgsui/impute/unsupervised/models/nlpca_model.py +155 -394
  72. pgsui/impute/unsupervised/models/ubp_model.py +180 -1106
  73. pgsui/impute/unsupervised/models/vae_model.py +269 -630
  74. pgsui/impute/unsupervised/nn_scorers.py +255 -0
  75. pgsui/utils/__init__.py +0 -0
  76. pgsui/utils/classification_viz.py +608 -0
  77. pgsui/utils/logging_utils.py +22 -0
  78. pgsui/utils/misc.py +35 -480
  79. pgsui/utils/plotting.py +996 -829
  80. pgsui/utils/pretty_metrics.py +290 -0
  81. pgsui/utils/scorers.py +213 -666
  82. pg_sui-0.2.3.dist-info/RECORD +0 -75
  83. pg_sui-0.2.3.dist-info/top_level.txt +0 -3
  84. pgsui/example_data/phylip_files/test_n10.phy +0 -118
  85. pgsui/example_data/phylip_files/test_n100.phy +0 -118
  86. pgsui/example_data/phylip_files/test_n2.phy +0 -118
  87. pgsui/example_data/phylip_files/test_n500.phy +0 -118
  88. pgsui/example_data/structure_files/test.nopops.1row.10sites.str +0 -117
  89. pgsui/example_data/structure_files/test.nopops.2row.100sites.str +0 -234
  90. pgsui/example_data/structure_files/test.nopops.2row.10sites.str +0 -234
  91. pgsui/example_data/structure_files/test.nopops.2row.30sites.str +0 -234
  92. pgsui/example_data/structure_files/test.nopops.2row.allsites.str +0 -234
  93. pgsui/example_data/structure_files/test.pops.1row.10sites.str +0 -117
  94. pgsui/example_data/structure_files/test.pops.2row.10sites.str +0 -234
  95. pgsui/example_data/trees/test.iqtree +0 -376
  96. pgsui/example_data/trees/test.qmat +0 -5
  97. pgsui/example_data/trees/test.rate +0 -2033
  98. pgsui/example_data/trees/test.tre +0 -1
  99. pgsui/example_data/trees/test_n10.rate +0 -19
  100. pgsui/example_data/trees/test_n100.rate +0 -109
  101. pgsui/example_data/trees/test_n500.rate +0 -509
  102. pgsui/example_data/trees/test_siterates.txt +0 -2024
  103. pgsui/example_data/trees/test_siterates_n10.txt +0 -10
  104. pgsui/example_data/trees/test_siterates_n100.txt +0 -100
  105. pgsui/example_data/trees/test_siterates_n500.txt +0 -500
  106. pgsui/example_data/vcf_files/test.vcf +0 -244
  107. pgsui/example_data/vcf_files/test.vcf.gz +0 -0
  108. pgsui/example_data/vcf_files/test.vcf.gz.tbi +0 -0
  109. pgsui/impute/estimators.py +0 -1268
  110. pgsui/impute/impute.py +0 -1463
  111. pgsui/impute/simple_imputers.py +0 -1431
  112. pgsui/impute/supervised/iterative_imputer_fixedparams.py +0 -782
  113. pgsui/impute/supervised/iterative_imputer_gridsearch.py +0 -1024
  114. pgsui/impute/unsupervised/keras_classifiers.py +0 -697
  115. pgsui/impute/unsupervised/models/in_development/cnn_model.py +0 -486
  116. pgsui/impute/unsupervised/neural_network_imputers.py +0 -1440
  117. pgsui/impute/unsupervised/neural_network_methods.py +0 -1395
  118. pgsui/pg_sui.py +0 -261
  119. pgsui/utils/sequence_tools.py +0 -407
  120. simulation/sim_benchmarks.py +0 -333
  121. simulation/sim_treeparams.py +0 -475
  122. test/__init__.py +0 -0
  123. test/pg_sui_simtest.py +0 -215
  124. test/pg_sui_testing.py +0 -523
  125. test/test.py +0 -151
  126. test/test_pgsui.py +0 -374
  127. test/test_tkc.py +0 -185
@@ -1,475 +0,0 @@
1
- #!/usr/bin/env python
2
- import sys
3
- import os
4
- import subprocess
5
- import errno
6
-
7
- import toytree
8
- import toyplot.pdf
9
- import pyvolve
10
- import copy
11
- import random
12
- import re
13
- import numpy as np
14
- import pandas as pd
15
-
16
- import matplotlib.pyplot as plt
17
-
18
def main():
    """Simulate SNP alignments for PG-SUI using pyvolve and toytree.

    Two ways to run:
        - Simulate SNPs along a 'pseudo-chromosome' from which SNPs are sampled
        - Simulate genes/ loci, sampling SNPs separately for each

    Pseudo-chromosome(s):
        Set num_loci = 1 (or number of desired chromosomes)
        loc_length = chromosome length (e.g., 50000)
        snps_per_locus = # total snps you want (e.g., 1000)
        make_gene_trees = False

    Separate loci (e.g., you want gene trees)
        Set num_loci = number of genes/ loci (e.g., 1000)
        loc_length = locus length (e.g., 500 or 1000)
        snps_per_locus = 1 (usually, but could be higher)
        make_gene_trees = True (if you want gene trees)

    If you just want a SNP matrix for testing PG-SUI, option 1
    is faster, and functionally not very different given the simulation
    model has no explicit mechanism of linkage (i.e. so site independence
    is true regardless). The second option will create
    a greater amount of rate heterogeneity, since each locus will be
    initialized with its own rate matrix (GTR or GTR+Gamma). In either case,
    setting write_gene_alignments = True will create sub-directories called
    'full_alignments/' containing the full sequences from which snps_per_locus
    number of SNPs will be sampled (without replacement) and concatenated to
    create the final outputs.

    NOTE: This script is not a part of the PG-SUI API, and is written
    for a single purpose, i.e., is not generalized beyond some options
    which can be manually set below. It is intended to provide transparency
    for the simulation process used in the PG-SUI manuscript, *not* as
    a flexible/ portable tool -- meaning a lot of things are hard-coded.
    """
    seed = 1234
    random.seed(seed)
    # Substitution-model parameters, SNP sampling and introgression all draw
    # from numpy's global RNG, so it must be seeded as well for a run to be
    # repeatable.
    np.random.seed(seed)

    num_clades = 4
    samples_per_clade = 20
    num_loci = 1000
    loc_length = 250
    write_gene_alignments = False
    make_gene_trees = False
    make_guidetrees = True  # set to true to run IQTREE on simulated SNP matrices
    keep_all = False  # set to true to keep ALL iqtree outputs
    keep_report = True  # set to true to keep .iqtree files
    get_siterates = True  # set to true to infer site-specific rates in IQTREE
    snps_per_locus = 1
    iqtree_bin = "iqtree2"
    iq_procs = 4

    ###################

    if get_siterates and not make_guidetrees:
        print("ERROR: can't set get_siterates=True and make_guidetrees=False")
        print("Setting make_guidetrees=True and proceeding...")
        make_guidetrees = True

    if make_gene_trees and not write_gene_alignments:
        # Gene trees need the per-locus FASTA files on disk; report the
        # conflict once instead of once per simulated locus.
        print("ERROR: Can't make gene trees when write_gene_alignments = False")
        make_gene_trees = False

    clades = ["pop" + str(i) for i in range(num_clades)]
    poplabels = []
    indlabels = []
    for clade in clades:
        for j in range(samples_per_clade):
            poplabels.append(clade)
            indlabels.append(clade + "_" + str(j))
    # The last clade is used as the outgroup when re-rooting inferred trees.
    outgroup = "pop" + str(num_clades - 1) + "_"

    # Introgression is modeled as contemporary exchange from pop2 -> pop1.
    source_pool = [ind for ind, pop in zip(indlabels, poplabels) if pop == "pop2"]
    target_pool = [ind for ind, pop in zip(indlabels, poplabels) if pop == "pop1"]

    ####### Varying clade vs. stem heights
    for clade_height in [0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009]:
        print("clade heights: ", clade_height)
        stem_height = np.around(0.01 - clade_height, decimals=3)
        print("stem height: ", stem_height)

        # skeleton tree as newick
        skeleton_tree = toytree.rtree.unittree(
            ntips=num_clades,
            treeheight=stem_height,
            random_names=False,
            seed=random.randint(1, (sys.maxsize * 2 + 1)),
        ).write(tree_format=5)

        # Graft one random clade tree onto each tip of the skeleton. The
        # replacement below relies on the generated tips being named
        # r0, r1, ... in both the skeleton and the clade trees.
        guidetree = skeleton_tree
        for pop_idx, clade in enumerate(clades):
            clade_tree = toytree.rtree.unittree(
                ntips=samples_per_clade,
                treeheight=clade_height,
                random_names=False,
                seed=random.randint(1, (sys.maxsize * 2 + 1)),
            ).write(tree_format=5)
            clade_tree = clade_tree.replace(";", "")
            # A single substitution renames every tip (r7 -> pop0_7, ...).
            clade_tree = re.sub("r", (clade + "_"), clade_tree)
            guidetree = guidetree.replace(("r" + str(pop_idx)), clade_tree)

        base = "c" + str(clade_height) + "_s" + str(stem_height)
        tobj = toytree.tree(guidetree, tree_format=0)

        # Set up directory structure for this set of tree params
        treeset_path = "sim_" + base
        if not os.path.exists(treeset_path):
            os.mkdir(treeset_path)

        # save guide trees
        basic_tree_plot(tobj, (treeset_path + "/" + base + "_guidetree.pdf"))
        tobj.write((treeset_path + "/" + base + "_guidetree.tre"), tree_format=5)

        my_tree = pyvolve.read_tree(tree=guidetree)

        ######## With and without rate heterogeneity
        # NOTE: Run alignments through IQ-TREE to get optional
        # Rate matrix and site-specific mutation rates
        # for model in ["gtr","gtrgamma"]:
        for model in ["gtrgamma"]:
            model_outpath = treeset_path + "/" + base + "_" + model
            if not os.path.exists(model_outpath):
                os.mkdir(model_outpath)

            if write_gene_alignments:
                fasta_outpath = model_outpath + "/full_alignments"
                if not os.path.exists(fasta_outpath):
                    os.mkdir(fasta_outpath)
            else:
                fasta_outpath = model_outpath

            # Fresh SNP matrix per model so loci simulated under one model
            # never leak into another model's alignment.
            data = {ind: [] for ind in indlabels}

            for locus in range(num_loci):
                print(locus)
                f = np.random.random(4)
                f /= f.sum()
                parameters = {
                    "mu": {
                        "AC": np.random.uniform(low=0.0, high=1.0),
                        "AG": np.random.uniform(low=0.0, high=1.0),
                        "AT": np.random.uniform(low=0.0, high=1.0),
                        "CG": np.random.uniform(low=0.0, high=1.0),
                        "CT": np.random.uniform(low=0.0, high=1.0),
                        "GT": np.random.uniform(low=0.0, high=1.0),
                    },
                    "state_freqs": [f[0], f[1], f[2], f[3]],
                }
                if model == "gtr":
                    # GTR model, without rate heterogeneity
                    my_model = pyvolve.Model("nucleotide", parameters)
                else:
                    my_model = pyvolve.Model(
                        "nucleotide",
                        parameters,
                        rate_factors=[
                            np.random.uniform(low=0.1, high=0.7, size=1),
                            np.random.uniform(low=0.5, high=1.2, size=1),
                            np.random.uniform(low=1.0, high=1.8, size=1),
                            np.random.uniform(low=1.5, high=5.0, size=1),
                        ],
                        rate_probs=[0.4, 0.3, 0.2, 0.1],
                    )

                fastaout = (
                    fasta_outpath + "/" + base + "_" + model
                    + "_loc" + str(locus) + "_gene-alignment.fasta"
                )
                # sample a gene alignment
                loc = sample_locus(my_tree, my_model, loc_length, snps_per_locus, fastaout)

                if loc:
                    # sample SNP(s) from gene alignment
                    sampled = sample_snp(read_fasta(fastaout), loc_length, snps_per_locus)
                    if sampled is not None:
                        data = add_locus(data, sampled)

                if not write_gene_alignments:
                    # silentremove tolerates a locus whose FASTA was never written
                    silentremove(fastaout)
                elif make_gene_trees and loc:
                    run_iqtree(fastaout,
                               iqtree_path=iqtree_bin,
                               keep_all=keep_all,
                               keep_report=keep_report,
                               rates=get_siterates,
                               procs=iq_procs)
                    reroot_tree(tree=(fastaout + ".treefile"),
                                rooted=(fastaout + ".rooted.tre"),
                                outgroup_wildcard=outgroup)

            # write full SNP alignment & generate tree
            all_snp_out = model_outpath + "/" + base + "_" + model + "_base-snps-concat.phylip"
            write_phylip(data, all_snp_out)
            if make_guidetrees:
                run_iqtree(all_snp_out,
                           iqtree_path=iqtree_bin,
                           keep_all=keep_all,
                           keep_report=keep_report,
                           rates=get_siterates,
                           procs=iq_procs)
                reroot_tree(tree=(all_snp_out + ".treefile"),
                            rooted=(all_snp_out + ".rooted.tre"),
                            outgroup_wildcard=outgroup)

            ######## Varying introgression weight
            model_base = model_outpath + "/" + base + "_" + model

            for alpha in [0.1, 0.0, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
                # for alpha in [0.1, 0.5, 0.9]:
                alpha_base = model_base + "_i" + str(alpha)

                # TO-DO: write a logfile recording which loci are introgressed?
                if alpha == 0.0:
                    # nothing is modified, the base matrix is only written out
                    introgressed_data = data
                else:
                    # Hand over a private deep copy so every alpha starts from
                    # the un-introgressed matrix rather than inheriting the
                    # edits made for the previous alpha.
                    introgressed_data = hybridization(copy.deepcopy(data),
                                                      prob=alpha,
                                                      source=source_pool,
                                                      target=target_pool)

                introgessed_aln_out = alpha_base + "_base-snps-concat.phylip"
                write_phylip(introgressed_data, introgessed_aln_out)
                if make_guidetrees:
                    run_iqtree(introgessed_aln_out,
                               iqtree_path=iqtree_bin,
                               keep_all=keep_all,
                               keep_report=keep_report,
                               rates=get_siterates,
                               procs=iq_procs)
                    reroot_tree(tree=(introgessed_aln_out + ".treefile"),
                                rooted=(introgessed_aln_out + ".rooted.tre"),
                                outgroup_wildcard=outgroup)
252
-
253
def reroot_tree(tree, rooted="out.rooted.tre", outgroup_wildcard="out"):
    """Re-root a tree on the tips matching a wildcard and write it to disk.

    Args:
        tree: Tree input handed to ``toytree.tree``.
        rooted: Output path for the written tree.
        outgroup_wildcard: Wildcard passed to ``root`` to select the outgroup.

    Returns:
        The re-rooted tree, or None when rooting (or writing the rooted tree)
        failed; in that case the tree as loaded is written instead.
    """
    loaded = toytree.tree(tree)
    try:
        rerooted = loaded.root(wildcard=outgroup_wildcard)
        rerooted.write(rooted, tree_format=5)
    except Exception:
        # fall back to writing the tree exactly as it was read
        loaded.write(rooted, tree_format=5)
        return None
    return rerooted
262
-
263
-
264
def hybridization(dat, prob=0.1, source=None, target=None):
    """Return a copy of an alignment with alleles copied from source to target.

    For every individual in ``target``, ``int(aln_len * prob)`` distinct sites
    are drawn without replacement, and at each of them the allele of a
    randomly chosen ``source`` individual is copied in. Draws use numpy's
    global RNG.

    Args:
        dat: Mapping of individual name -> sequence (list or string of
            alleles). All sequences are assumed to have the same length.
        prob: Fraction of sites to overwrite in each target individual.
        source: Donor individual names; defaults to every key of ``dat``.
        target: Recipient individual names; defaults to every key of ``dat``.

    Returns:
        A new dict of name -> list of alleles. ``dat`` itself is not modified,
        so repeated calls with different ``prob`` values all start from the
        same un-introgressed alignment.
    """
    # Copy each sequence: assigning the original list objects would make every
    # edit below write straight through to the caller's data.
    new_dat = {individual: list(seq) for individual, seq in dat.items()}
    if not new_dat:
        return new_dat

    if source is None:
        source = list(dat)
    if target is None:
        target = list(dat)

    aln_len = len(next(iter(new_dat.values())))
    num = int(aln_len * prob)

    for target_individual in target:
        snp_indices = np.random.choice(aln_len, size=num, replace=False)
        for index in snp_indices:
            source_ind = np.random.choice(source, size=1)[0]
            # Read the donor allele from the untouched input so a donor that
            # is also a target contributes its original state.
            new_dat[target_individual][index] = dat[source_ind][index]
    return new_dat
283
-
284
def run_iqtree(aln,
               iqtree_path="iqtree",
               keep_all=False,
               keep_report=False,
               outgroup=None,
               rates=False,
               procs=4):
    """Run IQ-TREE on an alignment and tidy up its side files.

    The command is ``<iqtree_path> -s <aln> -m GTR+I*G4 -redo -T <procs>``,
    plus ``-o <outgroup>`` and ``--mlrate`` when requested.

    Args:
        aln: Path to the alignment file (string).
        iqtree_path: IQ-TREE executable name or path.
        keep_all: Keep every IQ-TREE output file when True.
        keep_report: Keep the ``.iqtree`` report; only consulted when
            ``keep_all`` is False.
        outgroup: Optional outgroup passed with ``-o``.
        rates: Request site-specific rates (``--mlrate``).
        procs: Thread count passed with ``-T``.

    Returns:
        The expected tree file path, ``aln + ".treefile"``.

    Raises:
        RuntimeError: If IQ-TREE exits with a non-zero status. The side files
            are left in place in that case so the run can be inspected.
    """
    cmd = [iqtree_path, "-s", str(aln), "-m", "GTR+I*G4", "-redo", "-T", str(procs)]
    if outgroup is not None:
        cmd.extend(["-o", str(outgroup)])
    if rates:
        # NOTE(review): the original author's note says IQ-TREE 1.6.x needs
        # its older site-rate flag (-wsr) here instead of --mlrate.
        cmd.append("--mlrate")

    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        # Fail here with the tool's own output instead of letting the caller
        # trip over a missing .treefile later on.
        raise RuntimeError(
            "IQ-TREE failed with exit status " + str(result.returncode)
            + " for " + str(aln) + "\n"
            + result.stdout[-2000:] + result.stderr[-2000:]
        )

    if not keep_all:
        # delete everything except treefile (and optionally the report)
        for suffix in (".bionj", ".ckp.gz", ".log", ".mldist", ".uniqueseq.phy"):
            silentremove(aln + suffix)
        if not keep_report:
            silentremove(aln + ".iqtree")
    return aln + ".treefile"
322
-
323
def add_locus(d, new):
    """Append the SNP calls in ``new`` to the per-sample lists in ``d``.

    Args:
        d: Mapping of sample name -> list of calls; extended in place.
        new: Mapping of sample name -> iterable of calls for the new locus.
            It must contain every key of ``d``.

    Returns:
        The same ``d`` object, after extension.
    """
    for sample, calls in d.items():
        calls.extend(new[sample])
    return d
328
-
329
def write_fasta(seqs, fas):
    """Write sequences to ``fas`` in FASTA format, one line per sequence.

    Args:
        seqs: Mapping of sample name -> sequence (string or list of strings).
        fas: Output file path; overwritten if it exists.
    """
    with open(fas, 'w') as handle:
        for sample, residues in seqs.items():
            handle.write(">" + str(sample) + "\n")
            handle.write("".join(residues) + "\n")
338
-
339
def write_phylip(seqs, phy):
    """Write sequences to ``phy`` as a tab-separated sequential PHYLIP file.

    The first line is ``<n_samples>\\t<n_sites>``; each following line is
    ``<name>\\t<sequence>``.

    Args:
        seqs: Mapping of sample name -> sequence (string or list of strings).
        phy: Output file path; overwritten if it exists.

    Raises:
        ValueError: If the sequences do not all have the same length. Nothing
            is written in that case.
    """
    lengths = {len(seq) for seq in seqs.values()}
    if len(lengths) > 1:
        raise ValueError(
            "Error writing file " + str(phy) + " - sequences not equal length\n"
        )
    # An empty alignment has zero sites (never the literal text "None").
    n_sites = lengths.pop() if lengths else 0

    with open(phy, 'w') as fh:
        fh.write(str(len(seqs)) + "\t" + str(n_sites) + "\n")
        for name, seq in seqs.items():
            fh.write(str(name) + "\t" + "".join(seq) + "\n")
357
-
358
def read_phylip(phy):
    """Read a sequential PHYLIP file into a dict of name -> sequence string.

    Blank lines are skipped and the first non-blank line is treated as the
    header and discarded. On every other line the first whitespace-separated
    token is the sample name and the remaining tokens are joined into the
    sequence, so sequences written in space-separated chunks are kept whole.

    Args:
        phy: Path to the PHYLIP file.

    Returns:
        Dict mapping sample name to its sequence string.

    Raises:
        ValueError: If a record line contains a name but no sequence.
    """
    data = {}
    header_seen = False
    with open(phy, "r") as fin:
        for line in fin:
            fields = line.split()
            if not fields:  # blank line
                continue
            if not header_seen:
                header_seen = True
                continue
            if len(fields) < 2:
                raise ValueError(
                    "Malformed PHYLIP record (no sequence) in "
                    + str(phy) + ": " + line.strip()
                )
            data[fields[0]] = "".join(fields[1:])
    return data
376
-
377
def read_fasta(fasta):
    """Read a FASTA file into a dict of record name -> sequence string.

    Multi-line sequences are concatenated and blank lines are ignored. Text
    that appears before the first ``>`` header belongs to no record and is
    skipped. An input without any header yields an empty dict.

    Args:
        fasta: Path to the FASTA file.

    Returns:
        Dict mapping each header (without the leading ``>``) to its sequence.
    """
    data = {}
    sample = None
    chunks = []
    with open(fasta, "r") as fin:
        for line in fin:
            line = line.strip()
            if not line:  # blank line
                continue
            if line[0] == ">":
                if sample is not None:
                    data[sample] = "".join(chunks)
                sample = line[1:]
                chunks = []
            elif sample is not None:
                chunks.append(line)
    # flush the final record, if the file contained any
    if sample is not None:
        data[sample] = "".join(chunks)
    return data
397
-
398
def sample_snp(aln_dict, aln_len, snps_per_locus=1):
    """Pick variable sites (SNPs) from an alignment.

    A site counts as variable when any sample differs from the first sample
    at that position. Random draws use numpy's global RNG.

    Args:
        aln_dict: Mapping of sample name -> sequence (indexable by position).
        aln_len: Number of leading alignment columns to scan.
        snps_per_locus: Number of SNPs to draw without replacement. When fewer
            variable sites exist than requested, all of them are returned
            instead of raising.

    Returns:
        None when no site is variable. Otherwise a dict of sample name ->
        calls: a single character when ``aln_len`` is 1 (the only column is
        returned unconditionally) or exactly one variable site exists, else a
        list of characters, one per sampled site.
    """
    if aln_len == 1:
        return {sample: seq[0] for sample, seq in aln_dict.items()}

    snp_indices = []
    for i in range(aln_len):
        column = [seq[i] for seq in aln_dict.values()]
        if any(nuc != column[0] for nuc in column[1:]):
            snp_indices.append(i)

    if not snp_indices:
        return None
    if len(snp_indices) == 1:
        only = snp_indices[0]
        return {sample: seq[only] for sample, seq in aln_dict.items()}

    # Never ask for more sites than exist: choice(..., replace=False) raises
    # ValueError when size exceeds the population.
    n_draw = min(snps_per_locus, len(snp_indices))
    sampled_indices = np.random.choice(snp_indices, size=n_draw, replace=False)
    return {
        sample: [seq[i] for i in sampled_indices]
        for sample, seq in aln_dict.items()
    }
429
-
430
def sample_locus(tree, model, gene_len=1000, num_snps=1, out="out.fasta"):
    """Simulate one locus along ``tree`` with pyvolve and write it as FASTA.

    Args:
        tree: Tree object passed to ``pyvolve.Evolver``.
        model: Model passed to ``pyvolve.Partition``.
        gene_len: Partition size (number of simulated sites).
        num_snps: Unused; kept so existing callers keep working.
        out: Output FASTA path.

    Returns:
        True when the simulation completed, False when anything raised. The
        failure reason is reported on stderr instead of being discarded.
    """
    try:
        partition = pyvolve.Partition(models=model, size=gene_len)
        evolver = pyvolve.Evolver(partitions=partition, tree=tree)
        evolver(seqfile=out,
                seqfmt="fasta",
                ratefile=False,
                infofile=False)
    except Exception as err:
        # Still best-effort (the caller skips this locus), but say why.
        print("WARNING: simulation failed for " + str(out) + ": " + repr(err),
              file=sys.stderr)
        return False
    return True
441
-
442
def silentremove(filename):
    """Delete ``filename``, doing nothing if it does not exist.

    Any other failure (for example a permission problem) is re-raised.
    """
    try:
        os.remove(filename)
    except FileNotFoundError:
        # no such file or directory: nothing to clean up
        pass
448
-
449
def get_tree_tips(tree):
    """Split a Newick string into its non-empty label tokens.

    The string is split on spaces, commas, parentheses and semicolons. Tokens
    are returned as they appear, so anything attached to a label (such as a
    ``:length`` suffix) stays part of the token.

    Args:
        tree: Newick-formatted tree string.

    Returns:
        List of non-empty tokens in order of appearance.
    """
    # Raw string: "\(" and "\)" are invalid escapes in a normal literal.
    tokens = re.split(r"[ ,();]", tree)
    return [token for token in tokens if token]
452
-
453
def basic_tree_plot(tree, out="out.pdf"):
    """Draw ``tree`` with a fixed style and render it to a PDF.

    Args:
        tree: Object exposing a toytree-style ``draw`` method.
        out: Destination PDF path.
    """
    edge_style = {
        "stroke": toytree.colors[0],
        "stroke-width": 1,
    }
    draw_options = dict(
        edge_type="p",
        edge_style=edge_style,
        tip_labels_align=True,
        tip_labels_style={"font-size": "5px"},
        node_labels=False,
        tip_labels=True,
    )

    # draw() hands back three objects; only the canvas is rendered
    canvas, _axes, _mark = tree.draw(width=400, height=600, **draw_options)
    toyplot.pdf.render(canvas, out)
473
-
474
- if __name__ == "__main__":
475
- main()
test/__init__.py DELETED
File without changes
test/pg_sui_simtest.py DELETED
@@ -1,215 +0,0 @@
1
- #!/usr/bin/env python
2
-
3
- # Standard library imports
4
- import argparse
5
- import sys
6
-
7
- import numpy as np
8
- import pandas as pd
9
- import scipy.stats as stats
10
- from sklearn_genetic.space import Continuous, Categorical, Integer
11
-
12
- from utils.misc import get_processor_name
13
- from utils.misc import generate_012_genotypes
14
-
15
- # Custom module imports
16
- from snpio import GenotypeData
17
- from read_input.simgenodata import SimGenotypeData
18
- from impute.estimators import *
19
- from impute.simple_imputers import *
20
-
21
-
22
def main():
    """Load a genotype file, simulate missing data and print imputer accuracy.

    Exactly one of ``--str`` (STRUCTURE) or ``--phylip`` must be supplied. The
    loaded data is wrapped in ``SimGenotypeData`` with 10% of genotypes
    masked at random, then each imputer is run and its accuracy printed.

    Raises:
        TypeError: If the population information required for the chosen
            file type is missing.
    """
    args = get_arguments()

    if args.str and args.phylip:
        sys.exit("Error: Only one file type can be specified")
    if not args.str and not args.phylip:
        # Without this check no input is ever loaded and the script would
        # fail further down with a NameError.
        sys.exit("Error: Either --str or --phylip must be specified")

    if args.str:
        # STRUCTURE file was specified.
        if not args.pop_ids and args.popmap is None:
            raise TypeError("Either --pop_ids or --popmap must be specified\n")

        if args.pop_ids:
            print("\n--pop_ids was specified as column 2\n")
        else:
            print(
                "\n--pop_ids was not specified; "
                "using popmap file to get population IDs\n"
            )

        if args.onerow_perind:
            print("\nUsing one row per individual...\n")
            structure_type = "structure1row"
        else:
            print("\nUsing two rows per individual...\n")
            structure_type = "structure2row"

        data = GenotypeData(
            filename=args.str,
            filetype=structure_type,
            popmapfile=args.popmap,
            guidetree=args.treefile,
            qmatrix_iqtree=args.iqtree,
        )
    else:
        # PHYLIP file was specified.
        if args.pop_ids or args.onerow_perind:
            print(
                "\nPhylip file was used with structure arguments; ignoring "
                "structure file arguments\n"
            )

        if args.popmap is None:
            raise TypeError("No popmap file supplied with PHYLIP file\n")

        data = GenotypeData(
            filename=args.phylip,
            filetype="phylip",
            popmapfile=args.popmap,
            guidetree=args.treefile,
            qmatrix_iqtree=args.iqtree,
            siterates_iqtree=args.rates,
        )

    sim = SimGenotypeData(data, prop_missing=0.1, strategy="random")

    nmf = ImputeNMF(genotype_data=sim)
    print("Accuracy:", sim.accuracy(nmf))

    phylo = ImputePhylo(genotype_data=sim, save_plots=False)
    print("Accuracy:", sim.accuracy(phylo))

    nlpca = ImputeNLPCA(
        genotype_data=sim, initial_strategy="populations", cv=5
    )
    print("Accuracy:", sim.accuracy(nlpca))

    ubp = ImputeUBP(genotype_data=sim, initial_strategy="populations")
    print("Accuracy:", sim.accuracy(ubp))

    # NOTE: the ImputeVAE run was commented out in the original script; add it
    # here in the same way as the imputers above if it is needed again.
113
-
114
-
115
def get_arguments(argv=None):
    """Parse command-line arguments.

    Args:
        argv: Optional list of argument strings, without the program name.
            Defaults to ``sys.argv[1:]``, so existing zero-argument calls
            behave as before; passing a list allows programmatic use.

    Returns:
        argparse.Namespace: Parsed arguments, accessed as attributes.

    Raises:
        SystemExit: With status 1 (after printing the help menu) when no
            arguments were given; argparse itself exits on invalid input.
    """
    if argv is None:
        argv = sys.argv[1:]

    parser = argparse.ArgumentParser(
        description="Simulate missing data on GenotypeData object",
        add_help=False,
    )

    required_args = parser.add_argument_group("Required arguments")
    filetype_args = parser.add_argument_group(
        "File type arguments (choose only one)"
    )
    structure_args = parser.add_argument_group("Structure file arguments")
    optional_args = parser.add_argument_group("Optional arguments")

    # File Type arguments
    filetype_args.add_argument(
        "-s", "--str", type=str, required=False, help="Input structure file"
    )
    filetype_args.add_argument(
        "-p", "--phylip", type=str, required=False, help="Input phylip file"
    )

    filetype_args.add_argument(
        "-t",
        "--treefile",
        type=str,
        required=False,
        default=None,
        help="Newick-formatted treefile",
    )

    # Structure Arguments
    structure_args.add_argument(
        "--onerow_perind",
        default=False,
        action="store_true",
        help="Toggles on one row per individual option in structure file",
    )
    structure_args.add_argument(
        "--pop_ids",
        default=False,
        required=False,
        action="store_true",
        help="Toggles on population ID column (2nd col) in structure file",
    )

    ## Optional Arguments
    optional_args.add_argument(
        "-m",
        "--popmap",
        type=str,
        required=False,
        default=None,
        help="Two-column tab-separated population map file: inds\tpops. No header line",
    )
    optional_args.add_argument(
        "-i",
        "--iqtree",
        type=str,
        required=False,
        help=".iqtree output file containing Rate Matrix Q",
    )

    optional_args.add_argument(
        "-r",
        "--rates",
        type=str,
        required=False,
        help="IQ-TREE site-rates output file",
    )

    optional_args.add_argument(
        "--prefix",
        type=str,
        required=False,
        default="output",
        help="Prefix for output files",
    )

    # Add help menu
    optional_args.add_argument(
        "-h", "--help", action="help", help="Displays this help menu"
    )

    # If no command-line arguments are called then exit and call help menu.
    if not argv:
        print("\nExiting because no command-line options were called.\n")
        parser.print_help(sys.stderr)
        sys.exit(1)

    return parser.parse_args(argv)
212
-
213
-
214
- if __name__ == "__main__":
215
- main()