pg-sui 0.2.0__py3-none-any.whl → 1.6.14.dev9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pg_sui-0.2.0.dist-info → pg_sui-1.6.14.dev9.dist-info}/METADATA +101 -79
- pg_sui-1.6.14.dev9.dist-info/RECORD +81 -0
- {pg_sui-0.2.0.dist-info → pg_sui-1.6.14.dev9.dist-info}/WHEEL +1 -1
- pg_sui-1.6.14.dev9.dist-info/entry_points.txt +4 -0
- {pg_sui-0.2.0.dist-info → pg_sui-1.6.14.dev9.dist-info/licenses}/LICENSE +0 -0
- pg_sui-1.6.14.dev9.dist-info/top_level.txt +1 -0
- pgsui/__init__.py +35 -54
- pgsui/_version.py +34 -0
- pgsui/cli.py +909 -0
- pgsui/data_processing/__init__.py +0 -0
- pgsui/data_processing/config.py +565 -0
- pgsui/data_processing/containers.py +1424 -0
- pgsui/data_processing/transformers.py +557 -907
- pgsui/{example_data/trees → electron/app}/__init__.py +0 -0
- pgsui/electron/app/__main__.py +5 -0
- pgsui/electron/app/extra-resources/.gitkeep +1 -0
- pgsui/electron/app/icons/icons/1024x1024.png +0 -0
- pgsui/electron/app/icons/icons/128x128.png +0 -0
- pgsui/electron/app/icons/icons/16x16.png +0 -0
- pgsui/electron/app/icons/icons/24x24.png +0 -0
- pgsui/electron/app/icons/icons/256x256.png +0 -0
- pgsui/electron/app/icons/icons/32x32.png +0 -0
- pgsui/electron/app/icons/icons/48x48.png +0 -0
- pgsui/electron/app/icons/icons/512x512.png +0 -0
- pgsui/electron/app/icons/icons/64x64.png +0 -0
- pgsui/electron/app/icons/icons/icon.icns +0 -0
- pgsui/electron/app/icons/icons/icon.ico +0 -0
- pgsui/electron/app/main.js +227 -0
- pgsui/electron/app/package-lock.json +6894 -0
- pgsui/electron/app/package.json +51 -0
- pgsui/electron/app/preload.js +15 -0
- pgsui/electron/app/server.py +157 -0
- pgsui/electron/app/ui/logo.png +0 -0
- pgsui/electron/app/ui/renderer.js +131 -0
- pgsui/electron/app/ui/styles.css +59 -0
- pgsui/electron/app/ui/ui_shim.js +72 -0
- pgsui/electron/bootstrap.py +43 -0
- pgsui/electron/launch.py +57 -0
- pgsui/electron/package.json +14 -0
- pgsui/example_data/__init__.py +0 -0
- pgsui/example_data/phylip_files/__init__.py +0 -0
- pgsui/example_data/phylip_files/test.phy +0 -0
- pgsui/example_data/popmaps/__init__.py +0 -0
- pgsui/example_data/popmaps/{test.popmap → phylogen_nomx.popmap} +185 -99
- pgsui/example_data/structure_files/__init__.py +0 -0
- pgsui/example_data/structure_files/test.pops.2row.allsites.str +0 -0
- pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz +0 -0
- pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz.tbi +0 -0
- pgsui/impute/__init__.py +0 -0
- pgsui/impute/deterministic/imputers/allele_freq.py +725 -0
- pgsui/impute/deterministic/imputers/mode.py +844 -0
- pgsui/impute/deterministic/imputers/nmf.py +221 -0
- pgsui/impute/deterministic/imputers/phylo.py +973 -0
- pgsui/impute/deterministic/imputers/ref_allele.py +669 -0
- pgsui/impute/supervised/__init__.py +0 -0
- pgsui/impute/supervised/base.py +343 -0
- pgsui/impute/{unsupervised/models/in_development → supervised/imputers}/__init__.py +0 -0
- pgsui/impute/supervised/imputers/hist_gradient_boosting.py +317 -0
- pgsui/impute/supervised/imputers/random_forest.py +291 -0
- pgsui/impute/unsupervised/__init__.py +0 -0
- pgsui/impute/unsupervised/base.py +1118 -0
- pgsui/impute/unsupervised/callbacks.py +92 -262
- {simulation → pgsui/impute/unsupervised/imputers}/__init__.py +0 -0
- pgsui/impute/unsupervised/imputers/autoencoder.py +1285 -0
- pgsui/impute/unsupervised/imputers/nlpca.py +1554 -0
- pgsui/impute/unsupervised/imputers/ubp.py +1575 -0
- pgsui/impute/unsupervised/imputers/vae.py +1228 -0
- pgsui/impute/unsupervised/loss_functions.py +261 -0
- pgsui/impute/unsupervised/models/__init__.py +0 -0
- pgsui/impute/unsupervised/models/autoencoder_model.py +215 -567
- pgsui/impute/unsupervised/models/nlpca_model.py +155 -394
- pgsui/impute/unsupervised/models/ubp_model.py +180 -1106
- pgsui/impute/unsupervised/models/vae_model.py +269 -630
- pgsui/impute/unsupervised/nn_scorers.py +255 -0
- pgsui/utils/__init__.py +0 -0
- pgsui/utils/classification_viz.py +608 -0
- pgsui/utils/logging_utils.py +22 -0
- pgsui/utils/misc.py +35 -480
- pgsui/utils/plotting.py +996 -829
- pgsui/utils/pretty_metrics.py +290 -0
- pgsui/utils/scorers.py +213 -666
- pg_sui-0.2.0.dist-info/RECORD +0 -75
- pg_sui-0.2.0.dist-info/top_level.txt +0 -3
- pgsui/example_data/phylip_files/test_n10.phy +0 -118
- pgsui/example_data/phylip_files/test_n100.phy +0 -118
- pgsui/example_data/phylip_files/test_n2.phy +0 -118
- pgsui/example_data/phylip_files/test_n500.phy +0 -118
- pgsui/example_data/structure_files/test.nopops.1row.10sites.str +0 -117
- pgsui/example_data/structure_files/test.nopops.2row.100sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.10sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.30sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.allsites.str +0 -234
- pgsui/example_data/structure_files/test.pops.1row.10sites.str +0 -117
- pgsui/example_data/structure_files/test.pops.2row.10sites.str +0 -234
- pgsui/example_data/trees/test.iqtree +0 -376
- pgsui/example_data/trees/test.qmat +0 -5
- pgsui/example_data/trees/test.rate +0 -2033
- pgsui/example_data/trees/test.tre +0 -1
- pgsui/example_data/trees/test_n10.rate +0 -19
- pgsui/example_data/trees/test_n100.rate +0 -109
- pgsui/example_data/trees/test_n500.rate +0 -509
- pgsui/example_data/trees/test_siterates.txt +0 -2024
- pgsui/example_data/trees/test_siterates_n10.txt +0 -10
- pgsui/example_data/trees/test_siterates_n100.txt +0 -100
- pgsui/example_data/trees/test_siterates_n500.txt +0 -500
- pgsui/example_data/vcf_files/test.vcf +0 -244
- pgsui/example_data/vcf_files/test.vcf.gz +0 -0
- pgsui/example_data/vcf_files/test.vcf.gz.tbi +0 -0
- pgsui/impute/estimators.py +0 -1268
- pgsui/impute/impute.py +0 -1463
- pgsui/impute/simple_imputers.py +0 -1431
- pgsui/impute/supervised/iterative_imputer_fixedparams.py +0 -782
- pgsui/impute/supervised/iterative_imputer_gridsearch.py +0 -1024
- pgsui/impute/unsupervised/keras_classifiers.py +0 -697
- pgsui/impute/unsupervised/models/in_development/cnn_model.py +0 -486
- pgsui/impute/unsupervised/neural_network_imputers.py +0 -1440
- pgsui/impute/unsupervised/neural_network_methods.py +0 -1395
- pgsui/pg_sui.py +0 -261
- pgsui/utils/sequence_tools.py +0 -407
- simulation/sim_benchmarks.py +0 -333
- simulation/sim_treeparams.py +0 -475
- test/__init__.py +0 -0
- test/pg_sui_simtest.py +0 -215
- test/pg_sui_testing.py +0 -523
- test/test.py +0 -151
- test/test_pgsui.py +0 -374
- test/test_tkc.py +0 -185
simulation/sim_treeparams.py
DELETED
|
@@ -1,475 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python
|
|
2
|
-
import sys
|
|
3
|
-
import os
|
|
4
|
-
import subprocess
|
|
5
|
-
import errno
|
|
6
|
-
|
|
7
|
-
import toytree
|
|
8
|
-
import toyplot.pdf
|
|
9
|
-
import pyvolve
|
|
10
|
-
import copy
|
|
11
|
-
import random
|
|
12
|
-
import re
|
|
13
|
-
import numpy as np
|
|
14
|
-
import pandas as pd
|
|
15
|
-
|
|
16
|
-
import matplotlib.pyplot as plt
|
|
17
|
-
|
|
18
|
-
def main():
    """
    Using pyvolve and toytree to simulate data for PG-SUI

    Two ways to run:
        - Simulate SNPs along a 'pseudo-chromosome' from which SNPs are sampled
        - Simulate genes/ loci, sampling SNPs separately for each

    Pseudo-chromosome(s):
        Set num_loci = 1 (or number of desired chromosomes)
            loc_length = chromosome length (e.g., 50000)
            snps_per_locus = # total snps you want (e.g., 1000)
            make_gene_trees = False

    Separate loci (e.g., you want gene trees)
        Set num_loci = number of genes/ loci (e.g., 1000)
            loc_length = locus length (e.g., 500 or 1000)
            snps_per_locus = 1 (usually, but could be higher)
            make_gene_trees = True (if you want gene trees)

    If you just want a SNP matrix for testing PG-SUI, option 1
    is faster, and functionally not very different given the simulation
    model has no explicit mechanism of linkage (i.e. so site independence
    is true regardless). The second option will create
    a greater amount of rate heterogeneity, since each locus will be
    initialized with its own rate matrix (GTR or GTR+Gamma). In either case,
    setting write_gene_alignments = True will create sub-directories called
    'full_alignments/' containing the full sequences from which snps_per_locus
    number of SNPs will be sampled (without replacement) and concatenated to
    create the final outputs.

    NOTE: This script is not a part of the PG-SUI API, and is written
    for a single purpose, i.e., is not generalized beyond some options
    which can be manually set below. It is intended to provide transparency
    for the simulation process used in the PG-SUI manuscript, *not* as
    a flexible/ portable tool -- meaning a lot of things are hard-coded.

    """
    seed = 1234
    random.seed(seed)
    # Rate matrices, SNP sampling and introgression all draw from numpy's
    # global generator, so it has to be seeded too for reproducible runs.
    np.random.seed(seed)

    num_clades = 4
    samples_per_clade = 20
    num_loci = 1000
    loc_length = 250
    write_gene_alignments = False
    make_gene_trees = False
    make_guidetrees = True  # set to true to run IQTREE on simulated SNP matrices
    keep_all = False  # set to true to keep ALL iqtree outputs
    keep_report = True  # set to true to keep .iqtree files
    get_siterates = True  # set to true to infer site-specific rates in IQTREE
    snps_per_locus = 1
    iqtree_bin = "iqtree2"
    iq_procs = 4

    ###################

    if get_siterates and not make_guidetrees:
        print("ERROR: can't set get_siterates=True and make_guidetrees=False")
        print("Setting make_guidetrees=True and proceeding...")
        make_guidetrees = True

    # Reported once up front instead of once per locus.
    if make_gene_trees and not write_gene_alignments:
        print("ERROR: Can't make gene trees when write_gene_alignments = False")

    clades = []
    poplabels = []
    indlabels = []
    for i in range(num_clades):
        clades.append("pop" + str(i))
        for j in range(samples_per_clade):
            poplabels.append("pop" + str(i))
            indlabels.append("pop" + str(i) + "_" + str(j))
    # The last population is used as the outgroup when rooting inferred trees.
    outgroup = "pop" + str(num_clades - 1) + "_"

    def infer_rooted_tree(alignment):
        # Run IQ-TREE on an alignment and write an outgroup-rooted copy.
        run_iqtree(alignment,
                   iqtree_path=iqtree_bin,
                   keep_all=keep_all,
                   keep_report=keep_report,
                   rates=get_siterates,
                   procs=iq_procs)
        reroot_tree(tree=(alignment + ".treefile"),
                    rooted=(alignment + ".rooted.tre"),
                    outgroup_wildcard=outgroup)

    ####### Varying clade vs. stem heights
    for clade_height in [0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009]:
        print("clade heights: ", clade_height)
        stem_height = np.around(0.01 - clade_height, decimals=3)
        print("stem height: ", stem_height)

        # skeleton tree as newick; its tips are named r0..r{num_clades-1}
        skeleton_tree = toytree.rtree.unittree(
            ntips=num_clades,
            treeheight=stem_height,
            random_names=False,
            seed=random.randint(1, (sys.maxsize * 2 + 1))).write(tree_format=5)

        # Graft one random clade tree onto each skeleton tip.
        guidetree = skeleton_tree
        for pop_idx, clade in enumerate(clades):
            clade_tree = toytree.rtree.unittree(
                ntips=samples_per_clade,
                treeheight=clade_height,
                random_names=False,
                seed=random.randint(1, (sys.maxsize * 2 + 1))).write(tree_format=5)
            clade_tree = clade_tree.replace(";", "")
            # One substitution renames every tip r<j> to <clade>_<j>.
            clade_tree = re.sub("r", (clade + "_"), clade_tree)
            guidetree = guidetree.replace(("r" + str(pop_idx)), clade_tree)

        base = "c" + str(clade_height) + "_s" + str(stem_height)
        tobj = toytree.tree(guidetree, tree_format=0)

        # Set up directory structure for this set of tree params
        treeset_path = "sim_" + base
        os.makedirs(treeset_path, exist_ok=True)

        # save guide trees
        basic_tree_plot(tobj, (treeset_path + "/" + base + "_guidetree.pdf"))
        tobj.write((treeset_path + "/" + base + "_guidetree.tre"), tree_format=5)

        ######## With and without rate heterogeneity
        # NOTE: Run alignments through IQ-TREE to get optional
        # Rate matrix and site-specific mutation rates
        my_tree = pyvolve.read_tree(tree=guidetree)

        # for model in ["gtr","gtrgamma"]:
        for model in ["gtrgamma"]:
            # Fresh SNP matrix per model so loci from one model never leak
            # into another model's output.
            data = {ind: [] for ind in indlabels}

            model_outpath = treeset_path + "/" + base + "_" + model
            os.makedirs(model_outpath, exist_ok=True)

            if write_gene_alignments:
                fasta_outpath = model_outpath + "/full_alignments"
                os.makedirs(fasta_outpath, exist_ok=True)
            else:
                fasta_outpath = model_outpath

            for locus in range(num_loci):
                print(locus)
                # Each locus gets its own random GTR parameters.
                f = np.random.random(4)
                f /= f.sum()
                parameters = {
                    "mu": {
                        "AC": np.random.uniform(low=0.0, high=1.0),
                        "AG": np.random.uniform(low=0.0, high=1.0),
                        "AT": np.random.uniform(low=0.0, high=1.0),
                        "CG": np.random.uniform(low=0.0, high=1.0),
                        "CT": np.random.uniform(low=0.0, high=1.0),
                        "GT": np.random.uniform(low=0.0, high=1.0),
                    },
                    "state_freqs": [f[0], f[1], f[2], f[3]],
                }
                if model == "gtr":
                    # GTR model, without rate heterogeneity
                    my_model = pyvolve.Model("nucleotide", parameters)
                else:
                    my_model = pyvolve.Model(
                        "nucleotide",
                        parameters,
                        rate_factors=[
                            np.random.uniform(low=0.1, high=0.7, size=1),
                            np.random.uniform(low=0.5, high=1.2, size=1),
                            np.random.uniform(low=1.0, high=1.8, size=1),
                            np.random.uniform(low=1.5, high=5.0, size=1),
                        ],
                        rate_probs=[0.4, 0.3, 0.2, 0.1])

                fastaout = (fasta_outpath + "/" + base + "_" + model
                            + "_loc" + str(locus) + "_gene-alignment.fasta")

                # sample a gene alignment
                loc = sample_locus(my_tree, my_model, loc_length, snps_per_locus, fastaout)

                if loc:
                    # sample SNP(s) from gene alignment
                    sampled = sample_snp(read_fasta(fastaout), loc_length, snps_per_locus)
                    if sampled is not None:
                        data = add_locus(data, sampled)

                    if write_gene_alignments and make_gene_trees:
                        infer_rooted_tree(fastaout)

                if not write_gene_alignments:
                    # Tolerates a missing file, e.g. when the simulation failed.
                    silentremove(fastaout)

            # write full SNP alignment & generate tree
            model_base = model_outpath + "/" + base + "_" + model
            all_snp_out = model_base + "_base-snps-concat.phylip"
            write_phylip(data, all_snp_out)
            if make_guidetrees:
                infer_rooted_tree(all_snp_out)

            ######## Varying introgression weight
            # Modeled as contemporary exchange from pop2 -> pop1
            source_pool = [indlabels[i] for i, pop in enumerate(poplabels) if pop == "pop2"]
            target_pool = [indlabels[i] for i, pop in enumerate(poplabels) if pop == "pop1"]

            for alpha in [0.1, 0.0, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
                # for alpha in [0.1, 0.5, 0.9]:
                alpha_base = model_base + "_i" + str(alpha)

                # TO-DO: write a logfile recording which loci are introgressed?
                if alpha == 0.0:
                    introgressed_data = copy.copy(data)
                else:
                    # Hand over a deep copy: every alpha must start from the
                    # un-introgressed matrix, not from the previous alpha's.
                    introgressed_data = hybridization(copy.deepcopy(data),
                                                      prob=alpha,
                                                      source=source_pool,
                                                      target=target_pool)

                introgressed_aln_out = alpha_base + "_base-snps-concat.phylip"
                write_phylip(introgressed_data, introgressed_aln_out)
                if make_guidetrees:
                    infer_rooted_tree(introgressed_aln_out)
|
|
252
|
-
|
|
253
|
-
def reroot_tree(tree, rooted="out.rooted.tre", outgroup_wildcard="out"):
    """Root a tree on the tips matching a wildcard and write it to disk.

    Args:
        tree: Tree input handed to ``toytree.tree`` (the callers in this
            script pass the path of an IQ-TREE ``.treefile``).
        rooted: Output path for the newick file.
        outgroup_wildcard: Substring passed to ``root(wildcard=...)`` to
            select the outgroup tips.

    Returns:
        The rooted tree, or None when rooting failed. In the failure case the
        *unrooted* tree is still written to ``rooted`` so downstream steps
        find a file, and a warning is printed to stderr.
    """
    t = toytree.tree(tree)
    try:
        rt = t.root(wildcard=outgroup_wildcard)
    except Exception as err:
        # Best effort: rooting can fail (e.g. no tip matches the wildcard).
        # Fall back to the unrooted tree, but say so instead of failing
        # silently under a "rooted" file name.
        print(
            f"WARNING: could not root {tree} on '{outgroup_wildcard}' "
            f"({err}); writing unrooted tree to {rooted}",
            file=sys.stderr,
        )
        t.write(rooted, tree_format=5)
        return None
    rt.write(rooted, tree_format=5)
    return rt
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
def hybridization(dat, prob=0.1, source=None, target=None):
    """Copy a fraction of sites from source individuals into target individuals.

    For every target individual, ``int(aln_len * prob)`` site indices are drawn
    without replacement; at each drawn site the allele is overwritten with the
    allele of a randomly chosen source individual.

    Args:
        dat: Mapping of individual name -> sequence of per-site alleles.
            It is NOT modified.
        prob: Fraction of sites to introgress per target individual.
        source: Names of donor individuals (default: all individuals).
        target: Names of recipient individuals (default: all individuals).

    Returns:
        A new dict holding independent list copies of every sequence, with the
        introgressed alleles applied to the target individuals.
    """
    if source is None:
        source = list(dat)
    if target is None:
        target = list(dat)

    # Copy every sequence: the original aliased the input lists, so each call
    # silently modified the caller's data and introgression accumulated
    # across successive calls.
    new_dat = {}
    aln_len = 0
    for individual, seq in dat.items():
        new_dat[individual] = list(seq)
        aln_len = len(seq)  # sequences are assumed to share one length

    num = int(aln_len * prob)

    for target_individual in target:
        snp_indices = np.random.choice(aln_len, size=num, replace=False)
        for index in snp_indices:
            source_ind = np.random.choice(source, size=1)[0]
            # Donor alleles come from the untouched input, so a donor that is
            # also a target never passes on an already-introgressed allele.
            new_dat[target_individual][index] = dat[source_ind][index]
    return new_dat
|
|
283
|
-
|
|
284
|
-
def run_iqtree(aln,
               iqtree_path="iqtree",
               keep_all=False,
               keep_report=False,
               outgroup=None,
               rates=False,
               procs=4):
    """Infer a tree for an alignment with IQ-TREE and tidy up its outputs.

    Args:
        aln: Path of the alignment file; IQ-TREE writes its outputs next to it
            using this path as prefix.
        iqtree_path: IQ-TREE executable to invoke.
        keep_all: When True, keep every file IQ-TREE produced.
        keep_report: When False (and ``keep_all`` is False) also delete the
            ``.iqtree`` report.
        outgroup: Optional taxon name passed via ``-o``.
        rates: When True, request site-specific rates (``--mlrate``).
        procs: Thread count passed via ``-T``.

    Returns:
        The path of the expected tree file, ``aln + ".treefile"``.

    Raises:
        RuntimeError: If IQ-TREE exits with a non-zero status. The
            intermediate files are left in place in that case so the log can
            be inspected.
    """
    aln = str(aln)
    cmd = [iqtree_path, "-s", aln, "-m", "GTR+I*G4", "-redo", "-T", str(procs)]
    if outgroup is not None:
        cmd += ["-o", str(outgroup)]
    if rates:
        # NOTE(review): --mlrate is IQ-TREE 2 syntax; the original code's
        # commented-out alternative for IQ-TREE 1.6.x was -wsr -- confirm
        # against the version in use.
        cmd.append("--mlrate")

    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        # Previously ignored: a failed run still returned a treefile path and
        # the failure only surfaced later as a missing-file error.
        raise RuntimeError(
            f"IQ-TREE failed on {aln} (exit status {result.returncode}):\n"
            f"{result.stderr}"
        )

    if not keep_all:
        # delete everything except treefile (and, optionally, the report)
        for suffix in (".bionj", ".ckp.gz", ".log", ".mldist", ".uniqueseq.phy"):
            silentremove(aln + suffix)
        if not keep_report:
            silentremove(aln + ".iqtree")
    return aln + ".treefile"
|
|
322
|
-
|
|
323
|
-
def add_locus(d, new):
    """Append the SNPs of a newly sampled locus to every sample, in place.

    Args:
        d: Mapping of sample name -> list of alleles collected so far.
        new: Mapping of sample name -> iterable of alleles for the new locus
            (a list of characters or a plain string). Must contain every key
            of ``d``.

    Returns:
        The same ``d`` object, after extension.

    Raises:
        KeyError: If a sample in ``d`` is missing from ``new``.
    """
    for sample, alleles in d.items():
        alleles.extend(new[sample])
    return d
|
|
328
|
-
|
|
329
|
-
def write_fasta(seqs, fas):
    """Write sequences to a FASTA file, one unwrapped line per sequence.

    Args:
        seqs: Mapping of sample name -> sequence (string or list of
            single-character strings).
        fas: Output file path; overwritten if it exists.
    """
    # The with-block closes the handle; no explicit close() is needed.
    with open(fas, "w") as fh:
        fh.writelines(
            ">" + str(name) + "\n" + "".join(seq) + "\n"
            for name, seq in seqs.items()
        )
|
|
338
|
-
|
|
339
|
-
def write_phylip(seqs, phy):
    """Write sequences to a tab-separated, sequential PHYLIP file.

    The first line is ``<n_samples>\\t<n_sites>``; every following line is
    ``<name>\\t<sequence>``.

    Args:
        seqs: Mapping of sample name -> sequence (string or list of
            single-character strings). All sequences must share one length.
        phy: Output file path; overwritten if it exists.

    Raises:
        ValueError: If the sequences differ in length. Nothing is written in
            that case.
    """
    lengths = {len(seq) for seq in seqs.values()}
    if len(lengths) > 1:
        raise ValueError(
            "Error writing file " + str(phy) + " - sequences not equal length\n"
        )
    # An empty mapping gets a 0 site count rather than the literal "None".
    snps = lengths.pop() if lengths else 0

    with open(phy, "w") as fh:
        fh.write(str(len(seqs)) + "\t" + str(snps) + "\n")
        fh.writelines(
            str(name) + "\t" + "".join(seq) + "\n" for name, seq in seqs.items()
        )
|
|
357
|
-
|
|
358
|
-
def read_phylip(phy):
    """Read a sequential PHYLIP file into a dict of name -> sequence string.

    Blank lines are ignored. The first non-blank line is treated as the header
    and skipped without being validated. On every other line the first
    whitespace-delimited field is the sample name and the remaining fields are
    concatenated into the sequence, so sequences written in space-separated
    blocks are read in full.

    Args:
        phy: Path of the PHYLIP file.

    Returns:
        Dict mapping sample name to its sequence string.

    Raises:
        ValueError: If a data line holds a name but no sequence.
    """
    data = {}
    header_pending = True
    with open(phy, "r") as fin:
        for lineno, raw in enumerate(fin, start=1):
            line = raw.strip()
            if not line:  # If blank line.
                continue
            if header_pending:
                header_pending = False
                continue
            fields = line.split()
            if len(fields) < 2:
                raise ValueError(
                    "Malformed PHYLIP line " + str(lineno) + " in " + str(phy)
                    + ": expected '<name> <sequence>'"
                )
            data[fields[0]] = "".join(fields[1:])
    return data
|
|
376
|
-
|
|
377
|
-
def read_fasta(fasta):
    """Read a FASTA file into a dict of name -> sequence string.

    Blank lines are ignored and sequences wrapped over several lines are
    concatenated. The record name is everything after ``>`` on the header
    line. Sequence text appearing before the first header is discarded.

    Args:
        fasta: Path of the FASTA file.

    Returns:
        Dict mapping record name to its sequence; empty for a file with no
        records.
    """
    data = {}
    sample = None
    chunks = []  # pieces of the current record, joined once per record
    with open(fasta, "r") as fin:
        for raw in fin:
            line = raw.strip()
            if not line:  # If blank line.
                continue
            if line[0] == ">":
                # Test against None, not truthiness, so a record with an
                # empty name is not silently dropped.
                if sample is not None:
                    data[sample] = "".join(chunks)
                sample = line[1:]
                chunks = []
            else:
                chunks.append(line)
    # Flush the last record; skip when the file had no header at all so an
    # empty file does not produce a bogus {None: ""} entry.
    if sample is not None:
        data[sample] = "".join(chunks)
    return data
|
|
397
|
-
|
|
398
|
-
def sample_snp(aln_dict, aln_len, snps_per_locus=1):
    """Sample variable sites (SNPs) from an alignment.

    A site counts as variable when any sample differs from the first sample
    at that position.

    Args:
        aln_dict: Mapping of sample name -> aligned sequence.
        aln_len: Number of alignment columns to scan.
        snps_per_locus: Number of variable sites to draw without replacement.
            Capped at the number of variable sites actually present.

    Returns:
        None when no site is variable. Otherwise a dict keyed by sample name
        whose values are:
          * a single character, when ``aln_len == 1`` (the only column is
            returned without a variability check) or when exactly one
            variable site exists;
          * a list of characters, one per sampled site, otherwise.
    """
    if aln_len == 1:
        return {sample: seq[0] for sample, seq in aln_dict.items()}

    seqs = list(aln_dict.values())
    reference = seqs[0] if seqs else ""
    snp_indices = [
        i for i in range(aln_len)
        if any(seq[i] != reference[i] for seq in seqs[1:])
    ]

    if not snp_indices:
        return None
    if len(snp_indices) == 1:
        only = snp_indices[0]
        return {sample: seq[only] for sample, seq in aln_dict.items()}

    # Asking np.random.choice for more sites than exist (replace=False)
    # raises ValueError, so never request more than are available.
    n_draw = min(snps_per_locus, len(snp_indices))
    sampled_indices = np.random.choice(snp_indices, size=n_draw, replace=False)
    return {
        sample: [seq[i] for i in sampled_indices]
        for sample, seq in aln_dict.items()
    }
|
|
429
|
-
|
|
430
|
-
def sample_locus(tree, model, gene_len=1000, num_snps=1, out="out.fasta"):
    """Simulate one locus along a tree with pyvolve and write it as FASTA.

    Args:
        tree: Tree object handed to ``pyvolve.Evolver``.
        model: Model handed to ``pyvolve.Partition``.
        gene_len: Number of sites to simulate.
        num_snps: Unused; kept so existing positional calls keep working.
        out: Path of the FASTA file to write.

    Returns:
        True when the simulation ran, False when it raised. The reason for a
        failure is printed to stderr.
    """
    try:
        partition = pyvolve.Partition(models=model, size=gene_len)
        evolver = pyvolve.Evolver(partitions=partition, tree=tree)
        evolver(seqfile=out,
                seqfmt="fasta",
                ratefile=False,
                infofile=False)
    except Exception as err:
        # Best effort by design (the caller skips failed loci), but report
        # why instead of hiding the failure completely.
        print(f"WARNING: simulation for {out} failed: {err}", file=sys.stderr)
        return False
    return True
|
|
441
|
-
|
|
442
|
-
def silentremove(filename):
    """Delete a file, ignoring the case where it does not exist.

    Any other failure (permissions, path is a directory, ...) still raises
    the corresponding OSError.

    Args:
        filename: Path of the file to remove.
    """
    try:
        os.remove(filename)
    except FileNotFoundError:
        # FileNotFoundError is the OSError subclass raised for errno.ENOENT,
        # so this matches the former explicit errno comparison.
        pass
|
|
448
|
-
|
|
449
|
-
def get_tree_tips(tree):
    """Split a newick string into its non-empty labels.

    The string is split on spaces, commas, parentheses and semicolons and
    empty fragments are dropped. Anything attached to a label (for example a
    ``:branch_length`` suffix) is left in place.

    Args:
        tree: Newick string.

    Returns:
        List of label strings in order of appearance.
    """
    # Raw string: inside a character class the parentheses need no escaping,
    # and the former "\(" in a normal literal was an invalid escape sequence.
    tips = re.split(r"[ ,();]", tree)
    return [tip for tip in tips if tip]
|
|
452
|
-
|
|
453
|
-
def basic_tree_plot(tree, out="out.pdf"):
    """Draw a tree with small, aligned tip labels and render it as a PDF.

    Args:
        tree: Object exposing ``draw(**style)`` that returns a
            ``(canvas, axes, mark)`` triple (a toytree tree in this script).
        out: Path of the PDF file to write.
    """
    edge_style = {
        "stroke": toytree.colors[0],
        "stroke-width": 1,
    }
    drawing_options = dict(
        edge_type="p",
        edge_style=edge_style,
        tip_labels_align=True,
        tip_labels_style={"font-size": "5px"},
        node_labels=False,
        tip_labels=True,
    )

    # Only the canvas is needed for rendering; the other two are discarded.
    canvas, _axes, _mark = tree.draw(width=400, height=600, **drawing_options)

    toyplot.pdf.render(canvas, out)
|
|
473
|
-
|
|
474
|
-
if __name__ == "__main__":
|
|
475
|
-
main()
|
test/__init__.py
DELETED
|
File without changes
|
test/pg_sui_simtest.py
DELETED
|
@@ -1,215 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python
|
|
2
|
-
|
|
3
|
-
# Standard library imports
|
|
4
|
-
import argparse
|
|
5
|
-
import sys
|
|
6
|
-
|
|
7
|
-
import numpy as np
|
|
8
|
-
import pandas as pd
|
|
9
|
-
import scipy.stats as stats
|
|
10
|
-
from sklearn_genetic.space import Continuous, Categorical, Integer
|
|
11
|
-
|
|
12
|
-
from utils.misc import get_processor_name
|
|
13
|
-
from utils.misc import generate_012_genotypes
|
|
14
|
-
|
|
15
|
-
# Custom module imports
|
|
16
|
-
from snpio import GenotypeData
|
|
17
|
-
from read_input.simgenodata import SimGenotypeData
|
|
18
|
-
from impute.estimators import *
|
|
19
|
-
from impute.simple_imputers import *
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
def main():
    """Load a genotype file, simulate missing data and score several imputers.

    Reads the command line via ``get_arguments``, builds a ``GenotypeData``
    object from either a STRUCTURE or a PHYLIP file, masks 10% of the
    genotypes at random with ``SimGenotypeData`` and prints the accuracy
    reported by ``sim.accuracy`` for each imputer in turn.

    Raises:
        TypeError: If the population information required for the chosen
            file type is missing.
        SystemExit: If both or neither of --str / --phylip are supplied.
    """

    args = get_arguments()

    if args.str and args.phylip:
        sys.exit("Error: Only one file type can be specified")

    # Without this guard `data` was never bound and the script died later
    # with an unrelated NameError.
    if not args.str and not args.phylip:
        sys.exit("Error: An input file must be specified with --str or --phylip")

    if args.str:
        # STRUCTURE file was specified.
        if not args.pop_ids and args.popmap is None:
            raise TypeError("Either --pop_ids or --popmap must be specified\n")

        if args.pop_ids:
            print("\n--pop_ids was specified as column 2\n")
        else:
            print(
                "\n--pop_ids was not specified; "
                "using popmap file to get population IDs\n"
            )

        if args.onerow_perind:
            print("\nUsing one row per individual...\n")
            filetype = "structure1row"
        else:
            print("\nUsing two rows per individual...\n")
            filetype = "structure2row"

        data = GenotypeData(
            filename=args.str,
            filetype=filetype,
            popmapfile=args.popmap,
            guidetree=args.treefile,
            qmatrix_iqtree=args.iqtree,
        )
    else:
        # PHYLIP file was specified.
        if args.pop_ids or args.onerow_perind:
            print(
                "\nPhylip file was used with structure arguments; ignoring "
                "structure file arguments\n"
            )

        if args.popmap is None:
            raise TypeError("No popmap file supplied with PHYLIP file\n")

        data = GenotypeData(
            filename=args.phylip,
            filetype="phylip",
            popmapfile=args.popmap,
            guidetree=args.treefile,
            qmatrix_iqtree=args.iqtree,
            siterates_iqtree=args.rates,
        )

    sim = SimGenotypeData(data, prop_missing=0.1, strategy="random")

    # Imputers to benchmark, in run order, with their extra keyword arguments.
    imputers = (
        (ImputeNMF, {}),
        (ImputePhylo, {"save_plots": False}),
        (ImputeNLPCA, {"initial_strategy": "populations", "cv": 5}),
        (ImputeUBP, {"initial_strategy": "populations"}),
    )
    for imputer_cls, extra_kwargs in imputers:
        imputer = imputer_cls(genotype_data=sim, **extra_kwargs)
        accuracy = sim.accuracy(imputer)
        print("Accuracy:", accuracy)
|
|
106
|
-
|
|
107
|
-
# vae = ImputeVAE(
|
|
108
|
-
# genotype_data=sim,
|
|
109
|
-
# initial_strategy="populations"
|
|
110
|
-
# )
|
|
111
|
-
# accuracy = sim.accuracy(vae)
|
|
112
|
-
# print("Accuracy:",accuracy)
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
def get_arguments(argv=None):
    """Parse command-line arguments with argparse.

    Args:
        argv: Optional list of argument strings (without the program name).
            Defaults to ``sys.argv[1:]``, so calling ``get_arguments()``
            behaves as before.

    Returns:
        argparse.Namespace: Parsed arguments, accessed as attributes
        (``str``, ``phylip``, ``treefile``, ``onerow_perind``, ``pop_ids``,
        ``popmap``, ``iqtree``, ``rates``, ``prefix``).

    Raises:
        SystemExit: If no arguments are given (the help menu is printed to
            stderr first) or if argparse rejects the arguments.
    """
    if argv is None:
        argv = sys.argv[1:]

    parser = argparse.ArgumentParser(
        description="Simulate missing data on GenotypeData object",
        add_help=False,
    )

    filetype_args = parser.add_argument_group(
        "File type arguments (choose only one)"
    )
    structure_args = parser.add_argument_group("Structure file arguments")
    optional_args = parser.add_argument_group("Optional arguments")

    # File Type arguments
    filetype_args.add_argument(
        "-s", "--str", type=str, required=False, help="Input structure file"
    )
    filetype_args.add_argument(
        "-p", "--phylip", type=str, required=False, help="Input phylip file"
    )
    filetype_args.add_argument(
        "-t",
        "--treefile",
        type=str,
        required=False,
        default=None,
        help="Newick-formatted treefile",
    )

    # Structure Arguments
    structure_args.add_argument(
        "--onerow_perind",
        default=False,
        action="store_true",
        help="Toggles on one row per individual option in structure file",
    )
    structure_args.add_argument(
        "--pop_ids",
        default=False,
        required=False,
        action="store_true",
        help="Toggles on population ID column (2nd col) in structure file",
    )

    # Optional Arguments
    optional_args.add_argument(
        "-m",
        "--popmap",
        type=str,
        required=False,
        default=None,
        help="Two-column tab-separated population map file: inds\tpops. No header line",
    )
    optional_args.add_argument(
        "-i",
        "--iqtree",
        type=str,
        required=False,
        help=".iqtree output file containing Rate Matrix Q",
    )
    optional_args.add_argument(
        "-r",
        "--rates",
        type=str,
        required=False,
        help="IQ-TREE site-rates output file",
    )
    optional_args.add_argument(
        "--prefix",
        type=str,
        required=False,
        default="output",
        help="Prefix for output files",
    )

    # Add help menu (the automatic one is disabled via add_help=False so it
    # can be listed under the optional-arguments group).
    optional_args.add_argument(
        "-h", "--help", action="help", help="Displays this help menu"
    )

    # If no command-line arguments are called then exit and call help menu.
    if not argv:
        print("\nExiting because no command-line options were called.\n")
        parser.print_help(sys.stderr)
        sys.exit(1)

    return parser.parse_args(argv)
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
if __name__ == "__main__":
|
|
215
|
-
main()
|