treesak-1.53.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- TreeSAK/ALE.py +63 -0
- TreeSAK/ALE1.py +268 -0
- TreeSAK/ALE2.py +168 -0
- TreeSAK/ALE2RTC.py +30 -0
- TreeSAK/ALE3.py +205 -0
- TreeSAK/ALE4.py +636 -0
- TreeSAK/ALE5.py +210 -0
- TreeSAK/ALE6.py +401 -0
- TreeSAK/ALE7.py +126 -0
- TreeSAK/ALE_backup.py +1081 -0
- TreeSAK/AssessCVG.py +128 -0
- TreeSAK/AssessMarker.py +306 -0
- TreeSAK/AssessMarkerDeltaLL.py +257 -0
- TreeSAK/AssessMarkerPA.py +317 -0
- TreeSAK/AssessPB.py +113 -0
- TreeSAK/BMGE.jar +0 -0
- TreeSAK/BMGE.py +49 -0
- TreeSAK/C60SR4.nex +127 -0
- TreeSAK/CompareMCMC.py +138 -0
- TreeSAK/ConcateMSA.py +111 -0
- TreeSAK/ConvertMSA.py +135 -0
- TreeSAK/Dir.rb +82 -0
- TreeSAK/ExtractMarkerSeq.py +263 -0
- TreeSAK/FastRoot.py +1175 -0
- TreeSAK/FastRoot_backup.py +1122 -0
- TreeSAK/FigTree.py +34 -0
- TreeSAK/GTDB_tree.py +76 -0
- TreeSAK/GeneTree.py +142 -0
- TreeSAK/KEGG_Luo17.py +807 -0
- TreeSAK/LcaToLeaves.py +66 -0
- TreeSAK/MarkerRef2Tree.py +616 -0
- TreeSAK/MarkerRef2Tree_backup.py +628 -0
- TreeSAK/MarkerSeq2Tree.py +299 -0
- TreeSAK/MarkerSeq2Tree_backup.py +259 -0
- TreeSAK/ModifyTopo.py +116 -0
- TreeSAK/Newick_tree_plotter.py +79 -0
- TreeSAK/OMA.py +170 -0
- TreeSAK/OMA2.py +212 -0
- TreeSAK/OneLineAln.py +50 -0
- TreeSAK/PB.py +155 -0
- TreeSAK/PMSF.py +115 -0
- TreeSAK/PhyloBiAssoc.R +84 -0
- TreeSAK/PhyloBiAssoc.py +167 -0
- TreeSAK/PlotMCMC.py +41 -0
- TreeSAK/PlotMcmcNode.py +152 -0
- TreeSAK/PlotMcmcNode_old.py +252 -0
- TreeSAK/RootTree.py +101 -0
- TreeSAK/RootTreeGTDB.py +371 -0
- TreeSAK/RootTreeGTDB214.py +288 -0
- TreeSAK/RootTreeGTDB220.py +300 -0
- TreeSAK/SequentialDating.py +16 -0
- TreeSAK/SingleAleHGT.py +157 -0
- TreeSAK/SingleLinePhy.py +50 -0
- TreeSAK/SliceMSA.py +142 -0
- TreeSAK/SplitScore.py +21 -0
- TreeSAK/SplitScore1.py +177 -0
- TreeSAK/SplitScore1OMA.py +148 -0
- TreeSAK/SplitScore2.py +608 -0
- TreeSAK/TaxaCountStats.R +256 -0
- TreeSAK/TaxonTree.py +47 -0
- TreeSAK/TreeSAK_config.py +32 -0
- TreeSAK/VERSION +164 -0
- TreeSAK/VisHPD95.R +45 -0
- TreeSAK/VisHPD95.py +200 -0
- TreeSAK/__init__.py +0 -0
- TreeSAK/ale_parser.py +74 -0
- TreeSAK/ale_splitter.py +63 -0
- TreeSAK/alignment_pruner.pl +1471 -0
- TreeSAK/assessOG.py +45 -0
- TreeSAK/batch_itol.py +171 -0
- TreeSAK/catfasta2phy.py +140 -0
- TreeSAK/cogTree.py +185 -0
- TreeSAK/compare_trees.R +30 -0
- TreeSAK/compare_trees.py +255 -0
- TreeSAK/dating.py +264 -0
- TreeSAK/dating_ss.py +361 -0
- TreeSAK/deltall.py +82 -0
- TreeSAK/do_rrtc.rb +464 -0
- TreeSAK/fa2phy.py +42 -0
- TreeSAK/filter_rename_ar53.py +118 -0
- TreeSAK/format_leaf_name.py +70 -0
- TreeSAK/gap_stats.py +38 -0
- TreeSAK/get_SCG_tree.py +742 -0
- TreeSAK/get_arCOG_seq.py +97 -0
- TreeSAK/global_functions.py +222 -0
- TreeSAK/gnm_leaves.py +43 -0
- TreeSAK/iTOL.py +791 -0
- TreeSAK/iTOL_gene_tree.py +80 -0
- TreeSAK/itol_msa_stats.py +56 -0
- TreeSAK/keep_highest_rrtc.py +37 -0
- TreeSAK/koTree.py +194 -0
- TreeSAK/label_gene_tree_by_gnm.py +34 -0
- TreeSAK/label_tree.R +75 -0
- TreeSAK/label_tree.py +121 -0
- TreeSAK/mad.py +708 -0
- TreeSAK/mcmc2tree.py +58 -0
- TreeSAK/mcmcTC copy.py +92 -0
- TreeSAK/mcmcTC.py +104 -0
- TreeSAK/mcmctree_vs_reltime.R +44 -0
- TreeSAK/mcmctree_vs_reltime.py +252 -0
- TreeSAK/merge_pdf.py +32 -0
- TreeSAK/pRTC.py +56 -0
- TreeSAK/parse_mcmctree.py +198 -0
- TreeSAK/parse_reltime.py +141 -0
- TreeSAK/phy2fa.py +37 -0
- TreeSAK/plot_distruibution_th.py +165 -0
- TreeSAK/prep_mcmctree_ctl.py +92 -0
- TreeSAK/print_leaves.py +32 -0
- TreeSAK/pruneMSA.py +63 -0
- TreeSAK/recode.py +73 -0
- TreeSAK/remove_bias.R +112 -0
- TreeSAK/rename_leaves.py +78 -0
- TreeSAK/replace_clade.py +55 -0
- TreeSAK/root_with_out_group.py +84 -0
- TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
- TreeSAK/subsample_drep_gnms.py +74 -0
- TreeSAK/subset.py +69 -0
- TreeSAK/subset_tree_stupid_old_way.py +193 -0
- TreeSAK/supertree.py +330 -0
- TreeSAK/tmp_1.py +19 -0
- TreeSAK/tmp_2.py +19 -0
- TreeSAK/tmp_3.py +120 -0
- TreeSAK/tmp_4.py +43 -0
- TreeSAK/tmp_5.py +12 -0
- TreeSAK/weighted_rand.rb +23 -0
- treesak-1.53.3.data/scripts/TreeSAK +955 -0
- treesak-1.53.3.dist-info/LICENSE +674 -0
- treesak-1.53.3.dist-info/METADATA +27 -0
- treesak-1.53.3.dist-info/RECORD +131 -0
- treesak-1.53.3.dist-info/WHEEL +5 -0
- treesak-1.53.3.dist-info/top_level.txt +1 -0

TreeSAK/RootTreeGTDB220.py
ADDED
@@ -0,0 +1,300 @@
import random
import dendropy
import argparse
from ete3 import Tree


RootTreeGTDB220_usage = '''
========================================== RootTreeGTDB220 example command ==========================================

TreeSAK RootTreeGTDB220 -add_root -d ar -tree ar53.tree -tax ar53.summary.tsv -db db_dir -o ar53.rooted.tree
TreeSAK RootTreeGTDB220 -add_root -d bac -tree bac120.tree -tax bac120.summary.tsv -db db_dir -o bac120.rooted.tree

# Need to download and decompress the following files to your database folder (provide with -db)
https://data.ace.uq.edu.au/public/gtdb/data/releases/release220/220.0/ar53_r220.tree.tar.gz
https://data.ace.uq.edu.au/public/gtdb/data/releases/release220/220.0/bac120_r220.tree.tar.gz
https://data.ace.uq.edu.au/public/gtdb/data/releases/release220/220.0/ar53_metadata_r220.tsv.gz
https://data.ace.uq.edu.au/public/gtdb/data/releases/release220/220.0/bac120_metadata_r220.tsv.gz

=====================================================================================================================
'''


def get_smallest_outgroup(tree_object):

    min_outgroup_leaf_num = 99999
    for each_root_child in tree_object.children:
        leaf_list = each_root_child.get_leaf_names()
        if len(leaf_list) < min_outgroup_leaf_num:
            min_outgroup_leaf_num = len(leaf_list)

    out_group_leaf_list = []
    for each_root_child in tree_object.children:
        leaf_list = each_root_child.get_leaf_names()
        if len(leaf_list) == min_outgroup_leaf_num:
            out_group_leaf_list = leaf_list

    return out_group_leaf_list


def sep_taxon_str(taxon_string):

    taxon_string_split = taxon_string.strip().split(';')
    taxon_p = taxon_string_split[1]
    taxon_c = taxon_string_split[2]
    taxon_o = taxon_string_split[3]
    taxon_f = taxon_string_split[4]
    taxon_g = taxon_string_split[5]

    return taxon_p, taxon_c, taxon_o, taxon_f, taxon_g


def subset_and_rename_tree(tree_file_in, to_keep_leaf_list, rename_dict):

    input_tree = Tree(tree_file_in, quoted_node_names=True, format=1)

    # subset tree
    subset_tree = input_tree.copy()
    subset_tree.prune(to_keep_leaf_list, preserve_branch_length=True)

    # rename leaf
    for each_leaf in subset_tree:
        leaf_name_new = rename_dict.get(each_leaf.name, each_leaf.name)
        each_leaf.name = leaf_name_new

    return subset_tree


def root_with_outgroup(input_tree, out_group_list, add_root_branch, tree_file_rooted):

    """
    Reroot the tree using the given outgroup.
    modified based on: https://github.com/Ecogenomics/GTDBTk/blob/master/gtdbtk/reroot_tree.py

    input_tree: File containing Newick tree to rerooted.
    output_tree: Name of file for rerooted tree.
    outgroup: Labels of taxa in outgroup.
    """

    tree = dendropy.Tree.get_from_path(input_tree, schema='newick', rooting='force-rooted', preserve_underscores=True)

    outgroup_in_tree = set()
    ingroup_leaves = set()
    for n in tree.leaf_node_iter():
        if n.taxon.label in out_group_list:
            outgroup_in_tree.add(n.taxon)
        else:
            ingroup_leaves.add(n)

    # Since finding the MRCA is a rooted tree operation, the tree is first rerooted on an ingroup taxa. This
    # ensures the MRCA of the outgroup can be identified so long as the outgroup is monophyletic. If the
    # outgroup is polyphyletic trying to root on it is ill-defined. To try and pick a "good" root for
    # polyphyletic outgroups, random ingroup taxa are selected until two of them give the same size
    # lineage. This will, likely, be the smallest bipartition possible for the given outgroup though
    # this is not guaranteed.

    mrca = tree.mrca(taxa=outgroup_in_tree)
    mrca_leaves = len(mrca.leaf_nodes())
    while True:
        rnd_ingroup = random.sample(list(ingroup_leaves), 1)[0]
        tree.reroot_at_edge(rnd_ingroup.edge, length1=0.5 * rnd_ingroup.edge_length, length2=0.5 * rnd_ingroup.edge_length)
        mrca = tree.mrca(taxa=outgroup_in_tree)
        if len(mrca.leaf_nodes()) == mrca_leaves:
            break
        mrca_leaves = len(mrca.leaf_nodes())

    if mrca.edge_length is not None:
        tree.reroot_at_edge(mrca.edge, length1=0.5 * mrca.edge_length, length2=0.5 * mrca.edge_length)

    # tree.write_to_path(tree_file_rooted, schema='newick', suppress_rooting=True, unquoted_underscores=True)
    tree_out_string = tree.as_string(schema='newick', suppress_rooting=True, unquoted_underscores=True)
    tree_out_string = tree_out_string.replace("'", "")

    # add the root bar
    if add_root_branch is True:
        tree_out_string = '(' + tree_out_string
        tree_out_string = tree_out_string.replace(');', '):0.02);')

    # write out tree string
    tree_file_rooted_handle = open(tree_file_rooted, 'w')
    tree_file_rooted_handle.write(tree_out_string)
    tree_file_rooted_handle.close()


def RootTreeGTDB220(args):

    input_unrooted_tree = args['tree']
    user_gnm_taxon = args['tax']
    db_dir = args['db']
    gnm_domain = args['d']
    add_root_branch = args['add_root']
    rooted_tree = args['o']

    leaf_list = []
    for leaf in Tree(input_unrooted_tree, format=1):
        leaf_name = leaf.name
        leaf_list.append(leaf_name)

    # define file name
    gtdb_ref_tree_ar = '%s/ar53_r220.tree' % db_dir
    gtdb_ref_tree_bac = '%s/bac120_r220.tree' % db_dir
    gtdb_gnm_meta_ar = '%s/ar53_metadata_r220.tsv' % db_dir
    gtdb_gnm_meta_bac = '%s/bac120_metadata_r220.tsv' % db_dir

    if gnm_domain == 'bac':
        gtdb_ref_tree = gtdb_ref_tree_bac
        gtdb_gnm_metadata = gtdb_gnm_meta_bac
    elif gnm_domain == 'ar':
        gtdb_ref_tree = gtdb_ref_tree_ar
        gtdb_gnm_metadata = gtdb_gnm_meta_ar
    else:
        print('please provide either "ar" or "bac" to -d')
        exit()

    tree = Tree(gtdb_ref_tree, quoted_node_names=True, format=1)
    ref_tree_gnm_list = tree.get_leaf_names()
    ref_tree_gnm_set = {i for i in ref_tree_gnm_list}

    # read in user_gnm_taxon
    user_gnm_taxon_dict_p = dict()
    user_gnm_taxon_dict_c = dict()
    user_gnm_taxon_dict_o = dict()
    user_gnm_taxon_dict_f = dict()
    user_gnm_taxon_dict_g = dict()
    for each_gnm in open(user_gnm_taxon):
        if not each_gnm.startswith('user_genome\t'):
            each_gnm_split = each_gnm.strip().split('\t')
            gnm_id = each_gnm_split[0]
            gnm_taxon = each_gnm_split[1]

            if gnm_id in leaf_list:
                count_current_gnm = False
                if gnm_domain == 'bac':
                    if 'd__Bacteria' in gnm_taxon:
                        count_current_gnm = True
                elif gnm_domain == 'ar':
                    if 'd__Archaea' in gnm_taxon:
                        count_current_gnm = True

                if count_current_gnm is True:
                    gnm_p, gnm_c, gnm_o, gnm_f, gnm_g = sep_taxon_str(gnm_taxon)

                    if gnm_p not in user_gnm_taxon_dict_p:
                        user_gnm_taxon_dict_p[gnm_p] = set()
                    if gnm_c not in user_gnm_taxon_dict_c:
                        user_gnm_taxon_dict_c[gnm_c] = set()
                    if gnm_o not in user_gnm_taxon_dict_o:
                        user_gnm_taxon_dict_o[gnm_o] = set()
                    if gnm_f not in user_gnm_taxon_dict_f:
                        user_gnm_taxon_dict_f[gnm_f] = set()
                    if gnm_g not in user_gnm_taxon_dict_g:
                        user_gnm_taxon_dict_g[gnm_g] = set()

                    user_gnm_taxon_dict_p[gnm_p].add(gnm_id)
                    user_gnm_taxon_dict_c[gnm_c].add(gnm_id)
                    user_gnm_taxon_dict_o[gnm_o].add(gnm_id)
                    user_gnm_taxon_dict_f[gnm_f].add(gnm_id)
                    user_gnm_taxon_dict_g[gnm_g].add(gnm_id)

    # determine rooting rank, start from phylum
    rooting_rank = ''
    rooting_rank_taxon_dict = dict()
    if len(user_gnm_taxon_dict_p) > 1:
        rooting_rank = 'p'
        rooting_rank_taxon_dict = user_gnm_taxon_dict_p
    elif len(user_gnm_taxon_dict_c) > 1:
        rooting_rank = 'c'
        rooting_rank_taxon_dict = user_gnm_taxon_dict_c
    elif len(user_gnm_taxon_dict_o) > 1:
        rooting_rank = 'o'
        rooting_rank_taxon_dict = user_gnm_taxon_dict_o
    elif len(user_gnm_taxon_dict_f) > 1:
        rooting_rank = 'f'
        rooting_rank_taxon_dict = user_gnm_taxon_dict_f
    elif len(user_gnm_taxon_dict_g) > 1:
        rooting_rank = 'g'
        rooting_rank_taxon_dict = user_gnm_taxon_dict_g

    if rooting_rank == '':
        print('All user genomes are from the same genus, program exited!')
        exit()

    col_index = {}
    canditate_gnms_rooting_rank = dict()
    counted_taxons_rooting_rank = set()
    for each_ref in open(gtdb_gnm_metadata):
        each_ref_split = each_ref.strip().split('\t')
        if each_ref.startswith('accession\tambiguous_bases'):
            col_index = {key: i for i, key in enumerate(each_ref_split)}
        else:
            ref_accession = each_ref_split[0]
            gtdb_taxonomy = each_ref_split[col_index['gtdb_taxonomy']]
            if ref_accession in ref_tree_gnm_set:
                gnm_p, gnm_c, gnm_o, gnm_f, gnm_g = sep_taxon_str(gtdb_taxonomy)

                gnm_rooting_rank = ''
                if rooting_rank == 'p':
                    gnm_rooting_rank = gnm_p
                elif rooting_rank == 'c':
                    gnm_rooting_rank = gnm_c
                elif rooting_rank == 'o':
                    gnm_rooting_rank = gnm_o
                elif rooting_rank == 'f':
                    gnm_rooting_rank = gnm_f
                elif rooting_rank == 'g':
                    gnm_rooting_rank = gnm_g

                # rooting_rank
                if gnm_rooting_rank in rooting_rank_taxon_dict:
                    if gnm_rooting_rank not in counted_taxons_rooting_rank:
                        counted_taxons_rooting_rank.add(gnm_rooting_rank)
                        canditate_gnms_rooting_rank[ref_accession] = gnm_rooting_rank

    ref_tree_rooting_rank = subset_and_rename_tree(gtdb_ref_tree, canditate_gnms_rooting_rank, canditate_gnms_rooting_rank)

    # get the smallest out group taxon set
    smallest_outgroup_taxon_list = get_smallest_outgroup(ref_tree_rooting_rank)

    user_gnm_taxon_dict_rooting_rank = dict()
    if rooting_rank == 'p':
        user_gnm_taxon_dict_rooting_rank = user_gnm_taxon_dict_p
    elif rooting_rank == 'c':
        user_gnm_taxon_dict_rooting_rank = user_gnm_taxon_dict_c
    elif rooting_rank == 'o':
        user_gnm_taxon_dict_rooting_rank = user_gnm_taxon_dict_o
    elif rooting_rank == 'f':
        user_gnm_taxon_dict_rooting_rank = user_gnm_taxon_dict_f
    elif rooting_rank == 'g':
        user_gnm_taxon_dict_rooting_rank = user_gnm_taxon_dict_g

    # get the smallest out group genome set
    out_group_gnm_set_1 = set()
    out_group_gnm_set_2 = set()
    for each_rooting_rank_taxon in user_gnm_taxon_dict_rooting_rank:
        gnm_member_set = user_gnm_taxon_dict_rooting_rank[each_rooting_rank_taxon]
        if each_rooting_rank_taxon in smallest_outgroup_taxon_list:
            out_group_gnm_set_1.update(gnm_member_set)
        else:
            out_group_gnm_set_2.update(gnm_member_set)

    # select the smaller one as outgroup
    if len(out_group_gnm_set_1) < len(out_group_gnm_set_2):
        out_group_gnm_set = out_group_gnm_set_1
    else:
        out_group_gnm_set = out_group_gnm_set_2

    # root user tree with identified out group genomes
    root_with_outgroup(input_unrooted_tree, out_group_gnm_set, add_root_branch, rooted_tree)


if __name__ == '__main__':

    RootTreeGTDB220_parser = argparse.ArgumentParser(usage=RootTreeGTDB220_usage)
    RootTreeGTDB220_parser.add_argument('-tree', required=True, help='input unrooted tree')
    RootTreeGTDB220_parser.add_argument('-tax', required=False, default='fna', help='leaf taxon')
    RootTreeGTDB220_parser.add_argument('-db', required=True, help='GTDB database files')
    RootTreeGTDB220_parser.add_argument('-d', required=False, default=None, help='domain, either ar or bac')
    RootTreeGTDB220_parser.add_argument('-add_root', required=False, action='store_true', help='add the root branch')
    RootTreeGTDB220_parser.add_argument('-o', required=True, help='output folder')
    args = vars(RootTreeGTDB220_parser.parse_args())
    RootTreeGTDB220(args)

TreeSAK/SequentialDating.py
ADDED
@@ -0,0 +1,16 @@

SequentialDating_usage = '''
======================== SequentialDating example commands ========================

TreeSAK SequentialDating -h

===================================================================================
'''


def SequentialDating():

    pass


SequentialDating()

TreeSAK/SingleAleHGT.py
ADDED
@@ -0,0 +1,157 @@
import os
import argparse


def sep_path_basename_ext(file_in):

    # separate path and file name
    f_path, file_name = os.path.split(file_in)
    if f_path == '':
        f_path = '.'

    # separate file basename and extension
    f_base, f_ext = os.path.splitext(file_name)

    return f_path, f_base, f_ext


SingleAleHGT_usage = '''
============================================ SingleAleHGT example commands ============================================

TreeSAK SingleAleHGT -i concatenated.fasta -s genome.treefile -fc 0.3 -c genome_taxon.txt -color phylum_color.txt -api S1kZZuDHc0d5M7J5vLnUNQ -t 9 -f -o demo_SingleAleHGT_wd

=======================================================================================================================
'''

def SingleAleHGT(args):

    faa_in = args['faa']
    msa_in = args['msa']
    op_dir = args['o']
    genome_tree_file_rooted = args['s']
    API_key = args['api']
    hgt_freq_cutoff = args['fc']
    ar_phylum_color_code_txt = args['color']
    genome_taxon_txt = args['c']
    force_overwrite = args['f']
    trim_msa = args['trim']
    docker_image = args['docker']
    num_threads = args['t']

    ######################################## check input files #######################################

    # if docker_image is True, check if docker is activated
    if (faa_in is not None) and (msa_in is None):
        f_path, f_base, f_ext = sep_path_basename_ext(faa_in)
    elif (faa_in is None) and (msa_in is not None):
        f_path, f_base, f_ext = sep_path_basename_ext(msa_in)
    else:
        print('Please specify either -faa or -msa, program exited!')
        exit()

    ######################################## define file name ########################################

    ale1_op_dir = '%s/ALE1_op_dir' % op_dir
    ale2_op_dir = '%s/ALE2_op_dir' % op_dir
    ale4_op_dir = '%s/ALE4_op_dir' % op_dir
    log_txt = '%s/log.txt' % op_dir
    msa_file = '%s/%s.aln' % (ale1_op_dir, f_base)
    msa_trimmed = '%s/%s_trimmed.aln' % (ale1_op_dir, f_base)
    tree_prefix = '%s/%s' % (ale1_op_dir, f_base)

    ###################################### create output folder ######################################

    if os.path.isdir(op_dir) is True:
        if force_overwrite is True:
            os.system('rm -r %s' % op_dir)
        else:
            print('%s exist, program exited!' % op_dir)
            exit()
    os.mkdir(op_dir)
    os.mkdir(ale1_op_dir)

    ##################################################################################################

    # run mafft-einsi
    if (faa_in is not None) and (msa_in is None):
        mafft_cmd = 'mafft-einsi --thread %s --quiet %s > %s' % (num_threads, faa_in, msa_file)

        with open(log_txt, 'a') as log_txt_handle:
            log_txt_handle.write(mafft_cmd + '\n')
        os.system(mafft_cmd)
        msa_file_for_next_step = msa_file
    else:
        msa_file_for_next_step = msa_in

    # run trimal
    if trim_msa is True:
        trimal_cmd = 'trimal -in %s -out %s -automated1' % (msa_file_for_next_step, msa_trimmed)
        with open(log_txt, 'a') as log_txt_handle:
            log_txt_handle.write(trimal_cmd + '\n')
        os.system(trimal_cmd)
        iqtree2_cmd = 'iqtree2 -m LG+G+I -bb 1000 --wbtl -nt %s -s %s -pre %s' % (num_threads, msa_trimmed, tree_prefix)
        with open(log_txt, 'a') as log_txt_handle:
            log_txt_handle.write(iqtree2_cmd + '\n')
        os.system(iqtree2_cmd)
    else:
        iqtree2_cmd = 'iqtree2 -m LG+G+I -bb 1000 --wbtl -nt %s -s %s -pre %s' % (num_threads, msa_file_for_next_step, tree_prefix)
        with open(log_txt, 'a') as log_txt_handle:
            log_txt_handle.write(iqtree2_cmd + '\n')
        os.system(iqtree2_cmd)

    # run ALE2
    ale2_cmd = 'TreeSAK ALE2 -i %s -s %s -t %s -f -runALE -docker %s -o %s' % (ale1_op_dir, genome_tree_file_rooted, num_threads, docker_image, ale2_op_dir)
    with open(log_txt, 'a') as log_txt_handle:
        log_txt_handle.write(ale2_cmd + '\n')
    os.system(ale2_cmd)

    # run ALE4
    ale4_cmd = 'TreeSAK ALE4 -i1 %s -i2 %s -c %s -color %s -o %s -fc %s -f -api %s' % (ale1_op_dir, ale2_op_dir, genome_taxon_txt, ar_phylum_color_code_txt, ale4_op_dir, hgt_freq_cutoff, API_key)
    with open(log_txt, 'a') as log_txt_handle:
        log_txt_handle.write(ale4_cmd + '\n')
    os.system(ale4_cmd)


if __name__ == '__main__':

    SingleAleHGT_parser = argparse.ArgumentParser()
    SingleAleHGT_parser.add_argument('-faa', required=False, default=None, help='input aa file, e.g., OMA0001.faa')
    SingleAleHGT_parser.add_argument('-msa', required=False, default=None, help='input MSA file, e.g., OMA0001.aln')
    SingleAleHGT_parser.add_argument('-o', required=True, help='output dir, e.g., SingleAleHGT_wd')
    SingleAleHGT_parser.add_argument('-s', required=True, help='rooted species tree')
    SingleAleHGT_parser.add_argument('-c', required=True, help='genome_taxon, GTDB format')
    SingleAleHGT_parser.add_argument('-color', required=True, help='phylum color code')
    SingleAleHGT_parser.add_argument('-fc', required=False, type=float, default=0.5, help='hgt_freq_cutoff, default: 0.5')
    SingleAleHGT_parser.add_argument('-mld', required=False, type=int, default=5, help='donor_node_min_leaf_num, default: 5')
    SingleAleHGT_parser.add_argument('-mlr', required=False, type=int, default=5, help='recipient_node_min_leaf_num, default: 5')
    SingleAleHGT_parser.add_argument('-trim', required=False, action="store_true", help='trim MSA')
    SingleAleHGT_parser.add_argument('-docker', required=False, default=None, help='Docker image, if ALE was installed with Docker, e.g., gregmich/alesuite_new')
    SingleAleHGT_parser.add_argument('-itol', required=False, default='batch_access_tmp', help='iTOL project_name, default: batch_access_tmp')
    SingleAleHGT_parser.add_argument('-api', required=True, help='iTOL API key')
    SingleAleHGT_parser.add_argument('-t', required=False, type=int, default=6, help='number of threads, default: 6')
    SingleAleHGT_parser.add_argument('-f', required=False, action="store_true", help='force overwrite')
    args = vars(SingleAleHGT_parser.parse_args())
    SingleAleHGT(args)


'''

cd /Users/songweizhi/Desktop/DateArTree/01_HGT_ALE_with_OMA/ALE1_op_dir_OMA05484_OMA07484_trimmed
trimal -in ../ALE1_op_dir_OMA05484_OMA07484/concatenated.fasta -out concatenated.fasta -automated1
iqtree2 -m LG+G+I -bb 1000 --wbtl -nt 10 -s concatenated.fasta -pre OMA05484_OMA07484
cd /Users/songweizhi/Desktop/DateArTree/01_HGT_ALE_with_OMA
TreeSAK ALE2 -i ALE1_op_dir_OMA05484_OMA07484_trimmed -s genome_tree.newick -t 10 -f -runALE -docker gregmich/alesuite_new -o ALE2_op_dir_OMA05484_OMA07484_trimmed
TreeSAK ALE4 -i1 ALE1_op_dir_OMA05484_OMA07484_trimmed -i2 ALE2_op_dir_OMA05484_OMA07484_trimmed -c genome_taxon.txt -color phylum_color.txt -o ALE4_op_dir_OMA05484_OMA07484_trimmed_0.01 -fc 0.01 -f -api S1kZZuDHc0d5M7J5vLnUNQ

cd /Users/songweizhi/Desktop/DateArTree/01_HGT_ALE_with_OMA
/usr/local/bin/python3.7 /Users/songweizhi/PycharmProjects/TreeSAK/TreeSAK/SingleAleHGT.py -msa ALE1_op_dir_OMA05484_OMA07484_trimmed/concatenated.fasta -s genome_tree_rooted_noEU.treefile -fc 0.3 -c genome_taxon.txt -color phylum_color.txt -api S1kZZuDHc0d5M7J5vLnUNQ -t 9 -f -o demo_SingleAleHGT_wd -trim

cd /Users/songweizhi/Desktop/DateArTree/01_HGT_ALE_with_OMA/demo_SingleAleHGT_wd
TreeSAK ALE2 -i ALE1_op_dir -s ../genome_tree.newick -t 10 -f -runALE -docker gregmich/alesuite_new -o ALE2_op_dir
TreeSAK ALE4 -i1 ALE1_op_dir_OMA05484_OMA07484_trimmed -i2 ALE2_op_dir_OMA05484_OMA07484_trimmed -c genome_taxon.txt -color phylum_color.txt -o ALE4_op_dir_OMA05484_OMA07484_trimmed_0.01 -fc 0.01 -f -api S1kZZuDHc0d5M7J5vLnUNQ

/usr/local/bin/python3.7 /Users/songweizhi/PycharmProjects/TreeSAK/TreeSAK/SingleAleHGT.py -o demo_SingleAleHGT_wd -msa ALE1_op_dir/OMA15312.aln -s genome_tree_rooted_noEU.treefile -fc 0.3 -c genome_taxon.txt -color phylum_color.txt -api S1kZZuDHc0d5M7J5vLnUNQ -t 10 -f -trim -docker gregmich/alesuite_new
/usr/local/bin/python3.7 /Users/songweizhi/PycharmProjects/TreeSAK/TreeSAK/SingleAleHGT.py -o OMA01402_ALE_HGT_wd -msa ALE1_op_dir/OMA01402.aln -s genome_tree_rooted_noEU.treefile -fc 0.3 -c genome_taxon.txt -color phylum_color.txt -api S1kZZuDHc0d5M7J5vLnUNQ -t 10 -f -trim -docker gregmich/alesuite_new
/usr/local/bin/python3.7 /Users/songweizhi/PycharmProjects/TreeSAK/TreeSAK/SingleAleHGT.py -o OMA01402_ALE_HGT_wd_no_trim -msa ALE1_op_dir/OMA01402.aln -s genome_tree_rooted_noEU.treefile -fc 0.3 -c genome_taxon.txt -color phylum_color.txt -api S1kZZuDHc0d5M7J5vLnUNQ -t 10 -f -docker gregmich/alesuite_new

'''

TreeSAK/SingleLinePhy.py
ADDED
@@ -0,0 +1,50 @@
import os
import argparse
from Bio import AlignIO


SingleLinePhy_usage = '''
======== SingleLinePhy example commands ========

TreeSAK SingleLinePhy -i in.phy -o out.phy

================================================
'''


def SingleLinePhy(args):

    phy_in = args['i']
    phy_out = args['o']

    # check input file
    if os.path.isfile(phy_in) is False:
        print('input file not found, program exited!')
        exit()

    alignment = AlignIO.read(phy_in, 'phylip-relaxed')

    max_seq_id_len = 0
    for each_seq in alignment:
        seq_id_len = len(each_seq.id)
        if seq_id_len > max_seq_id_len:
            max_seq_id_len = seq_id_len

    with open(phy_out, 'w') as msa_out_handle:
        msa_out_handle.write('%s %s\n' % (len(alignment), alignment.get_alignment_length()))
        for each_seq in alignment:
            seq_id = each_seq.id
            seq_id_with_space = '%s%s' % (seq_id, ' ' * (max_seq_id_len + 2 - len(seq_id)))
            msa_out_handle.write('%s%s\n' % (seq_id_with_space, str(each_seq.seq)))

    print('Done!')


if __name__ == '__main__':

    # initialize the options parser
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', required=True, help='input file')
    parser.add_argument('-o', required=True, help='output file')
    args = vars(parser.parse_args())
    SingleLinePhy(args)

TreeSAK/SliceMSA.py
ADDED
@@ -0,0 +1,142 @@
import os
import argparse
from Bio import AlignIO


SliceMSA_usage = '''
========================= SliceMSA example commands =========================

TreeSAK SliceMSA -i 16S_aln.fasta -s 200-300 -o 16S_aln_200-300.fasta
TreeSAK SliceMSA -i 16S_aln.phylip -fi phylip-relaxed -s sections.txt -o SliceMSA_op -fo phylip-relaxed

# example
200-300    select columns 200-300
-100       select columns 1-300
500-       select columns from 500 to the end

# Example of sections.txt (one section per line):
200-300
-100
500-

# Examples of alignment format (https://biopython.org/wiki/AlignIO):
fasta, phylip, phylip-relaxed, phylip-sequential, clustal

=============================================================================
'''


def msa2fasta(msa_object, fasta_out):

    with open(fasta_out, 'w') as fasta_out_handle:
        for each_seq in msa_object:
            fasta_out_handle.write('>%s\n' % each_seq.id)
            fasta_out_handle.write('%s\n' % str(each_seq.seq))


def msa2phylip(msa_object, phylip_out):

    max_seq_id_len = 0
    for each_seq in msa_object:
        seq_id_len = len(each_seq.id)
        if seq_id_len > max_seq_id_len:
            max_seq_id_len = seq_id_len

    with open(phylip_out, 'w') as phylip_out_handle:
        phylip_out_handle.write('%s %s\n' % (len(msa_object), msa_object.get_alignment_length()))
        for each_seq in msa_object:
            seq_id = each_seq.id
            seq_id_with_space = '%s%s' % (seq_id, ' ' * (max_seq_id_len + 2 - len(seq_id)))
            phylip_out_handle.write('%s%s\n' % (seq_id_with_space, str(each_seq.seq)))


def SliceMSA(args):

    msa_in_file = args['i']
    aln_in_format = args['fi']
    col_to_select_txt = args['s']
    op_dir = args['o']
    aln_out_format = args['fo']
    force_overwriting = args['force']

    aln_out_ext = 'fasta'
    if aln_out_format == 'phylip-relaxed':
        aln_out_ext = 'phylip'

    if os.path.isfile(msa_in_file) is False:
        print('Input MSA not found, program exited!')
        exit()

    # read in msa
    msa_in = AlignIO.read(msa_in_file, aln_in_format)

    # parse provided sections
    section_to_select_list = []
    if os.path.isfile(col_to_select_txt) is False:
        col_to_select_txt_split = col_to_select_txt.strip().split('-')
        if col_to_select_txt == '-':
            section_to_select_list.append(['1', str(msa_in.get_alignment_length())])
        elif col_to_select_txt.startswith('-'):
            section_to_select_list.append(['1', col_to_select_txt_split[1]])
        elif col_to_select_txt.endswith('-'):
            section_to_select_list.append([col_to_select_txt_split[0], str(msa_in.get_alignment_length())])
        else:
            section_to_select_list.append(col_to_select_txt_split)
    else:
        for each_section in open(col_to_select_txt):
            each_section = each_section.strip()
            each_section_split = each_section.strip().split('-')
            if each_section == '-':
                section_to_select_list.append(['1', str(msa_in.get_alignment_length())])
            elif each_section.startswith('-'):
                section_to_select_list.append(['1', each_section_split[1]])
            elif each_section.endswith('-'):
                section_to_select_list.append([each_section_split[0], str(msa_in.get_alignment_length())])
            else:
                section_to_select_list.append(each_section_split)

    # check output folder
    if len(section_to_select_list) > 1:
        if os.path.isdir(op_dir) is True:
            if force_overwriting is True:
                os.system('rm -r %s' % op_dir)
            else:
                print('Output folder already exist, program exited!')
                exit()
        os.system('mkdir %s' % op_dir)

    # write out sections
    if len(section_to_select_list) == 1:
        current_section = msa_in[:, (int(section_to_select_list[0][0]) - 1):(int(section_to_select_list[0][1]))]
        if aln_out_ext == 'fasta':
            msa2fasta(current_section, op_dir)
        if aln_out_ext == 'phylip':
            msa2phylip(current_section, op_dir)
    else:
        for each_section in section_to_select_list:

            pwd_op_file = '%s/%s.%s' % (op_dir, '-'.join(each_section), aln_out_ext)
            current_section = msa_in[:, (int(each_section[0])-1):(int(each_section[1]))]

            # write out
            if aln_out_ext == 'fasta':
                msa2fasta(current_section, pwd_op_file)
            if aln_out_ext == 'phylip':
                msa2phylip(current_section, pwd_op_file)

    print('MSA subset(s) exported to %s, Done!' % op_dir)


if __name__ == '__main__':

    # arguments for rename_seq_parser
    SliceMSA_parser = argparse.ArgumentParser()
    SliceMSA_parser.add_argument('-i', required=True, help='input MSA in fasta format')
    SliceMSA_parser.add_argument('-fi', required=False, default='fasta', help='format (NOT file extension) of input MSA, default: fasta')
    SliceMSA_parser.add_argument('-s', required=True, help='columns to export, e.g. 200-300, -100, 50-')
    SliceMSA_parser.add_argument('-o', required=True, help='output file or folder')
    SliceMSA_parser.add_argument('-fo', required=False, default='fasta', help='format of output MSA, select from fasta and phylip-relaxed, default: fasta')
    SliceMSA_parser.add_argument('-force', required=False, action="store_true", help='force overwrite existing output folder')
    args = vars(SliceMSA_parser.parse_args())
    SliceMSA(args)