treesak 1.51.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of treesak might be problematic. Click here for more details.
- TreeSAK/ALE.py +63 -0
- TreeSAK/ALE1.py +268 -0
- TreeSAK/ALE2.py +168 -0
- TreeSAK/ALE2RTC.py +30 -0
- TreeSAK/ALE3.py +205 -0
- TreeSAK/ALE4.py +636 -0
- TreeSAK/ALE5.py +210 -0
- TreeSAK/ALE6.py +401 -0
- TreeSAK/ALE7.py +126 -0
- TreeSAK/ALE_backup.py +1081 -0
- TreeSAK/AssessCVG.py +128 -0
- TreeSAK/AssessMarker.py +306 -0
- TreeSAK/AssessMarkerDeltaLL.py +257 -0
- TreeSAK/AssessMarkerPA.py +317 -0
- TreeSAK/AssessPB.py +130 -0
- TreeSAK/BMGE.jar +0 -0
- TreeSAK/BMGE.py +49 -0
- TreeSAK/CompareMCMC.py +138 -0
- TreeSAK/ConcateMSA.py +111 -0
- TreeSAK/ConvertMSA.py +135 -0
- TreeSAK/Dir.rb +82 -0
- TreeSAK/ExtractMarkerSeq.py +263 -0
- TreeSAK/FastRoot.py +1175 -0
- TreeSAK/FastRoot_backup.py +1122 -0
- TreeSAK/FigTree.py +34 -0
- TreeSAK/GTDB_tree.py +76 -0
- TreeSAK/GeneTree.py +142 -0
- TreeSAK/KEGG_Luo17.py +807 -0
- TreeSAK/LcaToLeaves.py +66 -0
- TreeSAK/MarkerRef2Tree.py +616 -0
- TreeSAK/MarkerRef2Tree_backup.py +628 -0
- TreeSAK/MarkerSeq2Tree.py +290 -0
- TreeSAK/MarkerSeq2Tree_backup.py +259 -0
- TreeSAK/ModifyTopo.py +116 -0
- TreeSAK/Newick_tree_plotter.py +79 -0
- TreeSAK/OMA.py +170 -0
- TreeSAK/OMA2.py +212 -0
- TreeSAK/OneLineAln.py +50 -0
- TreeSAK/PB.py +155 -0
- TreeSAK/PMSF.py +106 -0
- TreeSAK/PhyloBiAssoc.R +84 -0
- TreeSAK/PhyloBiAssoc.py +167 -0
- TreeSAK/PlotMCMC.py +41 -0
- TreeSAK/PlotMcmcNode.py +152 -0
- TreeSAK/PlotMcmcNode_old.py +252 -0
- TreeSAK/RootTree.py +101 -0
- TreeSAK/RootTreeGTDB214.py +288 -0
- TreeSAK/RootTreeGTDB220.py +300 -0
- TreeSAK/RootTreeGTDB226.py +300 -0
- TreeSAK/SequentialDating.py +16 -0
- TreeSAK/SingleAleHGT.py +157 -0
- TreeSAK/SingleLinePhy.py +50 -0
- TreeSAK/SliceMSA.py +142 -0
- TreeSAK/SplitScore.py +19 -0
- TreeSAK/SplitScore1.py +178 -0
- TreeSAK/SplitScore1OMA.py +148 -0
- TreeSAK/SplitScore2.py +597 -0
- TreeSAK/TaxaCountStats.R +256 -0
- TreeSAK/TaxonTree.py +47 -0
- TreeSAK/TreeSAK_config.py +32 -0
- TreeSAK/VERSION +158 -0
- TreeSAK/VisHPD95.R +45 -0
- TreeSAK/VisHPD95.py +200 -0
- TreeSAK/__init__.py +0 -0
- TreeSAK/ale_parser.py +74 -0
- TreeSAK/ale_splitter.py +63 -0
- TreeSAK/alignment_pruner.pl +1471 -0
- TreeSAK/assessOG.py +45 -0
- TreeSAK/catfasta2phy.py +140 -0
- TreeSAK/cogTree.py +185 -0
- TreeSAK/compare_trees.R +30 -0
- TreeSAK/compare_trees.py +255 -0
- TreeSAK/dating.py +264 -0
- TreeSAK/dating_ss.py +361 -0
- TreeSAK/deltall.py +82 -0
- TreeSAK/do_rrtc.rb +464 -0
- TreeSAK/fa2phy.py +42 -0
- TreeSAK/format_leaf_name.py +70 -0
- TreeSAK/gap_stats.py +38 -0
- TreeSAK/get_SCG_tree.py +742 -0
- TreeSAK/get_arCOG_seq.py +97 -0
- TreeSAK/global_functions.py +222 -0
- TreeSAK/gnm_leaves.py +43 -0
- TreeSAK/iTOL.py +791 -0
- TreeSAK/iTOL_gene_tree.py +80 -0
- TreeSAK/itol_msa_stats.py +56 -0
- TreeSAK/keep_highest_rrtc.py +37 -0
- TreeSAK/koTree.py +194 -0
- TreeSAK/label_tree.R +75 -0
- TreeSAK/label_tree.py +121 -0
- TreeSAK/mad.py +708 -0
- TreeSAK/mcmc2tree.py +58 -0
- TreeSAK/mcmcTC copy.py +92 -0
- TreeSAK/mcmcTC.py +104 -0
- TreeSAK/mcmctree_vs_reltime.R +44 -0
- TreeSAK/mcmctree_vs_reltime.py +252 -0
- TreeSAK/merge_pdf.py +32 -0
- TreeSAK/pRTC.py +56 -0
- TreeSAK/parse_mcmctree.py +198 -0
- TreeSAK/parse_reltime.py +141 -0
- TreeSAK/phy2fa.py +37 -0
- TreeSAK/plot_distruibution_th.py +165 -0
- TreeSAK/prep_mcmctree_ctl.py +92 -0
- TreeSAK/print_leaves.py +32 -0
- TreeSAK/pruneMSA.py +63 -0
- TreeSAK/recode.py +73 -0
- TreeSAK/remove_bias.R +112 -0
- TreeSAK/rename_leaves.py +77 -0
- TreeSAK/replace_clade.py +55 -0
- TreeSAK/root_with_out_group.py +84 -0
- TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
- TreeSAK/subsample_drep_gnms.py +74 -0
- TreeSAK/subset.py +69 -0
- TreeSAK/subset_tree_stupid_old_way.py +193 -0
- TreeSAK/supertree.py +330 -0
- TreeSAK/tmp_1.py +19 -0
- TreeSAK/tmp_2.py +19 -0
- TreeSAK/tmp_3.py +120 -0
- TreeSAK/weighted_rand.rb +23 -0
- treesak-1.51.2.data/scripts/TreeSAK +950 -0
- treesak-1.51.2.dist-info/LICENSE +674 -0
- treesak-1.51.2.dist-info/METADATA +27 -0
- treesak-1.51.2.dist-info/RECORD +125 -0
- treesak-1.51.2.dist-info/WHEEL +5 -0
- treesak-1.51.2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
import random
|
|
2
|
+
import dendropy
|
|
3
|
+
import argparse
|
|
4
|
+
from ete3 import Tree
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
RootTreeGTDB220_usage = '''
|
|
8
|
+
========================================== RootTreeGTDB220 example command ==========================================
|
|
9
|
+
|
|
10
|
+
TreeSAK RootTreeGTDB220 -add_root -d ar -tree ar53.tree -tax ar53.summary.tsv -db db_dir -o ar53.rooted.tree
|
|
11
|
+
TreeSAK RootTreeGTDB220 -add_root -d bac -tree bac120.tree -tax bac120.summary.tsv -db db_dir -o bac120.rooted.tree
|
|
12
|
+
|
|
13
|
+
# Need to download and decompress the following files to your database folder (provide with -db)
|
|
14
|
+
https://data.ace.uq.edu.au/public/gtdb/data/releases/release220/220.0/ar53_r220.tree.tar.gz
|
|
15
|
+
https://data.ace.uq.edu.au/public/gtdb/data/releases/release220/220.0/bac120_r220.tree.tar.gz
|
|
16
|
+
https://data.ace.uq.edu.au/public/gtdb/data/releases/release220/220.0/ar53_metadata_r220.tsv.gz
|
|
17
|
+
https://data.ace.uq.edu.au/public/gtdb/data/releases/release220/220.0/bac120_metadata_r220.tsv.gz
|
|
18
|
+
|
|
19
|
+
=====================================================================================================================
|
|
20
|
+
'''
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def get_smallest_outgroup(tree_object):
    """
    Return the leaf names of the smallest clade directly below the root.

    tree_object: a tree node exposing ``children``; every child must provide
                 ``get_leaf_names()`` (as ete3 tree nodes do).

    Returns the list of leaf names of the root child with the fewest leaves.
    When several children share the smallest size, the last one wins.
    An empty list is returned if the root has no children.
    """

    out_group_leaf_list = []
    min_outgroup_leaf_num = None  # no magic upper bound: works for clades of any size
    for each_root_child in tree_object.children:
        leaf_list = each_root_child.get_leaf_names()
        # "<=" (not "<") so that, on ties, the last smallest child is selected
        if min_outgroup_leaf_num is None or len(leaf_list) <= min_outgroup_leaf_num:
            min_outgroup_leaf_num = len(leaf_list)
            out_group_leaf_list = leaf_list

    return out_group_leaf_list
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def sep_taxon_str(taxon_string):
    """
    Split a semicolon-separated taxonomy string into its phylum-to-genus ranks.

    taxon_string: lineage such as 'd__X;p__X;c__X;o__X;f__X;g__X;s__X'; the
                  first field (index 0) is skipped and fields 1-5 are returned.

    Returns a tuple (phylum, class, order, family, genus).
    Raises IndexError if the string holds fewer than six fields.
    """

    # strip every rank so that '; '-separated lineages give clean rank names
    taxon_string_split = [rank.strip() for rank in taxon_string.strip().split(';')]
    if len(taxon_string_split) < 6:
        raise IndexError('taxonomy string has fewer than 6 ranks (domain to genus): %r' % taxon_string)

    taxon_p, taxon_c, taxon_o, taxon_f, taxon_g = taxon_string_split[1:6]

    return taxon_p, taxon_c, taxon_o, taxon_f, taxon_g
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def subset_and_rename_tree(tree_file_in, to_keep_leaf_list, rename_dict):
    """
    Load a Newick tree, keep only the requested leaves and rename them.

    tree_file_in:      Newick tree file, read with ete3 (format=1, quoted node names).
    to_keep_leaf_list: iterable of leaf names to keep; passed as-is to ete3 ``prune``.
    rename_dict:       maps old leaf names to new ones; leaves without an entry keep their name.

    Returns the pruned and renamed ete3 tree.
    """

    # The tree is parsed here and owned by this call, so it is pruned in place;
    # copying it first only duplicated the whole (possibly very large) tree.
    subset_tree = Tree(tree_file_in, quoted_node_names=True, format=1)
    subset_tree.prune(to_keep_leaf_list, preserve_branch_length=True)

    # rename leaf
    for each_leaf in subset_tree:
        each_leaf.name = rename_dict.get(each_leaf.name, each_leaf.name)

    return subset_tree
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def root_with_outgroup(input_tree, out_group_list, add_root_branch, tree_file_rooted):
    """
    Reroot the tree using the given outgroup and write it to a file.
    modified based on: https://github.com/Ecogenomics/GTDBTk/blob/master/gtdbtk/reroot_tree.py

    input_tree:       File containing Newick tree to be rerooted.
    out_group_list:   Collection of leaf labels forming the outgroup.
    add_root_branch:  If True, wrap the rooted tree in an extra pair of
                      parentheses carrying a root branch of length 0.02.
    tree_file_rooted: Name of file for rerooted tree.

    Raises ValueError if no leaf of the tree is in the outgroup, or if every
    leaf is in the outgroup.
    """

    tree = dendropy.Tree.get_from_path(input_tree, schema='newick', rooting='force-rooted', preserve_underscores=True)

    outgroup_in_tree = set()
    ingroup_leaves = []  # a list, so it can be sampled directly on every iteration
    for n in tree.leaf_node_iter():
        if n.taxon.label in out_group_list:
            outgroup_in_tree.add(n.taxon)
        else:
            ingroup_leaves.append(n)

    # fail early with a clear message instead of an obscure error further down
    if not outgroup_in_tree:
        raise ValueError('none of the outgroup labels is a leaf of %s' % input_tree)
    if not ingroup_leaves:
        raise ValueError('all leaves of %s are in the outgroup' % input_tree)

    # Since finding the MRCA is a rooted tree operation, the tree is first rerooted on an ingroup taxa. This
    # ensures the MRCA of the outgroup can be identified so long as the outgroup is monophyletic. If the
    # outgroup is polyphyletic trying to root on it is ill-defined. To try and pick a "good" root for
    # polyphyletic outgroups, random ingroup taxa are selected until two of them give the same size
    # lineage. This will, likely, be the smallest bipartition possible for the given outgroup though
    # this is not guaranteed.

    mrca = tree.mrca(taxa=outgroup_in_tree)
    mrca_leaves = len(mrca.leaf_nodes())
    while True:
        rnd_ingroup = random.sample(ingroup_leaves, 1)[0]
        # a leaf may carry no branch length; mirror the None guard used for the final reroot
        half_length = None if rnd_ingroup.edge_length is None else 0.5 * rnd_ingroup.edge_length
        tree.reroot_at_edge(rnd_ingroup.edge, length1=half_length, length2=half_length)
        mrca = tree.mrca(taxa=outgroup_in_tree)
        if len(mrca.leaf_nodes()) == mrca_leaves:
            break
        mrca_leaves = len(mrca.leaf_nodes())

    if mrca.edge_length is not None:
        tree.reroot_at_edge(mrca.edge, length1=0.5 * mrca.edge_length, length2=0.5 * mrca.edge_length)

    tree_out_string = tree.as_string(schema='newick', suppress_rooting=True, unquoted_underscores=True)
    tree_out_string = tree_out_string.replace("'", "")

    # add the root bar
    if add_root_branch is True:
        newick_body = tree_out_string.rstrip()
        # Only touch the terminating ');'. If the string does not end that way it is
        # left unchanged rather than given an unmatched opening parenthesis.
        if newick_body.endswith(');'):
            trailing_whitespace = tree_out_string[len(newick_body):]
            tree_out_string = '(' + newick_body[:-1] + ':0.02);' + trailing_whitespace

    # write out tree string
    with open(tree_file_rooted, 'w') as tree_file_rooted_handle:
        tree_file_rooted_handle.write(tree_out_string)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def RootTreeGTDB220(args):
    """
    Root a user tree with an outgroup chosen from the GTDB r220 reference tree.

    args is a dict with the keys:
        'tree'      input unrooted Newick tree
        'tax'       tab-separated file, genome id in column 1 and taxonomy string in column 2
        'db'        folder holding the decompressed GTDB r220 tree and metadata files
        'd'         domain, either 'ar' or 'bac'
        'add_root'  True to add a root branch to the output tree
        'o'         output file for the rooted tree

    Steps: group the user genomes found in the tree by taxon at every rank from
    phylum to genus, pick the highest rank with more than one taxon, keep one
    reference genome per such taxon in the GTDB tree, take the smallest clade
    below the root of that pruned tree, and root the user tree on the user
    genomes of that clade (or on the remaining genomes, whichever set is smaller).

    Exits with status 1 if the domain is invalid or all genomes share one genus.
    """
    import sys  # local import: only needed for the two early exits below

    input_unrooted_tree = args['tree']
    user_gnm_taxon = args['tax']
    db_dir = args['db']
    gnm_domain = args['d']
    add_root_branch = args['add_root']
    rooted_tree = args['o']

    # a set gives O(1) membership tests while scanning the taxonomy file
    leaf_set = {leaf.name for leaf in Tree(input_unrooted_tree, format=1)}

    # define file name
    if gnm_domain == 'bac':
        gtdb_ref_tree = '%s/bac120_r220.tree' % db_dir
        gtdb_gnm_metadata = '%s/bac120_metadata_r220.tsv' % db_dir
        domain_tag = 'd__Bacteria'
    elif gnm_domain == 'ar':
        gtdb_ref_tree = '%s/ar53_r220.tree' % db_dir
        gtdb_gnm_metadata = '%s/ar53_metadata_r220.tsv' % db_dir
        domain_tag = 'd__Archaea'
    else:
        print('please provide either "ar" or "bac" to -d')
        sys.exit(1)

    ref_tree_gnm_set = set(Tree(gtdb_ref_tree, quoted_node_names=True, format=1).get_leaf_names())

    # read in user_gnm_taxon: for every rank, map taxon name -> set of genome ids
    rank_codes = ('p', 'c', 'o', 'f', 'g')  # same order as returned by sep_taxon_str
    user_gnm_taxon_dicts = {rank: dict() for rank in rank_codes}
    with open(user_gnm_taxon) as user_gnm_taxon_handle:
        for each_gnm in user_gnm_taxon_handle:
            if each_gnm.startswith('user_genome\t'):
                continue
            each_gnm_split = each_gnm.strip().split('\t')
            if len(each_gnm_split) < 2:
                continue  # blank or malformed line
            gnm_id = each_gnm_split[0]
            gnm_taxon = each_gnm_split[1]

            # only genomes present in the tree and classified to the requested domain count
            if gnm_id not in leaf_set or domain_tag not in gnm_taxon:
                continue

            for rank, taxon in zip(rank_codes, sep_taxon_str(gnm_taxon)):
                user_gnm_taxon_dicts[rank].setdefault(taxon, set()).add(gnm_id)

    # determine rooting rank, start from phylum
    rooting_rank = next((rank for rank in rank_codes if len(user_gnm_taxon_dicts[rank]) > 1), '')
    if rooting_rank == '':
        print('All user genomes are from the same genus, program exited!')
        sys.exit(1)

    rooting_rank_taxon_dict = user_gnm_taxon_dicts[rooting_rank]
    rooting_rank_index = rank_codes.index(rooting_rank)

    # pick one reference genome (the first met in the metadata) for every user taxon at the rooting rank
    col_index = {}
    canditate_gnms_rooting_rank = dict()
    counted_taxons_rooting_rank = set()
    with open(gtdb_gnm_metadata) as gtdb_gnm_metadata_handle:
        for each_ref in gtdb_gnm_metadata_handle:
            each_ref_split = each_ref.strip().split('\t')
            if each_ref_split[0] == 'accession':
                # header line: remember the position of every column
                col_index = {key: i for i, key in enumerate(each_ref_split)}
                continue

            ref_accession = each_ref_split[0]
            if ref_accession not in ref_tree_gnm_set:
                continue

            gtdb_taxonomy = each_ref_split[col_index['gtdb_taxonomy']]
            gnm_rooting_rank = sep_taxon_str(gtdb_taxonomy)[rooting_rank_index]

            if gnm_rooting_rank in rooting_rank_taxon_dict and gnm_rooting_rank not in counted_taxons_rooting_rank:
                counted_taxons_rooting_rank.add(gnm_rooting_rank)
                canditate_gnms_rooting_rank[ref_accession] = gnm_rooting_rank

    # reference tree reduced to the candidate genomes, leaves renamed to their taxon
    ref_tree_rooting_rank = subset_and_rename_tree(gtdb_ref_tree, canditate_gnms_rooting_rank, canditate_gnms_rooting_rank)

    # get the smallest out group taxon set
    smallest_outgroup_taxon_set = set(get_smallest_outgroup(ref_tree_rooting_rank))

    # split the user genomes into those inside and outside the smallest outgroup taxa
    out_group_gnm_set_1 = set()
    out_group_gnm_set_2 = set()
    for each_rooting_rank_taxon, gnm_member_set in rooting_rank_taxon_dict.items():
        if each_rooting_rank_taxon in smallest_outgroup_taxon_set:
            out_group_gnm_set_1.update(gnm_member_set)
        else:
            out_group_gnm_set_2.update(gnm_member_set)

    # select the smaller one as outgroup
    if len(out_group_gnm_set_1) < len(out_group_gnm_set_2):
        out_group_gnm_set = out_group_gnm_set_1
    else:
        out_group_gnm_set = out_group_gnm_set_2

    # root user tree with identified out group genomes
    root_with_outgroup(input_unrooted_tree, out_group_gnm_set, add_root_branch, rooted_tree)
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
if __name__ == '__main__':

    # command-line entry point: collect the options into a dict and run RootTreeGTDB220
    RootTreeGTDB220_parser = argparse.ArgumentParser(usage=RootTreeGTDB220_usage)
    RootTreeGTDB220_parser.add_argument('-tree', required=True, help='input unrooted tree')
    RootTreeGTDB220_parser.add_argument('-tax', required=False, default='fna', help='tab-separated taxonomy file of the leaves, e.g. ar53.summary.tsv')
    RootTreeGTDB220_parser.add_argument('-db', required=True, help='GTDB database files')
    RootTreeGTDB220_parser.add_argument('-d', required=False, default=None, help='domain, either ar or bac')
    RootTreeGTDB220_parser.add_argument('-add_root', required=False, action='store_true', help='add the root branch')
    # the value is opened as a file and the rooted tree is written into it
    RootTreeGTDB220_parser.add_argument('-o', required=True, help='output rooted tree file')
    args = vars(RootTreeGTDB220_parser.parse_args())
    RootTreeGTDB220(args)
|
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
import random
|
|
2
|
+
import dendropy
|
|
3
|
+
import argparse
|
|
4
|
+
from ete3 import Tree
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
RootTreeGTDB226_usage = '''
|
|
8
|
+
========================================== RootTreeGTDB226 example command ==========================================
|
|
9
|
+
|
|
10
|
+
TreeSAK RootTreeGTDB226 -add_root -d ar -tree ar53.tree -tax ar53.summary.tsv -db db_dir -o ar53.rooted.tree
|
|
11
|
+
TreeSAK RootTreeGTDB226 -add_root -d bac -tree bac120.tree -tax bac120.summary.tsv -db db_dir -o bac120.rooted.tree
|
|
12
|
+
|
|
13
|
+
# Need to download and decompress the following files to your database folder (provide with -db)
|
|
14
|
+
https://data.ace.uq.edu.au/public/gtdb/data/releases/release226/226.0/ar53_r226.tree.tar.gz
|
|
15
|
+
https://data.ace.uq.edu.au/public/gtdb/data/releases/release226/226.0/bac120_r226.tree.tar.gz
|
|
16
|
+
https://data.ace.uq.edu.au/public/gtdb/data/releases/release226/226.0/ar53_metadata_r226.tsv.gz
|
|
17
|
+
https://data.ace.uq.edu.au/public/gtdb/data/releases/release226/226.0/bac120_metadata_r226.tsv.gz
|
|
18
|
+
|
|
19
|
+
=====================================================================================================================
|
|
20
|
+
'''
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def get_smallest_outgroup(tree_object):
    """
    Return the leaf names of the smallest clade directly below the root.

    tree_object: a tree node exposing ``children``; every child must provide
                 ``get_leaf_names()`` (as ete3 tree nodes do).

    Returns the list of leaf names of the root child with the fewest leaves.
    When several children share the smallest size, the last one wins.
    An empty list is returned if the root has no children.
    """

    out_group_leaf_list = []
    min_outgroup_leaf_num = None  # no magic upper bound: works for clades of any size
    for each_root_child in tree_object.children:
        leaf_list = each_root_child.get_leaf_names()
        # "<=" (not "<") so that, on ties, the last smallest child is selected
        if min_outgroup_leaf_num is None or len(leaf_list) <= min_outgroup_leaf_num:
            min_outgroup_leaf_num = len(leaf_list)
            out_group_leaf_list = leaf_list

    return out_group_leaf_list
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def sep_taxon_str(taxon_string):
    """
    Split a semicolon-separated taxonomy string into its phylum-to-genus ranks.

    taxon_string: lineage such as 'd__X;p__X;c__X;o__X;f__X;g__X;s__X'; the
                  first field (index 0) is skipped and fields 1-5 are returned.

    Returns a tuple (phylum, class, order, family, genus).
    Raises IndexError if the string holds fewer than six fields.
    """

    # strip every rank so that '; '-separated lineages give clean rank names
    taxon_string_split = [rank.strip() for rank in taxon_string.strip().split(';')]
    if len(taxon_string_split) < 6:
        raise IndexError('taxonomy string has fewer than 6 ranks (domain to genus): %r' % taxon_string)

    taxon_p, taxon_c, taxon_o, taxon_f, taxon_g = taxon_string_split[1:6]

    return taxon_p, taxon_c, taxon_o, taxon_f, taxon_g
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def subset_and_rename_tree(tree_file_in, to_keep_leaf_list, rename_dict):
    """
    Load a Newick tree, keep only the requested leaves and rename them.

    tree_file_in:      Newick tree file, read with ete3 (format=1, quoted node names).
    to_keep_leaf_list: iterable of leaf names to keep; passed as-is to ete3 ``prune``.
    rename_dict:       maps old leaf names to new ones; leaves without an entry keep their name.

    Returns the pruned and renamed ete3 tree.
    """

    # The tree is parsed here and owned by this call, so it is pruned in place;
    # copying it first only duplicated the whole (possibly very large) tree.
    subset_tree = Tree(tree_file_in, quoted_node_names=True, format=1)
    subset_tree.prune(to_keep_leaf_list, preserve_branch_length=True)

    # rename leaf
    for each_leaf in subset_tree:
        each_leaf.name = rename_dict.get(each_leaf.name, each_leaf.name)

    return subset_tree
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def root_with_outgroup(input_tree, out_group_list, add_root_branch, tree_file_rooted):
    """
    Reroot the tree using the given outgroup and write it to a file.
    modified based on: https://github.com/Ecogenomics/GTDBTk/blob/master/gtdbtk/reroot_tree.py

    input_tree:       File containing Newick tree to be rerooted.
    out_group_list:   Collection of leaf labels forming the outgroup.
    add_root_branch:  If True, wrap the rooted tree in an extra pair of
                      parentheses carrying a root branch of length 0.02.
    tree_file_rooted: Name of file for rerooted tree.

    Raises ValueError if no leaf of the tree is in the outgroup, or if every
    leaf is in the outgroup.
    """

    tree = dendropy.Tree.get_from_path(input_tree, schema='newick', rooting='force-rooted', preserve_underscores=True)

    outgroup_in_tree = set()
    ingroup_leaves = []  # a list, so it can be sampled directly on every iteration
    for n in tree.leaf_node_iter():
        if n.taxon.label in out_group_list:
            outgroup_in_tree.add(n.taxon)
        else:
            ingroup_leaves.append(n)

    # fail early with a clear message instead of an obscure error further down
    if not outgroup_in_tree:
        raise ValueError('none of the outgroup labels is a leaf of %s' % input_tree)
    if not ingroup_leaves:
        raise ValueError('all leaves of %s are in the outgroup' % input_tree)

    # Since finding the MRCA is a rooted tree operation, the tree is first rerooted on an ingroup taxa. This
    # ensures the MRCA of the outgroup can be identified so long as the outgroup is monophyletic. If the
    # outgroup is polyphyletic trying to root on it is ill-defined. To try and pick a "good" root for
    # polyphyletic outgroups, random ingroup taxa are selected until two of them give the same size
    # lineage. This will, likely, be the smallest bipartition possible for the given outgroup though
    # this is not guaranteed.

    mrca = tree.mrca(taxa=outgroup_in_tree)
    mrca_leaves = len(mrca.leaf_nodes())
    while True:
        rnd_ingroup = random.sample(ingroup_leaves, 1)[0]
        # a leaf may carry no branch length; mirror the None guard used for the final reroot
        half_length = None if rnd_ingroup.edge_length is None else 0.5 * rnd_ingroup.edge_length
        tree.reroot_at_edge(rnd_ingroup.edge, length1=half_length, length2=half_length)
        mrca = tree.mrca(taxa=outgroup_in_tree)
        if len(mrca.leaf_nodes()) == mrca_leaves:
            break
        mrca_leaves = len(mrca.leaf_nodes())

    if mrca.edge_length is not None:
        tree.reroot_at_edge(mrca.edge, length1=0.5 * mrca.edge_length, length2=0.5 * mrca.edge_length)

    tree_out_string = tree.as_string(schema='newick', suppress_rooting=True, unquoted_underscores=True)
    tree_out_string = tree_out_string.replace("'", "")

    # add the root bar
    if add_root_branch is True:
        newick_body = tree_out_string.rstrip()
        # Only touch the terminating ');'. If the string does not end that way it is
        # left unchanged rather than given an unmatched opening parenthesis.
        if newick_body.endswith(');'):
            trailing_whitespace = tree_out_string[len(newick_body):]
            tree_out_string = '(' + newick_body[:-1] + ':0.02);' + trailing_whitespace

    # write out tree string
    with open(tree_file_rooted, 'w') as tree_file_rooted_handle:
        tree_file_rooted_handle.write(tree_out_string)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def RootTreeGTDB226(args):
    """
    Root a user tree with an outgroup chosen from the GTDB r226 reference tree.

    args is a dict with the keys:
        'tree'      input unrooted Newick tree
        'tax'       tab-separated file, genome id in column 1 and taxonomy string in column 2
        'db'        folder holding the decompressed GTDB r226 tree and metadata files
        'd'         domain, either 'ar' or 'bac'
        'add_root'  True to add a root branch to the output tree
        'o'         output file for the rooted tree

    Steps: group the user genomes found in the tree by taxon at every rank from
    phylum to genus, pick the highest rank with more than one taxon, keep one
    reference genome per such taxon in the GTDB tree, take the smallest clade
    below the root of that pruned tree, and root the user tree on the user
    genomes of that clade (or on the remaining genomes, whichever set is smaller).

    Exits with status 1 if the domain is invalid or all genomes share one genus.
    """
    import sys  # local import: only needed for the two early exits below

    input_unrooted_tree = args['tree']
    user_gnm_taxon = args['tax']
    db_dir = args['db']
    gnm_domain = args['d']
    add_root_branch = args['add_root']
    rooted_tree = args['o']

    # a set gives O(1) membership tests while scanning the taxonomy file
    leaf_set = {leaf.name for leaf in Tree(input_unrooted_tree, format=1)}

    # define file name
    if gnm_domain == 'bac':
        gtdb_ref_tree = '%s/bac120_r226.tree' % db_dir
        gtdb_gnm_metadata = '%s/bac120_metadata_r226.tsv' % db_dir
        domain_tag = 'd__Bacteria'
    elif gnm_domain == 'ar':
        gtdb_ref_tree = '%s/ar53_r226.tree' % db_dir
        gtdb_gnm_metadata = '%s/ar53_metadata_r226.tsv' % db_dir
        domain_tag = 'd__Archaea'
    else:
        print('please provide either "ar" or "bac" to -d')
        sys.exit(1)

    ref_tree_gnm_set = set(Tree(gtdb_ref_tree, quoted_node_names=True, format=1).get_leaf_names())

    # read in user_gnm_taxon: for every rank, map taxon name -> set of genome ids
    rank_codes = ('p', 'c', 'o', 'f', 'g')  # same order as returned by sep_taxon_str
    user_gnm_taxon_dicts = {rank: dict() for rank in rank_codes}
    with open(user_gnm_taxon) as user_gnm_taxon_handle:
        for each_gnm in user_gnm_taxon_handle:
            if each_gnm.startswith('user_genome\t'):
                continue
            each_gnm_split = each_gnm.strip().split('\t')
            if len(each_gnm_split) < 2:
                continue  # blank or malformed line
            gnm_id = each_gnm_split[0]
            gnm_taxon = each_gnm_split[1]

            # only genomes present in the tree and classified to the requested domain count
            if gnm_id not in leaf_set or domain_tag not in gnm_taxon:
                continue

            for rank, taxon in zip(rank_codes, sep_taxon_str(gnm_taxon)):
                user_gnm_taxon_dicts[rank].setdefault(taxon, set()).add(gnm_id)

    # determine rooting rank, start from phylum
    rooting_rank = next((rank for rank in rank_codes if len(user_gnm_taxon_dicts[rank]) > 1), '')
    if rooting_rank == '':
        print('All user genomes are from the same genus, program exited!')
        sys.exit(1)

    rooting_rank_taxon_dict = user_gnm_taxon_dicts[rooting_rank]
    rooting_rank_index = rank_codes.index(rooting_rank)

    # pick one reference genome (the first met in the metadata) for every user taxon at the rooting rank
    col_index = {}
    canditate_gnms_rooting_rank = dict()
    counted_taxons_rooting_rank = set()
    with open(gtdb_gnm_metadata) as gtdb_gnm_metadata_handle:
        for each_ref in gtdb_gnm_metadata_handle:
            each_ref_split = each_ref.strip().split('\t')
            if each_ref_split[0] == 'accession':
                # header line: remember the position of every column
                col_index = {key: i for i, key in enumerate(each_ref_split)}
                continue

            ref_accession = each_ref_split[0]
            if ref_accession not in ref_tree_gnm_set:
                continue

            gtdb_taxonomy = each_ref_split[col_index['gtdb_taxonomy']]
            gnm_rooting_rank = sep_taxon_str(gtdb_taxonomy)[rooting_rank_index]

            if gnm_rooting_rank in rooting_rank_taxon_dict and gnm_rooting_rank not in counted_taxons_rooting_rank:
                counted_taxons_rooting_rank.add(gnm_rooting_rank)
                canditate_gnms_rooting_rank[ref_accession] = gnm_rooting_rank

    # reference tree reduced to the candidate genomes, leaves renamed to their taxon
    ref_tree_rooting_rank = subset_and_rename_tree(gtdb_ref_tree, canditate_gnms_rooting_rank, canditate_gnms_rooting_rank)

    # get the smallest out group taxon set
    smallest_outgroup_taxon_set = set(get_smallest_outgroup(ref_tree_rooting_rank))

    # split the user genomes into those inside and outside the smallest outgroup taxa
    out_group_gnm_set_1 = set()
    out_group_gnm_set_2 = set()
    for each_rooting_rank_taxon, gnm_member_set in rooting_rank_taxon_dict.items():
        if each_rooting_rank_taxon in smallest_outgroup_taxon_set:
            out_group_gnm_set_1.update(gnm_member_set)
        else:
            out_group_gnm_set_2.update(gnm_member_set)

    # select the smaller one as outgroup
    if len(out_group_gnm_set_1) < len(out_group_gnm_set_2):
        out_group_gnm_set = out_group_gnm_set_1
    else:
        out_group_gnm_set = out_group_gnm_set_2

    # root user tree with identified out group genomes
    root_with_outgroup(input_unrooted_tree, out_group_gnm_set, add_root_branch, rooted_tree)
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
if __name__ == '__main__':

    # command-line entry point: collect the options into a dict and run RootTreeGTDB226
    RootTreeGTDB226_parser = argparse.ArgumentParser(usage=RootTreeGTDB226_usage)
    RootTreeGTDB226_parser.add_argument('-tree', required=True, help='input unrooted tree')
    RootTreeGTDB226_parser.add_argument('-tax', required=False, default='fna', help='tab-separated taxonomy file of the leaves, e.g. ar53.summary.tsv')
    RootTreeGTDB226_parser.add_argument('-db', required=True, help='GTDB database files')
    RootTreeGTDB226_parser.add_argument('-d', required=False, default=None, help='domain, either ar or bac')
    RootTreeGTDB226_parser.add_argument('-add_root', required=False, action='store_true', help='add the root branch')
    # the value is opened as a file and the rooted tree is written into it
    RootTreeGTDB226_parser.add_argument('-o', required=True, help='output rooted tree file')
    args = vars(RootTreeGTDB226_parser.parse_args())
    RootTreeGTDB226(args)
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
|
|
2
|
+
SequentialDating_usage = '''
|
|
3
|
+
======================== SequentialDating example commands ========================
|
|
4
|
+
|
|
5
|
+
TreeSAK SequentialDating -h
|
|
6
|
+
|
|
7
|
+
===================================================================================
|
|
8
|
+
'''
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def SequentialDating():
    """Placeholder for the SequentialDating sub-command; it currently does nothing and returns None."""

    pass


# Only run when executed as a script, so importing this module has no side effect.
if __name__ == '__main__':
    SequentialDating()
|