treesak 1.51.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of treesak might be problematic. Click here for more details.
- TreeSAK/ALE.py +63 -0
- TreeSAK/ALE1.py +268 -0
- TreeSAK/ALE2.py +168 -0
- TreeSAK/ALE2RTC.py +30 -0
- TreeSAK/ALE3.py +205 -0
- TreeSAK/ALE4.py +636 -0
- TreeSAK/ALE5.py +210 -0
- TreeSAK/ALE6.py +401 -0
- TreeSAK/ALE7.py +126 -0
- TreeSAK/ALE_backup.py +1081 -0
- TreeSAK/AssessCVG.py +128 -0
- TreeSAK/AssessMarker.py +306 -0
- TreeSAK/AssessMarkerDeltaLL.py +257 -0
- TreeSAK/AssessMarkerPA.py +317 -0
- TreeSAK/AssessPB.py +130 -0
- TreeSAK/BMGE.jar +0 -0
- TreeSAK/BMGE.py +49 -0
- TreeSAK/CompareMCMC.py +138 -0
- TreeSAK/ConcateMSA.py +111 -0
- TreeSAK/ConvertMSA.py +135 -0
- TreeSAK/Dir.rb +82 -0
- TreeSAK/ExtractMarkerSeq.py +263 -0
- TreeSAK/FastRoot.py +1175 -0
- TreeSAK/FastRoot_backup.py +1122 -0
- TreeSAK/FigTree.py +34 -0
- TreeSAK/GTDB_tree.py +76 -0
- TreeSAK/GeneTree.py +142 -0
- TreeSAK/KEGG_Luo17.py +807 -0
- TreeSAK/LcaToLeaves.py +66 -0
- TreeSAK/MarkerRef2Tree.py +616 -0
- TreeSAK/MarkerRef2Tree_backup.py +628 -0
- TreeSAK/MarkerSeq2Tree.py +290 -0
- TreeSAK/MarkerSeq2Tree_backup.py +259 -0
- TreeSAK/ModifyTopo.py +116 -0
- TreeSAK/Newick_tree_plotter.py +79 -0
- TreeSAK/OMA.py +170 -0
- TreeSAK/OMA2.py +212 -0
- TreeSAK/OneLineAln.py +50 -0
- TreeSAK/PB.py +155 -0
- TreeSAK/PMSF.py +106 -0
- TreeSAK/PhyloBiAssoc.R +84 -0
- TreeSAK/PhyloBiAssoc.py +167 -0
- TreeSAK/PlotMCMC.py +41 -0
- TreeSAK/PlotMcmcNode.py +152 -0
- TreeSAK/PlotMcmcNode_old.py +252 -0
- TreeSAK/RootTree.py +101 -0
- TreeSAK/RootTreeGTDB214.py +288 -0
- TreeSAK/RootTreeGTDB220.py +300 -0
- TreeSAK/RootTreeGTDB226.py +300 -0
- TreeSAK/SequentialDating.py +16 -0
- TreeSAK/SingleAleHGT.py +157 -0
- TreeSAK/SingleLinePhy.py +50 -0
- TreeSAK/SliceMSA.py +142 -0
- TreeSAK/SplitScore.py +19 -0
- TreeSAK/SplitScore1.py +178 -0
- TreeSAK/SplitScore1OMA.py +148 -0
- TreeSAK/SplitScore2.py +597 -0
- TreeSAK/TaxaCountStats.R +256 -0
- TreeSAK/TaxonTree.py +47 -0
- TreeSAK/TreeSAK_config.py +32 -0
- TreeSAK/VERSION +158 -0
- TreeSAK/VisHPD95.R +45 -0
- TreeSAK/VisHPD95.py +200 -0
- TreeSAK/__init__.py +0 -0
- TreeSAK/ale_parser.py +74 -0
- TreeSAK/ale_splitter.py +63 -0
- TreeSAK/alignment_pruner.pl +1471 -0
- TreeSAK/assessOG.py +45 -0
- TreeSAK/catfasta2phy.py +140 -0
- TreeSAK/cogTree.py +185 -0
- TreeSAK/compare_trees.R +30 -0
- TreeSAK/compare_trees.py +255 -0
- TreeSAK/dating.py +264 -0
- TreeSAK/dating_ss.py +361 -0
- TreeSAK/deltall.py +82 -0
- TreeSAK/do_rrtc.rb +464 -0
- TreeSAK/fa2phy.py +42 -0
- TreeSAK/format_leaf_name.py +70 -0
- TreeSAK/gap_stats.py +38 -0
- TreeSAK/get_SCG_tree.py +742 -0
- TreeSAK/get_arCOG_seq.py +97 -0
- TreeSAK/global_functions.py +222 -0
- TreeSAK/gnm_leaves.py +43 -0
- TreeSAK/iTOL.py +791 -0
- TreeSAK/iTOL_gene_tree.py +80 -0
- TreeSAK/itol_msa_stats.py +56 -0
- TreeSAK/keep_highest_rrtc.py +37 -0
- TreeSAK/koTree.py +194 -0
- TreeSAK/label_tree.R +75 -0
- TreeSAK/label_tree.py +121 -0
- TreeSAK/mad.py +708 -0
- TreeSAK/mcmc2tree.py +58 -0
- TreeSAK/mcmcTC copy.py +92 -0
- TreeSAK/mcmcTC.py +104 -0
- TreeSAK/mcmctree_vs_reltime.R +44 -0
- TreeSAK/mcmctree_vs_reltime.py +252 -0
- TreeSAK/merge_pdf.py +32 -0
- TreeSAK/pRTC.py +56 -0
- TreeSAK/parse_mcmctree.py +198 -0
- TreeSAK/parse_reltime.py +141 -0
- TreeSAK/phy2fa.py +37 -0
- TreeSAK/plot_distruibution_th.py +165 -0
- TreeSAK/prep_mcmctree_ctl.py +92 -0
- TreeSAK/print_leaves.py +32 -0
- TreeSAK/pruneMSA.py +63 -0
- TreeSAK/recode.py +73 -0
- TreeSAK/remove_bias.R +112 -0
- TreeSAK/rename_leaves.py +77 -0
- TreeSAK/replace_clade.py +55 -0
- TreeSAK/root_with_out_group.py +84 -0
- TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
- TreeSAK/subsample_drep_gnms.py +74 -0
- TreeSAK/subset.py +69 -0
- TreeSAK/subset_tree_stupid_old_way.py +193 -0
- TreeSAK/supertree.py +330 -0
- TreeSAK/tmp_1.py +19 -0
- TreeSAK/tmp_2.py +19 -0
- TreeSAK/tmp_3.py +120 -0
- TreeSAK/weighted_rand.rb +23 -0
- treesak-1.51.2.data/scripts/TreeSAK +950 -0
- treesak-1.51.2.dist-info/LICENSE +674 -0
- treesak-1.51.2.dist-info/METADATA +27 -0
- treesak-1.51.2.dist-info/RECORD +125 -0
- treesak-1.51.2.dist-info/WHEEL +5 -0
- treesak-1.51.2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import glob
|
|
3
|
+
import argparse
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from ete3 import Tree
|
|
6
|
+
import plotly.express as px
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
PlotMcmcNode_usage = '''
|
|
10
|
+
============================ PlotMcmcNode example commands ============================
|
|
11
|
+
|
|
12
|
+
TreeSAK PlotMcmcNode -i McmcTree_op_files -n n179 -o Clock2_n179.pdf
|
|
13
|
+
TreeSAK PlotMcmcNode -i McmcTree_op_files -n n161,n186 -o Clock3_n161_n186.pdf
|
|
14
|
+
TreeSAK PlotMcmcNode -i McmcTree_op_files -n nodes.txt -o multi_runs_multi_nodes.pdf
|
|
15
|
+
|
|
16
|
+
# File name of the mcmc.txt and the corresponding mcmc out file need to follow
|
|
17
|
+
# the rule as specified below:
|
|
18
|
+
[setting_1]_mcmc.txt
|
|
19
|
+
[setting_1]_out.txt
|
|
20
|
+
[setting_2]_mcmc.txt
|
|
21
|
+
[setting_2]_out.txt
|
|
22
|
+
|
|
23
|
+
# file format (-n, tab separated)
|
|
24
|
+
# leave the 2nd column blank for nodes without renaming
|
|
25
|
+
setting_1 node1 Bacteria
|
|
26
|
+
setting_2 node2
|
|
27
|
+
setting_3 node3,node9 Archaea
|
|
28
|
+
|
|
29
|
+
# Y-axis label file format (-l, tab separated)
|
|
30
|
+
PA_75_DeltaLL_50_clock3_mcmc.txt DeltaLL_50
|
|
31
|
+
PA_75_DeltaLL_75_clock3_mcmc.txt DeltaLL_75
|
|
32
|
+
|
|
33
|
+
=======================================================================================
|
|
34
|
+
'''
|
|
35
|
+
|
|
36
|
+
def sep_path_basename_ext(file_in):
    """Split a file path into directory, base name and extension.

    Returns a (directory, base_name, extension) tuple. The directory is
    reported as '.' when file_in has no directory part, and the extension
    keeps its leading dot ('' if there is none).
    """
    directory, name_with_ext = os.path.split(file_in)
    base_name, extension = os.path.splitext(name_with_ext)
    return ('.' if directory == '' else directory), base_name, extension
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def mcmctree_out_to_tree_str(mamctree_out):
    """Extract the species tree from an MCMCTree output file and rename its nodes.

    The tree is read from the line that directly follows the
    'Species tree for FigTree ...' header line (the last such header wins
    if there are several). Spaces are removed from the tree string before
    it is parsed.

    Leaves are renamed by dropping their first '_'-separated field and
    internal nodes are renamed to 't_n<original name>'.

    mamctree_out: path to the MCMCTree output (*out.txt) file.
    Returns the renamed tree as a string written with ete3 format=8.
    """
    header = 'Species tree for FigTree. Branch lengths = posterior mean times; 95% CIs = labels'

    tree_str = ''
    previous_was_header = False
    # use a context manager so the file is closed once it has been scanned
    with open(mamctree_out) as out_handle:
        for each_line in out_handle:
            is_header = header in each_line
            # a line is the tree only if it follows a header and is not a header itself
            if previous_was_header and not is_header:
                tree_str = each_line.strip()
            previous_was_header = is_header

    tree_str_no_space = tree_str.replace(' ', '')

    # rename tree nodes
    t = Tree(tree_str_no_space, format=1)
    for each_node in t.traverse():
        if each_node.is_leaf():
            each_node.name = '_'.join(each_node.name.split('_')[1:])
        else:
            each_node.name = 't_n%s' % each_node.name

    return t.write(format=8)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def plot_distribution(df_txt, output_plot):
    """Draw horizontal half-violin plots of the sampled values and save them.

    df_txt: comma-separated table with the columns Value, Node and Setting.
    output_plot: path of the image file to write.
    """
    table = pd.read_table(df_txt, sep=',')
    setting_count = len(table['Setting'].unique())
    node_count = len(table['Node'].unique())

    # order the rows by setting, descending
    table = table.sort_values(by='Setting', ascending=False)

    # 100 px per setting, but never shorter than 360 px
    fig_height = max(setting_count * 100, 360)

    fig = px.violin(table, x="Value", y="Setting", color="Node", points=False,
                    orientation="h", width=900, height=fig_height)

    # one node: filled violins; several nodes: transparent fill
    if node_count == 1:
        trace_style = dict(fillcolor='lightblue', opacity=0.75)
    else:
        trace_style = dict(fillcolor='rgba(0,0,0,0)')
    fig.update_traces(side="positive", width=1.6, **trace_style)

    fig.update_traces(showlegend=True)
    fig.layout.template = "simple_white"
    fig.write_image(output_plot)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def get_internal_node_to_plot(node_txt, mo_file):
    """Work out which internal nodes should be plotted.

    node_txt: either the path to a tab-separated file, or (when no such
        file exists) a comma-separated string of node ids.
        In the file, column 1 is a node id or a comma-separated list of
        leaves whose common ancestor is the node of interest; an optional
        column 2 is the name to show for that node in the plot.
    mo_file: path to the MCMCTree *out.txt file; it is only read if it exists.

    Returns (nodes, node_rename_dict, tree_str):
        nodes            - a set when node_txt is a file, a list when it is
                           a comma-separated string
        node_rename_dict - node id -> name to show in the plot
        tree_str         - renamed tree from mo_file, '' if mo_file is missing

    Raises SystemExit (status 1) if a leaf list is given but no tree is
    available to look up the common ancestor.
    """
    tree_str = ''
    if os.path.isfile(mo_file):
        tree_str = mcmctree_out_to_tree_str(mo_file)

    node_rename_dict = dict()

    # node ids were provided directly as a comma-separated string
    if not os.path.isfile(node_txt):
        return node_txt.split(','), node_rename_dict, tree_str

    node_set = set()
    parsed_tree = None  # parsed on first use and then reused for every line
    with open(node_txt) as node_handle:
        for each in node_handle:
            each_split = each.strip().split('\t')
            node_str = each_split[0]

            # get internal_node_to_plot
            if ',' not in node_str:
                internal_node_to_plot = node_str
            else:
                if tree_str == '':
                    print('*out.txt file not found, program exited!')
                    raise SystemExit(1)
                if parsed_tree is None:
                    parsed_tree = Tree(tree_str, format=1)
                internal_node_to_plot = parsed_tree.get_common_ancestor(node_str.split(',')).name

            # blank lines give an empty id and are ignored
            if internal_node_to_plot != '':
                node_set.add(internal_node_to_plot)

            # read in name to show in plot
            if len(each_split) == 2 and each_split[1] != '':
                node_rename_dict[internal_node_to_plot] = each_split[1]

    return node_set, node_rename_dict, tree_str
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def PlotMcmcNode(args):
    """Plot the sampled value distribution of selected nodes across MCMCTree runs.

    args: dict with the keys
        'i'   - a *_mcmc.txt file, or a folder holding *_mcmc.txt files
        'n'   - node file, or comma-separated node ids
        'o'   - output plot
        'of'  - one *out.txt file to use for every run, or None to use the
                *_out.txt file named after each *_mcmc.txt file
        'l'   - optional tab-separated file (mcmc file name, y-axis label)
        'tmp' - the intermediate files are removed only if this is False

    Three intermediate files (<output base name>_tree.txt, _data.txt and
    _label.txt) are written to the current working directory.

    Raises SystemExit (status 1) when no mcmc file is found, an expected
    *out.txt file is missing, the label file is malformed or none of the
    requested nodes is found.
    """
    mcmc_in = args['i']
    node_txt = args['n']
    output_plot = args['o']
    specified_out_file = args['of']
    y_label_txt = args['l']
    keep_tmp_file = args['tmp']

    # check MCMCTree output file/dir
    if os.path.isfile(mcmc_in):
        mcmc_file_list = [mcmc_in]
    else:
        mcmc_file_list = glob.glob('%s/*_mcmc.txt' % mcmc_in)

    if len(mcmc_file_list) == 0:
        print('*mcmc.txt file not found, program exited!')
        raise SystemExit(1)

    # without -of, every mcmc file needs its own out file next to it
    if specified_out_file is None:
        expected_out_files = [f.replace('_mcmc.txt', '_out.txt') for f in mcmc_file_list]
        missed_out_file_list = [f for f in expected_out_files if not os.path.isfile(f)]
        if len(missed_out_file_list) > 0:
            print('The following *out.txt files are missing, program exited!')
            print('\n'.join(sorted(missed_out_file_list)))
            raise SystemExit(1)

    # read in y-axis label file
    y_label_dict = dict()
    if y_label_txt is not None:
        with open(y_label_txt) as label_handle:
            for each_sample in label_handle:
                each_sample_split = each_sample.strip().split('\t')
                if each_sample_split == ['']:
                    continue  # tolerate blank lines instead of aborting
                if len(each_sample_split) != 2:
                    print('Format error: %s' % y_label_txt)
                    raise SystemExit(1)
                y_label_dict[each_sample_split[0]] = each_sample_split[1]

    _, f_base, _ = sep_path_basename_ext(output_plot)
    found_matched_node = False
    op_tree_tmp = '%s_tree.txt' % f_base
    op_df_tmp = '%s_data.txt' % f_base
    op_label_tmp = '%s_label.txt' % f_base

    # the context manager closes all three files even if a run fails to parse
    with open(op_label_tmp, 'w') as op_label_tmp_handle, \
            open(op_tree_tmp, 'w') as op_tree_tmp_handle, \
            open(op_df_tmp, 'w') as op_df_tmp_handle:

        op_df_tmp_handle.write('Value,Node,Setting\n')
        for mcmc_file in mcmc_file_list:

            mcmc_file_no_path = os.path.basename(mcmc_file)

            if specified_out_file is None:
                pwd_current_run_mcmc_out = mcmc_file.replace('_mcmc.txt', '_out.txt')
            else:
                pwd_current_run_mcmc_out = specified_out_file

            node_set, node_rename_dict, tree_str = get_internal_node_to_plot(node_txt, pwd_current_run_mcmc_out)
            op_tree_tmp_handle.write('%s\t%s\n' % (mcmc_file_no_path.replace('_mcmc.txt', ''), tree_str))

            # fall back to the file name when no label was provided for this run
            label_to_write = y_label_dict.get(mcmc_file_no_path, mcmc_file_no_path)
            mcmc_df = pd.read_table(mcmc_file, index_col=0)
            for each_col in mcmc_df:
                if each_col not in node_set:
                    continue
                node_name_to_write = node_rename_dict.get(each_col, each_col)
                found_matched_node = True
                op_df_tmp_handle.writelines(
                    '%s,%s,%s\n' % (each_value, node_name_to_write, label_to_write)
                    for each_value in mcmc_df[each_col].values)
                op_label_tmp_handle.write('%s\t%s\t%s\n' % (label_to_write, each_col, node_name_to_write))

    if found_matched_node is False:
        print('Provided node(s) not found, program exited!')
        raise SystemExit(1)

    # plot distribution
    plot_distribution(op_df_tmp, output_plot)

    # remove tmp files; os.remove avoids passing file names through a shell
    if keep_tmp_file is False:
        for each_tmp_file in (op_tree_tmp, op_df_tmp, op_label_tmp):
            os.remove(each_tmp_file)

    print('Plot exported to %s, done!' % output_plot)
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
if __name__ == '__main__':

    # command-line options, listed in the order they are registered
    cli_options = (
        ('-i', dict(required=True, help='folder holds the *mcmc.txt and *out.txt files')),
        ('-of', dict(required=False, default=None, help='the *out.txt file')),
        ('-n', dict(required=True, help='Nodes to plot')),
        ('-l', dict(required=False, default=None, help='labels on y axis')),
        ('-o', dict(required=True, help='Output plot')),
        ('-tmp', dict(required=False, action="store_true", help='keep tmp files')),
    )

    PlotMcmcNode_parser = argparse.ArgumentParser()
    for option_flag, option_settings in cli_options:
        PlotMcmcNode_parser.add_argument(option_flag, **option_settings)

    # the parsed namespace is handed over as a plain dict
    args = vars(PlotMcmcNode_parser.parse_args())
    PlotMcmcNode(args)
|
TreeSAK/RootTree.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
import random
|
|
2
|
+
import dendropy
|
|
3
|
+
import argparse
|
|
4
|
+
from ete3 import Tree
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
RootTree_usage = '''
|
|
8
|
+
====================== RootTree example commands ======================
|
|
9
|
+
|
|
10
|
+
TreeSAK RootTree -i input.tree -og outgroup_genomes.txt -o rooted.tree
|
|
11
|
+
|
|
12
|
+
=======================================================================
|
|
13
|
+
'''
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def root_with_outgroup(input_tree, out_group_list, add_root_branch, tree_file_rooted):

    """
    Reroot the tree using the given outgroup.
    modified based on: https://github.com/Ecogenomics/GTDBTk/blob/master/gtdbtk/reroot_tree.py

    input_tree:       File containing the Newick tree to be rerooted.
    out_group_list:   Labels of the taxa in the outgroup.
    add_root_branch:  If True, the tree string is wrapped in an extra pair of
                      brackets with a root branch of length 0.02.
    tree_file_rooted: Name of the file for the rerooted tree.

    Raises ValueError if none of the outgroup labels is found in the tree,
    or if every leaf of the tree belongs to the outgroup.
    """

    tree = dendropy.Tree.get_from_path(input_tree, schema='newick', rooting='force-rooted', preserve_underscores=True)

    outgroup_in_tree = set()
    ingroup_leaves = set()
    for n in tree.leaf_node_iter():
        if n.taxon.label in out_group_list:
            outgroup_in_tree.add(n.taxon)
        else:
            ingroup_leaves.add(n)

    # fail early with a readable message instead of an error from deep inside the rooting code
    if not outgroup_in_tree:
        raise ValueError('None of the provided outgroup leaves was found in %s' % input_tree)
    if not ingroup_leaves:
        raise ValueError('All leaves in %s belong to the outgroup' % input_tree)

    # Since finding the MRCA is a rooted tree operation, the tree is first rerooted on an ingroup taxa. This
    # ensures the MRCA of the outgroup can be identified so long as the outgroup is monophyletic. If the
    # outgroup is polyphyletic trying to root on it is ill-defined. To try and pick a "good" root for
    # polyphyletic outgroups, random ingroup taxa are selected until two of them give the same size
    # lineage. This will, likely, be the smallest bipartition possible for the given outgroup though
    # this is not guaranteed.

    ingroup_leaf_list = list(ingroup_leaves)  # built once, reused for every random draw
    mrca = tree.mrca(taxa=outgroup_in_tree)
    mrca_leaves = len(mrca.leaf_nodes())
    while True:
        rnd_ingroup = random.sample(ingroup_leaf_list, 1)[0]
        tree.reroot_at_edge(rnd_ingroup.edge, length1=0.5 * rnd_ingroup.edge_length, length2=0.5 * rnd_ingroup.edge_length)
        mrca = tree.mrca(taxa=outgroup_in_tree)
        if len(mrca.leaf_nodes()) == mrca_leaves:
            break
        mrca_leaves = len(mrca.leaf_nodes())

    # place the root in the middle of the branch leading to the outgroup MRCA
    if mrca.edge_length is not None:
        tree.reroot_at_edge(mrca.edge, length1=0.5 * mrca.edge_length, length2=0.5 * mrca.edge_length)

    tree_out_string = tree.as_string(schema='newick', suppress_rooting=True, unquoted_underscores=True)
    tree_out_string = tree_out_string.replace("'", "")

    # add the root bar
    if add_root_branch is True:
        tree_out_string = '(' + tree_out_string
        tree_out_string = tree_out_string.replace(');', '):0.02);')

    # write out tree string
    with open(tree_file_rooted, 'w') as tree_file_rooted_handle:
        tree_file_rooted_handle.write(tree_out_string)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def RootTree(args):
    """Root a tree with the outgroup leaves listed in a text file.

    args: dict with the keys
        'i'        - input tree file
        'og'       - text file with one outgroup leaf name per line
        'o'        - output file for the rooted tree
        'add_root' - passed on to root_with_outgroup as add_root_branch
    """
    tree_file = args['i']
    out_group_txt = args['og']
    tree_file_rooted = args['o']
    add_root_branch = args['add_root']

    # one leaf name per line; blank lines are skipped so they cannot become an empty label
    with open(out_group_txt) as out_group_handle:
        stripped_lines = (each_og.strip() for each_og in out_group_handle)
        out_group_set = {each_og for each_og in stripped_lines if each_og}

    root_with_outgroup(tree_file, out_group_set, add_root_branch, tree_file_rooted)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
if __name__ == '__main__':

    # command-line options, listed in the order they are registered
    cli_options = (
        ('-i', dict(required=True, help='input tree')),
        ('-og', dict(required=True, help='out group leaves')),
        ('-o', dict(required=True, help='output tree')),
        ('-add_root', dict(required=False, action='store_true', help='add the root branch')),
        ('-fmt', dict(required=False, default=1, type=int, help='tree format, default: 1')),
    )

    RootTree_parser = argparse.ArgumentParser()
    for option_flag, option_settings in cli_options:
        RootTree_parser.add_argument(option_flag, **option_settings)

    # the parsed namespace is handed over as a plain dict
    args = vars(RootTree_parser.parse_args())
    RootTree(args)
|
|
@@ -0,0 +1,288 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import random
|
|
3
|
+
import dendropy
|
|
4
|
+
import argparse
|
|
5
|
+
from ete3 import Tree
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
RootTreeGTDB214_usage = '''
|
|
9
|
+
========================================= RootTreeGTDB214 example command =========================================
|
|
10
|
+
|
|
11
|
+
TreeSAK RootTreeGTDB214 -tree ar53.unrooted.tree -tax ar53.summary.tsv -db db_dir -d ar -o ar53.rooted.tree
|
|
12
|
+
TreeSAK RootTreeGTDB214 -tree bac120.unrooted.tree -tax bac120.summary.tsv -db db_dir -d bac -o bac120.rooted.tree
|
|
13
|
+
|
|
14
|
+
# prepare GTDB database files
|
|
15
|
+
cd db_dir
|
|
16
|
+
wget https://data.ace.uq.edu.au/public/gtdb/data/releases/release214/214.1/ar53_r214.tree.tar.gz
|
|
17
|
+
wget https://data.ace.uq.edu.au/public/gtdb/data/releases/release214/214.1/bac120_r214.tree.tar.gz
|
|
18
|
+
wget https://data.ace.uq.edu.au/public/gtdb/data/releases/release214/214.1/ar53_metadata_r214.tsv.gz
|
|
19
|
+
wget https://data.ace.uq.edu.au/public/gtdb/data/releases/release214/214.1/bac120_metadata_r214.tsv.gz
|
|
20
|
+
tar -xzvf ar53_r214.tree.tar.gz
|
|
21
|
+
tar -xzvf bac120_r214.tree.tar.gz
|
|
22
|
+
gunzip ar53_metadata_r214.tsv.gz
|
|
23
|
+
gunzip bac120_metadata_r214.tsv.gz
|
|
24
|
+
|
|
25
|
+
================================================================================================================
|
|
26
|
+
'''
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def get_smallest_outgroup(tree_object):
    """Return the leaf names of the smallest clade directly below the root.

    tree_object: a node exposing `children`, each child providing
        `get_leaf_names()`.

    Returns the leaf name list of the child with the fewest leaves. When
    several children tie, the last of them is returned. An empty list is
    returned if tree_object has no children.
    """
    out_group_leaf_list = []
    min_outgroup_leaf_num = None  # no size limit, unlike a fixed numeric sentinel
    for each_root_child in tree_object.children:
        leaf_list = each_root_child.get_leaf_names()
        # "<=" so that the last child wins a tie
        if min_outgroup_leaf_num is None or len(leaf_list) <= min_outgroup_leaf_num:
            min_outgroup_leaf_num = len(leaf_list)
            out_group_leaf_list = leaf_list

    return out_group_leaf_list
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def sep_taxon_str(taxon_string):
    """Pull five fields out of a ';'-separated taxon string.

    The string is stripped and split on ';'; the fields at positions 1 to 5
    are returned as a tuple (named p, c, o, f and g in the original code).
    Raises IndexError if the string has fewer than six fields.
    """
    fields = taxon_string.strip().split(';')
    return tuple(fields[position] for position in range(1, 6))
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def subset_and_rename_tree(tree_file_in, to_keep_leaf_list, rename_dict):
    """Load a tree, keep only the requested leaves and rename them.

    tree_file_in: tree file, read with quoted_node_names=True and format=1.
    to_keep_leaf_list: names of the leaves to keep.
    rename_dict: old leaf name -> new leaf name; leaves without an entry
        keep their name.

    Returns the pruned and renamed ete3 Tree.
    """
    # the tree is loaded here and not shared with the caller, so it can be
    # pruned in place without taking a copy first
    subset_tree = Tree(tree_file_in, quoted_node_names=True, format=1)

    # subset tree
    subset_tree.prune(to_keep_leaf_list, preserve_branch_length=True)

    # rename leaf
    for each_leaf in subset_tree:
        each_leaf.name = rename_dict.get(each_leaf.name, each_leaf.name)

    return subset_tree
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def root_with_outgroup(input_tree, out_group_list, tree_file_rooted):

    """
    Reroot the tree using the given outgroup.
    modified based on: https://github.com/Ecogenomics/GTDBTk/blob/master/gtdbtk/reroot_tree.py

    input_tree:       File containing the Newick tree to be rerooted.
    out_group_list:   Labels of the taxa in the outgroup.
    tree_file_rooted: Name of the file for the rerooted tree.

    Raises ValueError if none of the outgroup labels is found in the tree,
    or if every leaf of the tree belongs to the outgroup.
    """

    tree = dendropy.Tree.get_from_path(input_tree, schema='newick', rooting='force-rooted', preserve_underscores=True)

    outgroup_in_tree = set()
    ingroup_leaves = set()
    for n in tree.leaf_node_iter():
        if n.taxon.label in out_group_list:
            outgroup_in_tree.add(n.taxon)
        else:
            ingroup_leaves.add(n)

    # fail early with a readable message instead of an error from deep inside the rooting code
    if not outgroup_in_tree:
        raise ValueError('None of the provided outgroup leaves was found in %s' % input_tree)
    if not ingroup_leaves:
        raise ValueError('All leaves in %s belong to the outgroup' % input_tree)

    # Since finding the MRCA is a rooted tree operation, the tree is first rerooted on an ingroup taxa. This
    # ensures the MRCA of the outgroup can be identified so long as the outgroup is monophyletic. If the
    # outgroup is polyphyletic trying to root on it is ill-defined. To try and pick a "good" root for
    # polyphyletic outgroups, random ingroup taxa are selected until two of them give the same size
    # lineage. This will, likely, be the smallest bipartition possible for the given outgroup though
    # this is not guaranteed.

    ingroup_leaf_list = list(ingroup_leaves)  # built once, reused for every random draw
    mrca = tree.mrca(taxa=outgroup_in_tree)
    mrca_leaves = len(mrca.leaf_nodes())
    while True:
        rnd_ingroup = random.sample(ingroup_leaf_list, 1)[0]
        tree.reroot_at_edge(rnd_ingroup.edge, length1=0.5 * rnd_ingroup.edge_length, length2=0.5 * rnd_ingroup.edge_length)
        mrca = tree.mrca(taxa=outgroup_in_tree)
        if len(mrca.leaf_nodes()) == mrca_leaves:
            break

        mrca_leaves = len(mrca.leaf_nodes())

    # place the root in the middle of the branch leading to the outgroup MRCA
    if mrca.edge_length is not None:
        tree.reroot_at_edge(mrca.edge, length1=0.5 * mrca.edge_length, length2=0.5 * mrca.edge_length)

    # always write the result, so the caller gets an output file even when
    # the MRCA has no edge length to split
    tree.write_to_path(tree_file_rooted, schema='newick', suppress_rooting=True, unquoted_underscores=True)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def RootTreeGTDB214(args):
    """Root a user tree, using the GTDB r214 reference tree to choose the outgroup.

    args: dict with the keys
        'tree' - input unrooted tree
        'tax'  - tab-separated file: genome id in column 1, ';'-separated
                 taxon string in column 2 (a line starting with
                 'user_genome<TAB>' is treated as the header)
        'db'   - folder holding the r214 reference tree and metadata files
        'd'    - domain, either 'ar' or 'bac'
        'o'    - output file for the rooted tree

    Steps:
        1. group the user genomes of the chosen domain by taxon at the ranks
           p, c, o, f and g, and pick the first of these ranks that has more
           than one taxon (the rooting rank);
        2. take one reference genome per user taxon at that rank, prune the
           reference tree down to them and rename the leaves to the taxa;
        3. take the smallest clade below the root of that pruned tree, split
           the user genomes into those inside and outside its taxa, and use
           the smaller of the two groups as outgroup to root the user tree.

    Raises SystemExit (status 1) if the domain is not 'ar' or 'bac', or if
    no rank has more than one taxon.
    """
    input_unrooted_tree = args['tree']
    user_gnm_taxon = args['tax']
    db_dir = args['db']
    gnm_domain = args['d']
    rooted_tree = args['o']

    # reference files and the domain label expected in the taxon strings
    if gnm_domain == 'bac':
        gtdb_ref_tree = '%s/bac120_r214.tree' % db_dir
        gtdb_gnm_metadata = '%s/bac120_metadata_r214.tsv' % db_dir
        domain_label = 'd__Bacteria'
    elif gnm_domain == 'ar':
        gtdb_ref_tree = '%s/ar53_r214.tree' % db_dir
        gtdb_gnm_metadata = '%s/ar53_metadata_r214.tsv' % db_dir
        domain_label = 'd__Archaea'
    else:
        print('please provide either "ar" or "bac" to -d')
        raise SystemExit(1)

    ref_tree_gnm_set = set(Tree(gtdb_ref_tree, quoted_node_names=True, format=1).get_leaf_names())

    # read in user_gnm_taxon: rank -> {taxon: set of genome ids}
    # rank_order matches the order of the tuple returned by sep_taxon_str
    rank_order = ('p', 'c', 'o', 'f', 'g')
    user_gnm_taxon_dicts = {each_rank: dict() for each_rank in rank_order}
    with open(user_gnm_taxon) as user_gnm_taxon_handle:
        for each_gnm in user_gnm_taxon_handle:
            if each_gnm.startswith('user_genome\t'):
                continue
            each_gnm_split = each_gnm.strip().split('\t')
            gnm_id = each_gnm_split[0]
            gnm_taxon = each_gnm_split[1]

            # only genomes from the requested domain are counted
            if domain_label not in gnm_taxon:
                continue

            for each_rank, each_taxon in zip(rank_order, sep_taxon_str(gnm_taxon)):
                user_gnm_taxon_dicts[each_rank].setdefault(each_taxon, set()).add(gnm_id)

    # determine rooting rank, start from phylum
    rooting_rank = ''
    for each_rank in rank_order:
        if len(user_gnm_taxon_dicts[each_rank]) > 1:
            rooting_rank = each_rank
            break

    if rooting_rank == '':
        print('All user genomes are from the same genus, program exited!')
        raise SystemExit(1)

    rooting_rank_taxon_dict = user_gnm_taxon_dicts[rooting_rank]
    rooting_rank_index = rank_order.index(rooting_rank)

    # pick the first reference genome found for each user taxon at the rooting rank
    col_index = {}
    candidate_gnms_rooting_rank = dict()
    counted_taxons_rooting_rank = set()
    with open(gtdb_gnm_metadata) as gtdb_gnm_metadata_handle:
        for each_ref in gtdb_gnm_metadata_handle:
            each_ref_split = each_ref.strip().split('\t')

            # header line of the tab-separated metadata table
            if each_ref_split[0] == 'accession':
                col_index = {key: i for i, key in enumerate(each_ref_split)}
                continue

            ref_accession = each_ref_split[0]
            gtdb_taxonomy = each_ref_split[col_index['gtdb_taxonomy']]
            if ref_accession not in ref_tree_gnm_set:
                continue

            gnm_rooting_rank = sep_taxon_str(gtdb_taxonomy)[rooting_rank_index]
            if gnm_rooting_rank in rooting_rank_taxon_dict and gnm_rooting_rank not in counted_taxons_rooting_rank:
                counted_taxons_rooting_rank.add(gnm_rooting_rank)
                candidate_gnms_rooting_rank[ref_accession] = gnm_rooting_rank

    # the same dict selects the leaves to keep (its keys) and renames them (accession -> taxon)
    ref_tree_rooting_rank = subset_and_rename_tree(gtdb_ref_tree, candidate_gnms_rooting_rank, candidate_gnms_rooting_rank)

    # get the smallest out group taxon set
    smallest_outgroup_taxon_set = set(get_smallest_outgroup(ref_tree_rooting_rank))

    # split the user genomes by whether their taxon is in the smallest reference clade
    out_group_gnm_set_1 = set()
    out_group_gnm_set_2 = set()
    for each_rooting_rank_taxon, gnm_member_set in rooting_rank_taxon_dict.items():
        if each_rooting_rank_taxon in smallest_outgroup_taxon_set:
            out_group_gnm_set_1.update(gnm_member_set)
        else:
            out_group_gnm_set_2.update(gnm_member_set)

    # select the smaller one as outgroup
    if len(out_group_gnm_set_1) < len(out_group_gnm_set_2):
        out_group_gnm_set = out_group_gnm_set_1
    else:
        out_group_gnm_set = out_group_gnm_set_2

    # root user tree with identified out group genomes
    root_with_outgroup(input_unrooted_tree, out_group_gnm_set, rooted_tree)
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
if __name__ == '__main__':

    # command-line entry point; the parsed options are passed on as a plain dict
    RootTreeGTDB214_parser = argparse.ArgumentParser(usage=RootTreeGTDB214_usage)
    RootTreeGTDB214_parser.add_argument('-tree', required=True, help='input unrooted tree')
    RootTreeGTDB214_parser.add_argument('-tax', required=False, default='fna', help='leaf taxon')
    RootTreeGTDB214_parser.add_argument('-db', required=True, help='GTDB database files')
    RootTreeGTDB214_parser.add_argument('-d', required=False, default=None, help='domain, either ar or bac')
    # -o is used as the path of the rooted tree file, not as a folder
    RootTreeGTDB214_parser.add_argument('-o', required=True, help='output rooted tree')
    args = vars(RootTreeGTDB214_parser.parse_args())
    RootTreeGTDB214(args)
|
|
288
|
+
|