treesak 1.51.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of treesak might be problematic. Click here for more details.

Files changed (125) hide show
  1. TreeSAK/ALE.py +63 -0
  2. TreeSAK/ALE1.py +268 -0
  3. TreeSAK/ALE2.py +168 -0
  4. TreeSAK/ALE2RTC.py +30 -0
  5. TreeSAK/ALE3.py +205 -0
  6. TreeSAK/ALE4.py +636 -0
  7. TreeSAK/ALE5.py +210 -0
  8. TreeSAK/ALE6.py +401 -0
  9. TreeSAK/ALE7.py +126 -0
  10. TreeSAK/ALE_backup.py +1081 -0
  11. TreeSAK/AssessCVG.py +128 -0
  12. TreeSAK/AssessMarker.py +306 -0
  13. TreeSAK/AssessMarkerDeltaLL.py +257 -0
  14. TreeSAK/AssessMarkerPA.py +317 -0
  15. TreeSAK/AssessPB.py +130 -0
  16. TreeSAK/BMGE.jar +0 -0
  17. TreeSAK/BMGE.py +49 -0
  18. TreeSAK/CompareMCMC.py +138 -0
  19. TreeSAK/ConcateMSA.py +111 -0
  20. TreeSAK/ConvertMSA.py +135 -0
  21. TreeSAK/Dir.rb +82 -0
  22. TreeSAK/ExtractMarkerSeq.py +263 -0
  23. TreeSAK/FastRoot.py +1175 -0
  24. TreeSAK/FastRoot_backup.py +1122 -0
  25. TreeSAK/FigTree.py +34 -0
  26. TreeSAK/GTDB_tree.py +76 -0
  27. TreeSAK/GeneTree.py +142 -0
  28. TreeSAK/KEGG_Luo17.py +807 -0
  29. TreeSAK/LcaToLeaves.py +66 -0
  30. TreeSAK/MarkerRef2Tree.py +616 -0
  31. TreeSAK/MarkerRef2Tree_backup.py +628 -0
  32. TreeSAK/MarkerSeq2Tree.py +290 -0
  33. TreeSAK/MarkerSeq2Tree_backup.py +259 -0
  34. TreeSAK/ModifyTopo.py +116 -0
  35. TreeSAK/Newick_tree_plotter.py +79 -0
  36. TreeSAK/OMA.py +170 -0
  37. TreeSAK/OMA2.py +212 -0
  38. TreeSAK/OneLineAln.py +50 -0
  39. TreeSAK/PB.py +155 -0
  40. TreeSAK/PMSF.py +106 -0
  41. TreeSAK/PhyloBiAssoc.R +84 -0
  42. TreeSAK/PhyloBiAssoc.py +167 -0
  43. TreeSAK/PlotMCMC.py +41 -0
  44. TreeSAK/PlotMcmcNode.py +152 -0
  45. TreeSAK/PlotMcmcNode_old.py +252 -0
  46. TreeSAK/RootTree.py +101 -0
  47. TreeSAK/RootTreeGTDB214.py +288 -0
  48. TreeSAK/RootTreeGTDB220.py +300 -0
  49. TreeSAK/RootTreeGTDB226.py +300 -0
  50. TreeSAK/SequentialDating.py +16 -0
  51. TreeSAK/SingleAleHGT.py +157 -0
  52. TreeSAK/SingleLinePhy.py +50 -0
  53. TreeSAK/SliceMSA.py +142 -0
  54. TreeSAK/SplitScore.py +19 -0
  55. TreeSAK/SplitScore1.py +178 -0
  56. TreeSAK/SplitScore1OMA.py +148 -0
  57. TreeSAK/SplitScore2.py +597 -0
  58. TreeSAK/TaxaCountStats.R +256 -0
  59. TreeSAK/TaxonTree.py +47 -0
  60. TreeSAK/TreeSAK_config.py +32 -0
  61. TreeSAK/VERSION +158 -0
  62. TreeSAK/VisHPD95.R +45 -0
  63. TreeSAK/VisHPD95.py +200 -0
  64. TreeSAK/__init__.py +0 -0
  65. TreeSAK/ale_parser.py +74 -0
  66. TreeSAK/ale_splitter.py +63 -0
  67. TreeSAK/alignment_pruner.pl +1471 -0
  68. TreeSAK/assessOG.py +45 -0
  69. TreeSAK/catfasta2phy.py +140 -0
  70. TreeSAK/cogTree.py +185 -0
  71. TreeSAK/compare_trees.R +30 -0
  72. TreeSAK/compare_trees.py +255 -0
  73. TreeSAK/dating.py +264 -0
  74. TreeSAK/dating_ss.py +361 -0
  75. TreeSAK/deltall.py +82 -0
  76. TreeSAK/do_rrtc.rb +464 -0
  77. TreeSAK/fa2phy.py +42 -0
  78. TreeSAK/format_leaf_name.py +70 -0
  79. TreeSAK/gap_stats.py +38 -0
  80. TreeSAK/get_SCG_tree.py +742 -0
  81. TreeSAK/get_arCOG_seq.py +97 -0
  82. TreeSAK/global_functions.py +222 -0
  83. TreeSAK/gnm_leaves.py +43 -0
  84. TreeSAK/iTOL.py +791 -0
  85. TreeSAK/iTOL_gene_tree.py +80 -0
  86. TreeSAK/itol_msa_stats.py +56 -0
  87. TreeSAK/keep_highest_rrtc.py +37 -0
  88. TreeSAK/koTree.py +194 -0
  89. TreeSAK/label_tree.R +75 -0
  90. TreeSAK/label_tree.py +121 -0
  91. TreeSAK/mad.py +708 -0
  92. TreeSAK/mcmc2tree.py +58 -0
  93. TreeSAK/mcmcTC copy.py +92 -0
  94. TreeSAK/mcmcTC.py +104 -0
  95. TreeSAK/mcmctree_vs_reltime.R +44 -0
  96. TreeSAK/mcmctree_vs_reltime.py +252 -0
  97. TreeSAK/merge_pdf.py +32 -0
  98. TreeSAK/pRTC.py +56 -0
  99. TreeSAK/parse_mcmctree.py +198 -0
  100. TreeSAK/parse_reltime.py +141 -0
  101. TreeSAK/phy2fa.py +37 -0
  102. TreeSAK/plot_distruibution_th.py +165 -0
  103. TreeSAK/prep_mcmctree_ctl.py +92 -0
  104. TreeSAK/print_leaves.py +32 -0
  105. TreeSAK/pruneMSA.py +63 -0
  106. TreeSAK/recode.py +73 -0
  107. TreeSAK/remove_bias.R +112 -0
  108. TreeSAK/rename_leaves.py +77 -0
  109. TreeSAK/replace_clade.py +55 -0
  110. TreeSAK/root_with_out_group.py +84 -0
  111. TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
  112. TreeSAK/subsample_drep_gnms.py +74 -0
  113. TreeSAK/subset.py +69 -0
  114. TreeSAK/subset_tree_stupid_old_way.py +193 -0
  115. TreeSAK/supertree.py +330 -0
  116. TreeSAK/tmp_1.py +19 -0
  117. TreeSAK/tmp_2.py +19 -0
  118. TreeSAK/tmp_3.py +120 -0
  119. TreeSAK/weighted_rand.rb +23 -0
  120. treesak-1.51.2.data/scripts/TreeSAK +950 -0
  121. treesak-1.51.2.dist-info/LICENSE +674 -0
  122. treesak-1.51.2.dist-info/METADATA +27 -0
  123. treesak-1.51.2.dist-info/RECORD +125 -0
  124. treesak-1.51.2.dist-info/WHEEL +5 -0
  125. treesak-1.51.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,252 @@
1
+ import os
2
+ import glob
3
+ import argparse
4
+ import pandas as pd
5
+ from ete3 import Tree
6
+ import plotly.express as px
7
+
8
+
9
# Example commands and input-file formats for the PlotMcmcNode sub-command.
# NOTE(review): this string is not referenced in the visible part of this module;
# presumably it is displayed by the TreeSAK command-line wrapper - confirm.
# NOTE(review): the "-n" file example below shows three columns, whereas
# get_internal_node_to_plot() reads the node from column 1 and the display name
# from column 2 - confirm which layout is intended.
PlotMcmcNode_usage = '''
============================ PlotMcmcNode example commands ============================

TreeSAK PlotMcmcNode -i McmcTree_op_files -n n179 -o Clock2_n179.pdf
TreeSAK PlotMcmcNode -i McmcTree_op_files -n n161,n186 -o Clock3_n161_n186.pdf
TreeSAK PlotMcmcNode -i McmcTree_op_files -n nodes.txt -o multi_runs_multi_nodes.pdf

# File name of the mcmc.txt and the corresponding mcmc out file need to follow
# the rule as specified below:
[setting_1]_mcmc.txt
[setting_1]_out.txt
[setting_2]_mcmc.txt
[setting_2]_out.txt

# file format (-n, tab separated)
# leave the 2nd column blank for nodes without renaming
setting_1 node1 Bacteria
setting_2 node2
setting_3 node3,node9 Archaea

# Y-axis label file format (-l, tab separated)
PA_75_DeltaLL_50_clock3_mcmc.txt DeltaLL_50
PA_75_DeltaLL_75_clock3_mcmc.txt DeltaLL_75

=======================================================================================
'''
35
+
36
def sep_path_basename_ext(file_in):
    """
    Split a file path into its directory, base name and extension.

    file_in: path to split.

    Returns (directory, base name, extension). The directory is '.' when
    file_in has no directory component; the extension is whatever
    os.path.splitext reports (leading dot included, '' if absent).
    """

    directory, name_with_ext = os.path.split(file_in)
    stem, extension = os.path.splitext(name_with_ext)

    # a bare file name lives in the current directory
    return (directory if directory != '' else '.'), stem, extension
47
+
48
+
49
def mcmctree_out_to_tree_str(mamctree_out):
    """
    Extract the FigTree species tree from an MCMCTree out file and rename its nodes.

    mamctree_out: path to the MCMCTree *out.txt file.

    The tree is taken from the line that directly follows the
    "Species tree for FigTree" header line (the last such header wins if
    there are several). Leaves are renamed by dropping everything up to and
    including the first underscore; internal nodes are renamed to
    't_n' + original name.

    Returns the renamed tree as a Newick string (ete3 format 8).
    """

    # The header is matched on its leading phrase only, so the match does not
    # depend on the exact spacing of the rest of the header line.
    header_phrase = 'Species tree for FigTree'

    tree_str = ''
    take_next_line = False
    with open(mamctree_out) as out_handle:
        for each_line in out_handle:
            if header_phrase in each_line:
                take_next_line = True
            elif take_next_line:
                tree_str = each_line.strip()
                take_next_line = False

    tree_str_no_space = tree_str.replace(' ', '')

    # rename tree nodes
    t = Tree(tree_str_no_space, format=1)
    for each_node in t.traverse():
        if each_node.is_leaf():
            each_node.name = '_'.join(each_node.name.split('_')[1:])
        else:
            each_node.name = 't_n%s' % each_node.name

    return t.write(format=8)
76
+
77
+
78
def plot_distribution(df_txt, output_plot):
    """
    Draw one horizontal half-violin per setting and write the figure to a file.

    df_txt:      comma-separated table with the columns Value, Node and Setting.
    output_plot: file the figure is written to (via plotly's write_image).
    """

    data = pd.read_table(df_txt, sep=',')
    setting_count = len(data['Setting'].unique())
    node_count = len(data['Node'].unique())

    # rows are ordered by setting (descending) before plotting
    data = data.sort_values(by='Setting', ascending=False)

    # 100 px per setting, but never lower than 360 px
    figure_height = max(setting_count * 100, 360)

    fig = px.violin(data, x="Value", y="Setting", color="Node", points=False, orientation="h",
                    width=900, height=figure_height)

    # a single node gets a filled violin, several nodes get transparent ones
    if node_count == 1:
        trace_style = dict(side="positive", fillcolor='lightblue', width=1.6, opacity=0.75)
    else:
        trace_style = dict(side="positive", fillcolor='rgba(0,0,0,0)', width=1.6)
    fig.update_traces(**trace_style)

    fig.update_traces(showlegend=True)
    fig.layout.template = "simple_white"
    fig.write_image(output_plot)
105
+
106
+
107
def get_internal_node_to_plot(node_txt, mo_file):
    """
    Work out which nodes to plot and how to label them.

    node_txt: either a comma-separated string of node ids, or the path to a
              tab-separated file. In the file, column 1 is a node id or a
              comma-separated list of leaves (whose common ancestor is then
              used), and an optional column 2 is the name to show in the plot.
    mo_file:  MCMCTree *out.txt file; its tree is needed to resolve leaf lists.

    Returns (nodes, node_rename_dict, tree_str). nodes is a set when node_txt
    is a file and a list when it is a comma-separated string; tree_str is ''
    when mo_file does not exist.

    Prints a message and exits if a leaf list is given but mo_file is missing.
    """

    tree_str = ''
    if os.path.isfile(mo_file):
        tree_str = mcmctree_out_to_tree_str(mo_file)

    node_rename_dict = dict()

    # node ids given directly on the command line
    if os.path.isfile(node_txt) is not True:
        return node_txt.split(','), node_rename_dict, tree_str

    node_set = set()
    parsed_tree = None  # parsed lazily and only once, not once per line
    with open(node_txt) as node_handle:
        for each in node_handle:
            each_split = each.strip().split('\t')
            node_str = each_split[0]

            # get internal_node_to_plot
            if ',' not in node_str:
                internal_node_to_plot = node_str
            else:
                if tree_str == '':
                    print('*out.txt file not found, program exited!')
                    exit()
                if parsed_tree is None:
                    parsed_tree = Tree(tree_str, format=1)
                internal_node_to_plot = parsed_tree.get_common_ancestor(node_str.split(',')).name

            # blank lines yield an empty id and are ignored
            if internal_node_to_plot != '':
                node_set.add(internal_node_to_plot)

            # read in name to show in plot
            if len(each_split) == 2 and each_split[1] != '':
                node_rename_dict[internal_node_to_plot] = each_split[1]

    return node_set, node_rename_dict, tree_str
145
+
146
+
147
def PlotMcmcNode(args):
    """
    Plot the posterior samples of selected nodes from one or more MCMCTree runs.

    args is a dict with the keys:
    i   : a *_mcmc.txt file, or a folder that is searched for *_mcmc.txt files
    n   : nodes to plot (file or comma-separated ids, see get_internal_node_to_plot)
    o   : output plot file
    of  : one *out.txt file to use for every run; if None each run uses the
          file obtained by replacing '_mcmc.txt' with '_out.txt'
    l   : optional tab-separated file (mcmc file name, y-axis label)
    tmp : if False, the intermediate tree/data/label files are deleted at the end

    Three intermediate files named after the output file's base name
    (*_tree.txt, *_data.txt, *_label.txt) are written to the working directory.
    Prints a message and exits when inputs are missing or no node matches.
    """

    mcmc_in = args['i']
    node_txt = args['n']
    output_plot = args['o']
    specified_out_file = args['of']
    y_label_txt = args['l']
    keep_tmp_file = args['tmp']

    # check MCMCTree output file/dir
    if os.path.isfile(mcmc_in) is True:
        mcmc_file_list = [mcmc_in]
    else:
        # sorted so that the intermediate files are written in a stable order
        mcmc_file_list = sorted(glob.glob('%s/*_mcmc.txt' % (mcmc_in)))

    if len(mcmc_file_list) == 0:
        print('*mcmc.txt file not found, program exited!')
        exit()

    if specified_out_file is None:
        missed_out_file_list = []
        for each_mcmc_file in mcmc_file_list:
            pwd_out_file = each_mcmc_file.replace('_mcmc.txt', '_out.txt')
            if os.path.isfile(pwd_out_file) is False:
                missed_out_file_list.append(pwd_out_file)
        if len(missed_out_file_list) > 0:
            print('The following *out.txt files are missing, program exited!')
            print('\n'.join(sorted(missed_out_file_list)))
            exit()

    # read in y-axis label file
    y_label_dict = dict()
    if y_label_txt is not None:
        with open(y_label_txt) as y_label_handle:
            for each_sample in y_label_handle:
                each_sample_split = each_sample.strip().split('\t')
                if len(each_sample_split) == 2:
                    y_label_dict[each_sample_split[0]] = each_sample_split[1]
                else:
                    print('Format error: %s' % y_label_txt)
                    exit()

    _, f_base, _ = sep_path_basename_ext(output_plot)
    found_matched_node = False
    op_tree_tmp = '%s_tree.txt' % f_base
    op_df_tmp = '%s_data.txt' % f_base
    op_label_tmp = '%s_label.txt' % f_base

    # the context manager closes all three files even if a run fails half-way
    with open(op_label_tmp, 'w') as op_label_tmp_handle, \
            open(op_tree_tmp, 'w') as op_tree_tmp_handle, \
            open(op_df_tmp, 'w') as op_df_tmp_handle:

        op_df_tmp_handle.write('Value,Node,Setting\n')
        for mcmc_file in mcmc_file_list:

            mcmc_file_no_path = os.path.basename(mcmc_file)

            if specified_out_file is None:
                pwd_current_run_mcmc_out = mcmc_file.replace('_mcmc.txt', '_out.txt')
            else:
                pwd_current_run_mcmc_out = specified_out_file

            node_set, node_rename_dict, tree_str = get_internal_node_to_plot(node_txt, pwd_current_run_mcmc_out)
            op_tree_tmp_handle.write('%s\t%s\n' % (mcmc_file_no_path.replace('_mcmc.txt', ''), tree_str))

            # fall back to the file name when no y-axis label was provided
            label_to_write = y_label_dict.get(mcmc_file_no_path, mcmc_file_no_path)

            mcmc_df = pd.read_table(mcmc_file, index_col=0)
            for each_col in mcmc_df:
                if each_col not in node_set:
                    continue

                node_name_to_write = node_rename_dict.get(each_col, each_col)
                found_matched_node = True
                for each_value in mcmc_df[each_col].values:
                    op_df_tmp_handle.write('%s,%s,%s\n' % (each_value, node_name_to_write, label_to_write))

                op_label_tmp_handle.write('%s\t%s\t%s\n' % (label_to_write, each_col, node_name_to_write))

    if found_matched_node is False:
        print('Provided node(s) not found, program exited!')
        exit()

    # plot distribution
    plot_distribution(op_df_tmp, output_plot)

    # remove tmp files (os.remove instead of a shell "rm", so unusual
    # characters in the output name cannot be misread by a shell)
    if keep_tmp_file is False:
        for each_tmp_file in (op_tree_tmp, op_df_tmp, op_label_tmp):
            os.remove(each_tmp_file)

    print('Plot exported to %s, done!' % output_plot)
240
+
241
+
242
if __name__ == '__main__':

    # Stand-alone entry point: parse the command line and pass the options
    # to PlotMcmcNode as a plain dict.
    cli = argparse.ArgumentParser()
    cli.add_argument('-i', required=True, help='folder holds the *mcmc.txt and *out.txt files')
    cli.add_argument('-of', required=False, default=None, help='the *out.txt file')
    cli.add_argument('-n', required=True, help='Nodes to plot')
    cli.add_argument('-l', required=False, default=None, help='labels on y axis')
    cli.add_argument('-o', required=True, help='Output plot')
    cli.add_argument('-tmp', required=False, action="store_true", help='keep tmp files')
    PlotMcmcNode(vars(cli.parse_args()))
TreeSAK/RootTree.py ADDED
@@ -0,0 +1,101 @@
1
+ import random
2
+ import dendropy
3
+ import argparse
4
+ from ete3 import Tree
5
+
6
+
7
# Example command for the RootTree sub-command.
# NOTE(review): not referenced in the visible part of this module; presumably
# displayed by the TreeSAK command-line wrapper - confirm.
RootTree_usage = '''
====================== RootTree example commands ======================

TreeSAK RootTree -i input.tree -og outgroup_genomes.txt -o rooted.tree

=======================================================================
'''
14
+
15
+
16
def root_with_outgroup(input_tree, out_group_list, add_root_branch, tree_file_rooted):

    """
    Reroot the tree using the given outgroup and write it to a file.
    modified based on: https://github.com/Ecogenomics/GTDBTk/blob/master/gtdbtk/reroot_tree.py

    input_tree:       File containing the Newick tree to be rerooted.
    out_group_list:   Labels of the taxa in the outgroup.
    add_root_branch:  If True, the tree is wrapped in an extra pair of
                      parentheses carrying a root branch of length 0.02.
    tree_file_rooted: Name of the file for the rerooted tree.

    Raises ValueError if none of the outgroup labels is a leaf of the tree,
    or if every leaf belongs to the outgroup.
    """

    tree = dendropy.Tree.get_from_path(input_tree, schema='newick', rooting='force-rooted', preserve_underscores=True)

    outgroup_in_tree = set()
    ingroup_leaves = []  # a list, built once, so it can be sampled from directly
    for n in tree.leaf_node_iter():
        if n.taxon.label in out_group_list:
            outgroup_in_tree.add(n.taxon)
        else:
            ingroup_leaves.append(n)

    # fail early with a clear message instead of an obscure error further down
    if not outgroup_in_tree:
        raise ValueError('None of the outgroup leaves was found in %s' % input_tree)
    if not ingroup_leaves:
        raise ValueError('All leaves in %s belong to the outgroup' % input_tree)

    # Since finding the MRCA is a rooted tree operation, the tree is first rerooted on an ingroup taxon. This
    # ensures the MRCA of the outgroup can be identified so long as the outgroup is monophyletic. If the
    # outgroup is polyphyletic, trying to root on it is ill-defined. To try and pick a "good" root for
    # polyphyletic outgroups, random ingroup taxa are selected until two of them give the same size
    # lineage. This will, likely, be the smallest bipartition possible for the given outgroup though
    # this is not guaranteed.

    mrca = tree.mrca(taxa=outgroup_in_tree)
    mrca_leaves = len(mrca.leaf_nodes())
    while True:
        rnd_ingroup = random.choice(ingroup_leaves)
        tree.reroot_at_edge(rnd_ingroup.edge, length1=0.5 * rnd_ingroup.edge_length, length2=0.5 * rnd_ingroup.edge_length)
        mrca = tree.mrca(taxa=outgroup_in_tree)
        if len(mrca.leaf_nodes()) == mrca_leaves:
            break
        mrca_leaves = len(mrca.leaf_nodes())

    # the outgroup MRCA has no edge length when the tree is already rooted on it
    if mrca.edge_length is not None:
        tree.reroot_at_edge(mrca.edge, length1=0.5 * mrca.edge_length, length2=0.5 * mrca.edge_length)

    tree_out_string = tree.as_string(schema='newick', suppress_rooting=True, unquoted_underscores=True)
    tree_out_string = tree_out_string.replace("'", "")

    # add the root bar
    if add_root_branch is True:
        tree_out_string = '(' + tree_out_string
        tree_out_string = tree_out_string.replace(');', '):0.02);')

    # write out tree string
    with open(tree_file_rooted, 'w') as tree_file_rooted_handle:
        tree_file_rooted_handle.write(tree_out_string)
70
+
71
+
72
def RootTree(args):
    """
    Root a tree with the outgroup leaves listed in a text file.

    args is a dict with the keys:
    i        : input tree file
    og       : text file with one outgroup leaf name per line (blank lines are ignored)
    o        : output file for the rooted tree
    add_root : if True, a root branch is added to the written tree

    A 'fmt' entry may be present in args but is not used.
    """

    tree_file = args['i']
    out_group_txt = args['og']
    tree_file_rooted = args['o']
    add_root_branch = args['add_root']

    # blank lines are skipped so that '' never becomes an outgroup label
    with open(out_group_txt) as out_group_handle:
        out_group_set = {each_og.strip() for each_og in out_group_handle if each_og.strip()}

    root_with_outgroup(tree_file, out_group_set, add_root_branch, tree_file_rooted)
90
+
91
+
92
if __name__ == '__main__':

    # Stand-alone entry point: parse the command line and pass the options
    # to RootTree as a plain dict.
    cli = argparse.ArgumentParser()
    cli.add_argument('-i', required=True, help='input tree')
    cli.add_argument('-og', required=True, help='out group leaves')
    cli.add_argument('-o', required=True, help='output tree')
    cli.add_argument('-add_root', required=False, action='store_true', help='add the root branch')
    cli.add_argument('-fmt', required=False, default=1, type=int, help='tree format, default: 1')
    RootTree(vars(cli.parse_args()))
@@ -0,0 +1,288 @@
1
+ import os
2
+ import random
3
+ import dendropy
4
+ import argparse
5
+ from ete3 import Tree
6
+
7
+
8
# Usage text for the RootTreeGTDB214 command; passed to argparse as the
# parser's usage message in this module's __main__ block.
# The bac120 example uses "-d bac" (it previously showed "-d ar").
RootTreeGTDB214_usage = '''
========================================= RootTreeGTDB214 example command =========================================

TreeSAK RootTreeGTDB214 -tree ar53.unrooted.tree -tax ar53.summary.tsv -db db_dir -d ar -o ar53.rooted.tree
TreeSAK RootTreeGTDB214 -tree bac120.unrooted.tree -tax bac120.summary.tsv -db db_dir -d bac -o bac120.rooted.tree

# prepare GTDB database files
cd db_dir
wget https://data.ace.uq.edu.au/public/gtdb/data/releases/release214/214.1/ar53_r214.tree.tar.gz
wget https://data.ace.uq.edu.au/public/gtdb/data/releases/release214/214.1/bac120_r214.tree.tar.gz
wget https://data.ace.uq.edu.au/public/gtdb/data/releases/release214/214.1/ar53_metadata_r214.tsv.gz
wget https://data.ace.uq.edu.au/public/gtdb/data/releases/release214/214.1/bac120_metadata_r214.tsv.gz
tar -xzvf ar53_r214.tree.tar.gz
tar -xzvf bac120_r214.tree.tar.gz
gunzip ar53_metadata_r214.tsv.gz
gunzip bac120_metadata_r214.tsv.gz

================================================================================================================
'''
27
+
28
+
29
def get_smallest_outgroup(tree_object):
    """
    Return the leaf names of the root child that has the fewest leaves.

    tree_object: tree node exposing `children`, each child providing
                 get_leaf_names().

    If several children tie for the fewest leaves, the last of them is
    returned. Returns [] when tree_object has no children.
    """

    out_group_leaf_list = []
    min_outgroup_leaf_num = None  # no fixed upper bound on clade size
    for each_root_child in tree_object.children:
        leaf_list = each_root_child.get_leaf_names()
        # "<=" keeps the last child among ties
        if min_outgroup_leaf_num is None or len(leaf_list) <= min_outgroup_leaf_num:
            min_outgroup_leaf_num = len(leaf_list)
            out_group_leaf_list = leaf_list

    return out_group_leaf_list
44
+
45
+
46
def sep_taxon_str(taxon_string):
    """
    Pull the phylum, class, order, family and genus fields out of a taxon string.

    taxon_string: semicolon-separated lineage; fields 2 to 6 are returned.

    Returns the tuple (phylum, class, order, family, genus).
    Raises IndexError if the string has fewer than six fields.
    """

    ranks = taxon_string.strip().split(';')

    # field 0 is skipped; fields 1..5 are p, c, o, f, g
    return tuple(ranks[position] for position in (1, 2, 3, 4, 5))
56
+
57
+
58
def subset_and_rename_tree(tree_file_in, to_keep_leaf_list, rename_dict):
    """
    Load a tree, keep only the requested leaves and rename them.

    tree_file_in:      Newick tree file (read with quoted node names, format 1).
    to_keep_leaf_list: names of the leaves to keep.
    rename_dict:       old leaf name -> new leaf name; leaves without an entry
                       keep their name.

    Returns the pruned and renamed ete3 tree.
    """

    # The tree is parsed here and not shared with the caller, so it is pruned
    # in place; copying the whole tree first is unnecessary.
    subset_tree = Tree(tree_file_in, quoted_node_names=True, format=1)
    subset_tree.prune(to_keep_leaf_list, preserve_branch_length=True)

    # rename leaf
    for each_leaf in subset_tree:
        each_leaf.name = rename_dict.get(each_leaf.name, each_leaf.name)

    return subset_tree
72
+
73
+
74
def root_with_outgroup(input_tree, out_group_list, tree_file_rooted):

    """
    Reroot the tree using the given outgroup and write it to a file.
    modified based on: https://github.com/Ecogenomics/GTDBTk/blob/master/gtdbtk/reroot_tree.py

    input_tree:       File containing the Newick tree to be rerooted.
    out_group_list:   Labels of the taxa in the outgroup.
    tree_file_rooted: Name of the file for the rerooted tree.

    Raises ValueError if none of the outgroup labels is a leaf of the tree,
    or if every leaf belongs to the outgroup.
    """

    tree = dendropy.Tree.get_from_path(input_tree, schema='newick', rooting='force-rooted', preserve_underscores=True)

    outgroup_in_tree = set()
    ingroup_leaves = []  # a list, built once, so it can be sampled from directly
    for n in tree.leaf_node_iter():
        if n.taxon.label in out_group_list:
            outgroup_in_tree.add(n.taxon)
        else:
            ingroup_leaves.append(n)

    # fail early with a clear message instead of an obscure error further down
    if not outgroup_in_tree:
        raise ValueError('None of the outgroup leaves was found in %s' % input_tree)
    if not ingroup_leaves:
        raise ValueError('All leaves in %s belong to the outgroup' % input_tree)

    # Since finding the MRCA is a rooted tree operation, the tree is first rerooted on an ingroup taxon. This
    # ensures the MRCA of the outgroup can be identified so long as the outgroup is monophyletic. If the
    # outgroup is polyphyletic, trying to root on it is ill-defined. To try and pick a "good" root for
    # polyphyletic outgroups, random ingroup taxa are selected until two of them give the same size
    # lineage. This will, likely, be the smallest bipartition possible for the given outgroup though
    # this is not guaranteed.

    mrca = tree.mrca(taxa=outgroup_in_tree)
    mrca_leaves = len(mrca.leaf_nodes())
    while True:
        rnd_ingroup = random.choice(ingroup_leaves)
        tree.reroot_at_edge(rnd_ingroup.edge, length1=0.5 * rnd_ingroup.edge_length, length2=0.5 * rnd_ingroup.edge_length)
        mrca = tree.mrca(taxa=outgroup_in_tree)
        if len(mrca.leaf_nodes()) == mrca_leaves:
            break

        mrca_leaves = len(mrca.leaf_nodes())

    # the outgroup MRCA has no edge length when the tree is already rooted on it
    if mrca.edge_length is not None:
        tree.reroot_at_edge(mrca.edge, length1=0.5 * mrca.edge_length, length2=0.5 * mrca.edge_length)

    # the tree is written in either case, so the output file always exists afterwards
    tree.write_to_path(tree_file_rooted, schema='newick', suppress_rooting=True, unquoted_underscores=True)
116
+
117
+
118
def RootTreeGTDB214(args):
    """
    Root a user tree by borrowing the root position from the GTDB r214 reference tree.

    args is a dict with the keys:
    tree : unrooted input tree
    tax  : tab-separated file, column 1 = genome id, column 2 = taxon string;
           a line starting with 'user_genome<TAB>' is treated as the header
    db   : folder holding the GTDB r214 tree and metadata files
    d    : domain, 'ar' or 'bac'
    o    : output file for the rooted tree

    Steps:
    1. group the user genomes of the chosen domain by phylum/class/order/family/genus
       and pick the highest of these ranks at which they fall into more than one taxon;
    2. pick one reference genome per such taxon (the first one met in the metadata
       file that is also a leaf of the reference tree) and prune the reference tree
       to these genomes, renaming each leaf to its taxon;
    3. take the smaller side of the pruned tree's root as outgroup taxa and split the
       user genomes into those inside and outside these taxa;
    4. root the user tree on the smaller of the two genome sets.

    Prints a message and exits on an invalid domain, when all user genomes share
    the same genus, or when fewer than two reference taxa are found.
    """

    input_unrooted_tree = args['tree']
    user_gnm_taxon = args['tax']
    db_dir = args['db']
    gnm_domain = args['d']
    rooted_tree = args['o']

    # per-domain settings: (marker set used in the GTDB file names, domain tag in taxon strings)
    domain_settings = {'bac': ('bac120', 'd__Bacteria'),
                       'ar': ('ar53', 'd__Archaea')}
    if gnm_domain not in domain_settings:
        print('please provide either "ar" or "bac" to -d')
        exit()
    marker_set, domain_tag = domain_settings[gnm_domain]

    # define file name
    gtdb_ref_tree = '%s/%s_r214.tree' % (db_dir, marker_set)
    gtdb_gnm_metadata = '%s/%s_metadata_r214.tsv' % (db_dir, marker_set)

    tree = Tree(gtdb_ref_tree, quoted_node_names=True, format=1)
    ref_tree_gnm_set = set(tree.get_leaf_names())

    # read in user_gnm_taxon: rank -> taxon -> set of genome ids
    rank_order = ('p', 'c', 'o', 'f', 'g')  # same order as returned by sep_taxon_str
    user_gnm_taxon_dict = {each_rank: dict() for each_rank in rank_order}
    with open(user_gnm_taxon) as user_gnm_taxon_handle:
        for each_gnm in user_gnm_taxon_handle:
            if each_gnm.startswith('user_genome\t'):
                continue
            each_gnm_split = each_gnm.strip().split('\t')
            if len(each_gnm_split) < 2:
                continue  # blank or incomplete line
            gnm_id = each_gnm_split[0]
            gnm_taxon = each_gnm_split[1]

            # only genomes from the requested domain are counted
            if domain_tag not in gnm_taxon:
                continue

            for each_rank, rank_taxon in zip(rank_order, sep_taxon_str(gnm_taxon)):
                user_gnm_taxon_dict[each_rank].setdefault(rank_taxon, set()).add(gnm_id)

    # determine rooting rank, start from phylum
    rooting_rank = ''
    for each_rank in rank_order:
        if len(user_gnm_taxon_dict[each_rank]) > 1:
            rooting_rank = each_rank
            break

    if rooting_rank == '':
        print('All user genomes are from the same genus, program exited!')
        exit()

    rooting_rank_taxon_dict = user_gnm_taxon_dict[rooting_rank]
    rooting_rank_position = rank_order.index(rooting_rank)

    # one reference genome per taxon (at the rooting rank) seen among the user genomes
    col_index = {}
    canditate_gnms_rooting_rank = dict()
    counted_taxons_rooting_rank = set()
    with open(gtdb_gnm_metadata) as gtdb_gnm_metadata_handle:
        for each_ref in gtdb_gnm_metadata_handle:
            each_ref_split = each_ref.strip().split('\t')

            # header line: first column is "accession"
            if each_ref.startswith('accession\t'):
                col_index = {key: i for i, key in enumerate(each_ref_split)}
                continue

            ref_accession = each_ref_split[0]
            if ref_accession not in ref_tree_gnm_set:
                continue

            gtdb_taxonomy = each_ref_split[col_index['gtdb_taxonomy']]
            gnm_rooting_rank = sep_taxon_str(gtdb_taxonomy)[rooting_rank_position]

            if gnm_rooting_rank in rooting_rank_taxon_dict:
                if gnm_rooting_rank not in counted_taxons_rooting_rank:
                    counted_taxons_rooting_rank.add(gnm_rooting_rank)
                    canditate_gnms_rooting_rank[ref_accession] = gnm_rooting_rank

    # with fewer than two reference leaves the pruned tree has no root split to borrow
    if len(canditate_gnms_rooting_rank) < 2:
        print('Fewer than two taxa of the user genomes were found in the GTDB reference tree, program exited!')
        exit()

    ref_tree_rooting_rank = subset_and_rename_tree(gtdb_ref_tree, canditate_gnms_rooting_rank, canditate_gnms_rooting_rank)

    # get the smallest out group taxon set
    smallest_outgroup_taxon_set = set(get_smallest_outgroup(ref_tree_rooting_rank))

    # split the user genomes into those inside / outside the smallest outgroup taxa
    out_group_gnm_set_1 = set()
    out_group_gnm_set_2 = set()
    for each_rooting_rank_taxon, gnm_member_set in rooting_rank_taxon_dict.items():
        if each_rooting_rank_taxon in smallest_outgroup_taxon_set:
            out_group_gnm_set_1.update(gnm_member_set)
        else:
            out_group_gnm_set_2.update(gnm_member_set)

    # select the smaller one as outgroup
    if len(out_group_gnm_set_1) < len(out_group_gnm_set_2):
        out_group_gnm_set = out_group_gnm_set_1
    else:
        out_group_gnm_set = out_group_gnm_set_2

    # root user tree with identified out group genomes
    root_with_outgroup(input_unrooted_tree, out_group_gnm_set, rooted_tree)
276
+
277
+
278
if __name__ == '__main__':

    # Stand-alone entry point: parse the command line and pass the options
    # to RootTreeGTDB214 as a plain dict.
    RootTreeGTDB214_parser = argparse.ArgumentParser(usage=RootTreeGTDB214_usage)
    RootTreeGTDB214_parser.add_argument('-tree', required=True, help='input unrooted tree')
    # -tax is opened as a file by RootTreeGTDB214, so it has no meaningful default
    RootTreeGTDB214_parser.add_argument('-tax', required=True, help='leaf taxon')
    RootTreeGTDB214_parser.add_argument('-db', required=True, help='GTDB database files')
    # RootTreeGTDB214 only accepts these two values
    RootTreeGTDB214_parser.add_argument('-d', required=True, choices=['ar', 'bac'], help='domain, either ar or bac')
    RootTreeGTDB214_parser.add_argument('-o', required=True, help='output rooted tree')
    args = vars(RootTreeGTDB214_parser.parse_args())
    RootTreeGTDB214(args)
288
+