treesak 1.53.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. TreeSAK/ALE.py +63 -0
  2. TreeSAK/ALE1.py +268 -0
  3. TreeSAK/ALE2.py +168 -0
  4. TreeSAK/ALE2RTC.py +30 -0
  5. TreeSAK/ALE3.py +205 -0
  6. TreeSAK/ALE4.py +636 -0
  7. TreeSAK/ALE5.py +210 -0
  8. TreeSAK/ALE6.py +401 -0
  9. TreeSAK/ALE7.py +126 -0
  10. TreeSAK/ALE_backup.py +1081 -0
  11. TreeSAK/AssessCVG.py +128 -0
  12. TreeSAK/AssessMarker.py +306 -0
  13. TreeSAK/AssessMarkerDeltaLL.py +257 -0
  14. TreeSAK/AssessMarkerPA.py +317 -0
  15. TreeSAK/AssessPB.py +113 -0
  16. TreeSAK/BMGE.jar +0 -0
  17. TreeSAK/BMGE.py +49 -0
  18. TreeSAK/C60SR4.nex +127 -0
  19. TreeSAK/CompareMCMC.py +138 -0
  20. TreeSAK/ConcateMSA.py +111 -0
  21. TreeSAK/ConvertMSA.py +135 -0
  22. TreeSAK/Dir.rb +82 -0
  23. TreeSAK/ExtractMarkerSeq.py +263 -0
  24. TreeSAK/FastRoot.py +1175 -0
  25. TreeSAK/FastRoot_backup.py +1122 -0
  26. TreeSAK/FigTree.py +34 -0
  27. TreeSAK/GTDB_tree.py +76 -0
  28. TreeSAK/GeneTree.py +142 -0
  29. TreeSAK/KEGG_Luo17.py +807 -0
  30. TreeSAK/LcaToLeaves.py +66 -0
  31. TreeSAK/MarkerRef2Tree.py +616 -0
  32. TreeSAK/MarkerRef2Tree_backup.py +628 -0
  33. TreeSAK/MarkerSeq2Tree.py +299 -0
  34. TreeSAK/MarkerSeq2Tree_backup.py +259 -0
  35. TreeSAK/ModifyTopo.py +116 -0
  36. TreeSAK/Newick_tree_plotter.py +79 -0
  37. TreeSAK/OMA.py +170 -0
  38. TreeSAK/OMA2.py +212 -0
  39. TreeSAK/OneLineAln.py +50 -0
  40. TreeSAK/PB.py +155 -0
  41. TreeSAK/PMSF.py +115 -0
  42. TreeSAK/PhyloBiAssoc.R +84 -0
  43. TreeSAK/PhyloBiAssoc.py +167 -0
  44. TreeSAK/PlotMCMC.py +41 -0
  45. TreeSAK/PlotMcmcNode.py +152 -0
  46. TreeSAK/PlotMcmcNode_old.py +252 -0
  47. TreeSAK/RootTree.py +101 -0
  48. TreeSAK/RootTreeGTDB.py +371 -0
  49. TreeSAK/RootTreeGTDB214.py +288 -0
  50. TreeSAK/RootTreeGTDB220.py +300 -0
  51. TreeSAK/SequentialDating.py +16 -0
  52. TreeSAK/SingleAleHGT.py +157 -0
  53. TreeSAK/SingleLinePhy.py +50 -0
  54. TreeSAK/SliceMSA.py +142 -0
  55. TreeSAK/SplitScore.py +21 -0
  56. TreeSAK/SplitScore1.py +177 -0
  57. TreeSAK/SplitScore1OMA.py +148 -0
  58. TreeSAK/SplitScore2.py +608 -0
  59. TreeSAK/TaxaCountStats.R +256 -0
  60. TreeSAK/TaxonTree.py +47 -0
  61. TreeSAK/TreeSAK_config.py +32 -0
  62. TreeSAK/VERSION +164 -0
  63. TreeSAK/VisHPD95.R +45 -0
  64. TreeSAK/VisHPD95.py +200 -0
  65. TreeSAK/__init__.py +0 -0
  66. TreeSAK/ale_parser.py +74 -0
  67. TreeSAK/ale_splitter.py +63 -0
  68. TreeSAK/alignment_pruner.pl +1471 -0
  69. TreeSAK/assessOG.py +45 -0
  70. TreeSAK/batch_itol.py +171 -0
  71. TreeSAK/catfasta2phy.py +140 -0
  72. TreeSAK/cogTree.py +185 -0
  73. TreeSAK/compare_trees.R +30 -0
  74. TreeSAK/compare_trees.py +255 -0
  75. TreeSAK/dating.py +264 -0
  76. TreeSAK/dating_ss.py +361 -0
  77. TreeSAK/deltall.py +82 -0
  78. TreeSAK/do_rrtc.rb +464 -0
  79. TreeSAK/fa2phy.py +42 -0
  80. TreeSAK/filter_rename_ar53.py +118 -0
  81. TreeSAK/format_leaf_name.py +70 -0
  82. TreeSAK/gap_stats.py +38 -0
  83. TreeSAK/get_SCG_tree.py +742 -0
  84. TreeSAK/get_arCOG_seq.py +97 -0
  85. TreeSAK/global_functions.py +222 -0
  86. TreeSAK/gnm_leaves.py +43 -0
  87. TreeSAK/iTOL.py +791 -0
  88. TreeSAK/iTOL_gene_tree.py +80 -0
  89. TreeSAK/itol_msa_stats.py +56 -0
  90. TreeSAK/keep_highest_rrtc.py +37 -0
  91. TreeSAK/koTree.py +194 -0
  92. TreeSAK/label_gene_tree_by_gnm.py +34 -0
  93. TreeSAK/label_tree.R +75 -0
  94. TreeSAK/label_tree.py +121 -0
  95. TreeSAK/mad.py +708 -0
  96. TreeSAK/mcmc2tree.py +58 -0
  97. TreeSAK/mcmcTC copy.py +92 -0
  98. TreeSAK/mcmcTC.py +104 -0
  99. TreeSAK/mcmctree_vs_reltime.R +44 -0
  100. TreeSAK/mcmctree_vs_reltime.py +252 -0
  101. TreeSAK/merge_pdf.py +32 -0
  102. TreeSAK/pRTC.py +56 -0
  103. TreeSAK/parse_mcmctree.py +198 -0
  104. TreeSAK/parse_reltime.py +141 -0
  105. TreeSAK/phy2fa.py +37 -0
  106. TreeSAK/plot_distruibution_th.py +165 -0
  107. TreeSAK/prep_mcmctree_ctl.py +92 -0
  108. TreeSAK/print_leaves.py +32 -0
  109. TreeSAK/pruneMSA.py +63 -0
  110. TreeSAK/recode.py +73 -0
  111. TreeSAK/remove_bias.R +112 -0
  112. TreeSAK/rename_leaves.py +78 -0
  113. TreeSAK/replace_clade.py +55 -0
  114. TreeSAK/root_with_out_group.py +84 -0
  115. TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
  116. TreeSAK/subsample_drep_gnms.py +74 -0
  117. TreeSAK/subset.py +69 -0
  118. TreeSAK/subset_tree_stupid_old_way.py +193 -0
  119. TreeSAK/supertree.py +330 -0
  120. TreeSAK/tmp_1.py +19 -0
  121. TreeSAK/tmp_2.py +19 -0
  122. TreeSAK/tmp_3.py +120 -0
  123. TreeSAK/tmp_4.py +43 -0
  124. TreeSAK/tmp_5.py +12 -0
  125. TreeSAK/weighted_rand.rb +23 -0
  126. treesak-1.53.3.data/scripts/TreeSAK +955 -0
  127. treesak-1.53.3.dist-info/LICENSE +674 -0
  128. treesak-1.53.3.dist-info/METADATA +27 -0
  129. treesak-1.53.3.dist-info/RECORD +131 -0
  130. treesak-1.53.3.dist-info/WHEEL +5 -0
  131. treesak-1.53.3.dist-info/top_level.txt +1 -0
@@ -0,0 +1,371 @@
1
+ import glob
2
+ import random
3
+ import os.path
4
+ import dendropy
5
+ import argparse
6
+ from ete3 import Tree
7
+
8
+
9
+ RootTreeGTDB_usage = '''
10
+ ========================================== RootTreeGTDB example command ==========================================
11
+
12
+ TreeSAK RootTreeGTDB -r r226 -add_root -db db_dir -d ar -tree ar53.tree -tax ar53.summary.tsv -o ar53.rooted.tree
13
+ TreeSAK RootTreeGTDB -r r226 -add_root -db db_dir -d bac -tree bac120.tree -tax bac120.summary.tsv -o bac120.rooted.tree
14
+
15
+ # Need to download and decompress the following files to your database folder (provide with -db)
16
+ https://data.ace.uq.edu.au/public/gtdb/data/releases/release226/226.0/ar53_r226.tree.tar.gz
17
+ https://data.ace.uq.edu.au/public/gtdb/data/releases/release226/226.0/bac120_r226.tree.tar.gz
18
+ https://data.ace.uq.edu.au/public/gtdb/data/releases/release226/226.0/ar53_metadata_r226.tsv.gz
19
+ https://data.ace.uq.edu.au/public/gtdb/data/releases/release226/226.0/bac120_metadata_r226.tsv.gz
20
+
21
+ ==================================================================================================================
22
+ '''
23
+
24
def sep_path_basename_ext(file_in):
    """
    Split a file path into its parts.

    Returns a tuple (file name, folder, base name, extension). The folder is
    reported as '.' when the path has no directory part, and the extension is
    returned without its leading dot.
    """

    folder, file_name = os.path.split(file_in)
    base_name, dotted_ext = os.path.splitext(file_name)

    return file_name, (folder if folder != '' else '.'), base_name, dotted_ext[1:]
33
+
34
+
35
def get_smallest_outgroup(tree_object):
    """
    Return the leaf names of the smallest clade directly below the root.

    tree_object:    a tree node exposing `children`, each child providing
                    `get_leaf_names()`.

    Returns the leaf name list of the root child with the fewest leaves. If
    several children share the smallest size, the last of them is returned.
    An empty list is returned when the node has no children.
    """

    out_group_leaf_list = []
    min_outgroup_leaf_num = None
    for each_root_child in tree_object.children:
        leaf_list = each_root_child.get_leaf_names()
        # no fixed upper bound on clade size; "<=" lets the last child win a tie
        if (min_outgroup_leaf_num is None) or (len(leaf_list) <= min_outgroup_leaf_num):
            min_outgroup_leaf_num = len(leaf_list)
            out_group_leaf_list = leaf_list

    return out_group_leaf_list
50
+
51
+
52
def sep_taxon_str(taxon_string):
    """
    Pull the phylum, class, order, family and genus fields out of a
    semicolon separated taxonomy string.

    The fields are taken by position (second to sixth); the first field is
    skipped. Returns them as a tuple of five strings.
    """

    rank_fields = taxon_string.strip().split(';')

    return rank_fields[1], rank_fields[2], rank_fields[3], rank_fields[4], rank_fields[5]
62
+
63
+
64
def subset_and_rename_tree(tree_file_in, to_keep_leaf_list, rename_dict):
    """
    Load a Newick tree, keep only the requested leaves and rename them.

    tree_file_in:       Newick tree file.
    to_keep_leaf_list:  names of the leaves to keep.
    rename_dict:        maps current leaf names to new names; leaves without
                        an entry keep their name.

    Returns the pruned and renamed tree.
    """

    # The tree is loaded here and not shared with anyone else, so it is pruned
    # in place instead of pruning a copy of the whole tree.
    subset_tree = Tree(tree_file_in, quoted_node_names=True, format=1)
    subset_tree.prune(to_keep_leaf_list, preserve_branch_length=True)

    # rename leaf
    for each_leaf in subset_tree:
        each_leaf.name = rename_dict.get(each_leaf.name, each_leaf.name)

    return subset_tree
78
+
79
+
80
def root_with_outgroup(input_tree, out_group_list, add_root_branch, tree_file_rooted):
    """
    Reroot a tree on the given outgroup and write it to a file.
    modified based on: https://github.com/Ecogenomics/GTDBTk/blob/master/gtdbtk/reroot_tree.py

    input_tree:         file containing the Newick tree to be rerooted.
    out_group_list:     labels of the leaves forming the outgroup.
    add_root_branch:    if True, the written Newick string is wrapped in an
                        extra pair of parentheses with a branch of length 0.02.
    tree_file_rooted:   name of the file the rerooted tree is written to.

    Raises ValueError if no outgroup leaf is found in the tree, or if every
    leaf of the tree belongs to the outgroup.
    """

    tree = dendropy.Tree.get_from_path(input_tree, schema='newick', rooting='force-rooted', preserve_underscores=True)

    outgroup_in_tree = set()
    ingroup_leaves = set()
    for n in tree.leaf_node_iter():
        if n.taxon.label in out_group_list:
            outgroup_in_tree.add(n.taxon)
        else:
            ingroup_leaves.add(n)

    # fail early with a clear message instead of deep inside the rerooting code
    if len(outgroup_in_tree) == 0:
        raise ValueError('None of the outgroup leaves was found in %s' % input_tree)
    if len(ingroup_leaves) == 0:
        raise ValueError('All leaves in %s belong to the outgroup' % input_tree)

    # Since finding the MRCA is a rooted tree operation, the tree is first rerooted on an ingroup taxa. This
    # ensures the MRCA of the outgroup can be identified so long as the outgroup is monophyletic. If the
    # outgroup is polyphyletic trying to root on it is ill-defined. To try and pick a "good" root for
    # polyphyletic outgroups, random ingroup taxa are selected until two of them give the same size
    # lineage. This will, likely, be the smallest bipartition possible for the given outgroup though
    # this is not guaranteed.

    ingroup_leaf_list = list(ingroup_leaves)  # built once, it does not change inside the loop
    mrca = tree.mrca(taxa=outgroup_in_tree)
    mrca_leaves = len(mrca.leaf_nodes())
    while True:
        rnd_ingroup = random.sample(ingroup_leaf_list, 1)[0]
        tree.reroot_at_edge(rnd_ingroup.edge, length1=0.5 * rnd_ingroup.edge_length, length2=0.5 * rnd_ingroup.edge_length)
        mrca = tree.mrca(taxa=outgroup_in_tree)
        if len(mrca.leaf_nodes()) == mrca_leaves:
            break
        mrca_leaves = len(mrca.leaf_nodes())

    if mrca.edge_length is not None:
        tree.reroot_at_edge(mrca.edge, length1=0.5 * mrca.edge_length, length2=0.5 * mrca.edge_length)

    # the tree is written by hand rather than with write_to_path so that the
    # quotes can be stripped and the optional root branch added
    tree_out_string = tree.as_string(schema='newick', suppress_rooting=True, unquoted_underscores=True)
    tree_out_string = tree_out_string.replace("'", "")

    # add the root bar
    if add_root_branch is True:
        tree_out_string = '(' + tree_out_string
        tree_out_string = tree_out_string.replace(');', '):0.02);')

    # write out tree string
    with open(tree_file_rooted, 'w') as tree_file_rooted_handle:
        tree_file_rooted_handle.write(tree_out_string)
134
+
135
+
136
def RootTreeGTDB_single_tree(input_unrooted_tree, user_gnm_taxon, db_dir, gnm_domain, add_root_branch, rooted_tree, gtdb_release):
    """
    Root one user tree, using the GTDB reference tree to choose the outgroup.

    The user genomes found in the tree are grouped by taxon at the highest
    rank (phylum, class, order, family, genus) at which they fall into more
    than one taxon. One reference genome per such taxon is kept in the GTDB
    reference tree; the smallest clade below the root of that pruned tree
    splits the user taxa into two groups, and the group with fewer user
    genomes is used as outgroup.

    input_unrooted_tree:    Newick file with the user tree.
    user_gnm_taxon:         tab separated file with the genome id in the first
                            and the taxonomy string in the second column; a
                            line starting with 'user_genome' is skipped.
    db_dir:                 folder holding the GTDB tree and metadata files.
    gnm_domain:             'ar' or 'bac'.
    add_root_branch:        passed on to root_with_outgroup.
    rooted_tree:            output file for the rooted tree.
    gtdb_release:           'r214', 'r220' or 'r226' (case insensitive).

    Prints a message and exits when an input is missing or an option is not
    supported.
    """

    def exit_if_missing(path_list):
        # report every missing input at once, then stop
        if len(path_list) > 0:
            print('Missing files:')
            for missing_file in path_list:
                print(missing_file)
            print('Program exited!')
            exit()

    rank_order = ('p', 'c', 'o', 'f', 'g')
    supported_releases = ('r214', 'r220', 'r226')
    domain_settings = {'bac': ('bac120', 'd__Bacteria'), 'ar': ('ar53', 'd__Archaea')}

    missing_file_list = []
    if os.path.isfile(input_unrooted_tree) is False:
        missing_file_list.append(input_unrooted_tree)
    if os.path.isdir(db_dir) is False:
        missing_file_list.append(db_dir)
    exit_if_missing(missing_file_list)

    if gnm_domain not in domain_settings:
        print('please provide either "ar" or "bac" to -d')
        exit()
    file_prefix, domain_label = domain_settings[gnm_domain]

    # an unknown release used to leave the file names undefined and crash later
    release = str(gtdb_release).lower()
    if release not in supported_releases:
        print('GTDB release %s is not supported, please choose from: %s' % (gtdb_release, ', '.join(supported_releases)))
        print('Program exited!')
        exit()

    # define file name
    gtdb_ref_tree = '%s/%s_%s.tree' % (db_dir, file_prefix, release)
    gtdb_gnm_metadata = '%s/%s_metadata_%s.tsv' % (db_dir, file_prefix, release)
    exit_if_missing([f for f in (user_gnm_taxon, gtdb_ref_tree, gtdb_gnm_metadata) if os.path.isfile(f) is False])

    # leaves of the user tree, as a set for fast membership tests
    leaf_set = {leaf.name for leaf in Tree(input_unrooted_tree, format=1)}

    ref_tree_gnm_set = set(Tree(gtdb_ref_tree, quoted_node_names=True, format=1).get_leaf_names())

    # read in user_gnm_taxon: for every rank, taxon -> set of user genomes
    user_gnm_taxon_dicts = {rank: dict() for rank in rank_order}
    with open(user_gnm_taxon) as user_gnm_taxon_handle:
        for each_gnm in user_gnm_taxon_handle:
            if each_gnm.startswith('user_genome\t'):
                continue
            each_gnm_split = each_gnm.strip().split('\t')
            if len(each_gnm_split) < 2:
                continue  # blank or incomplete line
            gnm_id = each_gnm_split[0]
            gnm_taxon = each_gnm_split[1]
            if (gnm_id in leaf_set) and (domain_label in gnm_taxon):
                for rank, taxon in zip(rank_order, sep_taxon_str(gnm_taxon)):
                    user_gnm_taxon_dicts[rank].setdefault(taxon, set()).add(gnm_id)

    # determine rooting rank, start from phylum
    rooting_rank = ''
    for rank in rank_order:
        if len(user_gnm_taxon_dicts[rank]) > 1:
            rooting_rank = rank
            break

    if rooting_rank == '':
        print('All user genomes are from the same genus, program exited!')
        exit()

    rooting_rank_taxon_dict = user_gnm_taxon_dicts[rooting_rank]
    rooting_rank_index = rank_order.index(rooting_rank)

    # pick the first reference genome found for each user taxon at the rooting rank
    col_index = {}
    canditate_gnms_rooting_rank = dict()
    counted_taxons_rooting_rank = set()
    with open(gtdb_gnm_metadata) as gtdb_gnm_metadata_handle:
        for each_ref in gtdb_gnm_metadata_handle:
            each_ref_split = each_ref.strip().split('\t')
            # the header is recognised by its first column only
            if each_ref_split[0] == 'accession':
                col_index = {key: i for i, key in enumerate(each_ref_split)}
                continue
            ref_accession = each_ref_split[0]
            if ref_accession not in ref_tree_gnm_set:
                continue
            gtdb_taxonomy = each_ref_split[col_index['gtdb_taxonomy']]
            gnm_rooting_rank = sep_taxon_str(gtdb_taxonomy)[rooting_rank_index]
            if (gnm_rooting_rank in rooting_rank_taxon_dict) and (gnm_rooting_rank not in counted_taxons_rooting_rank):
                counted_taxons_rooting_rank.add(gnm_rooting_rank)
                canditate_gnms_rooting_rank[ref_accession] = gnm_rooting_rank

    # reference tree reduced to one leaf per taxon, leaves named after the taxon
    ref_tree_rooting_rank = subset_and_rename_tree(gtdb_ref_tree, canditate_gnms_rooting_rank, canditate_gnms_rooting_rank)

    # get the smallest out group taxon set
    smallest_outgroup_taxon_set = set(get_smallest_outgroup(ref_tree_rooting_rank))

    # split the user genomes by whether their taxon is in that clade
    out_group_gnm_set_1 = set()
    out_group_gnm_set_2 = set()
    for each_rooting_rank_taxon, gnm_member_set in rooting_rank_taxon_dict.items():
        if each_rooting_rank_taxon in smallest_outgroup_taxon_set:
            out_group_gnm_set_1.update(gnm_member_set)
        else:
            out_group_gnm_set_2.update(gnm_member_set)

    # select the smaller one as outgroup
    if len(out_group_gnm_set_1) < len(out_group_gnm_set_2):
        out_group_gnm_set = out_group_gnm_set_1
    else:
        out_group_gnm_set = out_group_gnm_set_2

    # root user tree with identified out group genomes
    root_with_outgroup(input_unrooted_tree, out_group_gnm_set, add_root_branch, rooted_tree)
316
+
317
+
318
def RootTreeGTDB(args):
    """
    Root a single tree file, or every tree file in a folder.

    args is a dict with the keys:
        tree        input tree file, or folder with tree files
        x           extension of the tree files (folder mode only)
        tax         taxonomy file of the user genomes
        db          folder with the GTDB database files
        d           domain, 'ar' or 'bac'
        add_root    whether to add a root branch to the written tree
        o           output file (single tree) or output folder (folder mode)
        f           overwrite an existing output folder (folder mode only)
        r           GTDB release

    In folder mode each tree is written to <o>/<base name>.rooted.<extension>.
    Prints a message and exits when the input is not found, when no tree file
    matches, or when the output folder exists and overwriting was not asked for.
    """

    import shutil  # only needed here, to remove an existing output folder

    input_tree_file_dir = args['tree']
    tree_file_ext = args['x']
    user_gnm_taxon = args['tax']
    db_dir = args['db']
    gnm_domain = args['d']
    add_root_branch = args['add_root']
    rooted_tree_file_dir = args['o']
    force_overwrite = args['f']
    gtdb_release = args['r']

    if os.path.isfile(input_tree_file_dir) is True:
        RootTreeGTDB_single_tree(input_tree_file_dir, user_gnm_taxon, db_dir, gnm_domain, add_root_branch, rooted_tree_file_dir, gtdb_release)
    elif os.path.isdir(input_tree_file_dir) is True:
        tree_file_re = '%s/*.%s' % (input_tree_file_dir, tree_file_ext)
        tree_file_list = sorted(glob.glob(tree_file_re))  # sorted for a stable processing order
        if len(tree_file_list) == 0:
            print('No file found in %s, please make sure file extension is correct, program exited!' % input_tree_file_dir)
            exit()

        # create output folder
        if os.path.isdir(rooted_tree_file_dir) is True:
            if force_overwrite is True:
                # no shell involved, so paths with spaces or shell characters are safe
                shutil.rmtree(rooted_tree_file_dir)
            else:
                print('%s exist, program exited!' % rooted_tree_file_dir)
                exit()
        os.mkdir(rooted_tree_file_dir)

        # root trees in batch
        for each_tree_file in tree_file_list:
            tree_f_name, tree_f_path, tree_f_base, tree_f_ext = sep_path_basename_ext(each_tree_file)
            pwd_tree_out = '%s/%s.rooted.%s' % (rooted_tree_file_dir, tree_f_base, tree_f_ext)
            RootTreeGTDB_single_tree(each_tree_file, user_gnm_taxon, db_dir, gnm_domain, add_root_branch, pwd_tree_out, gtdb_release)
    else:
        print('input tree file/folder not found, program exited!')
        exit()
356
+
357
+
358
if __name__ == '__main__':

    # command line options, in the order they are registered with the parser
    cli_options = (
        ('-tree', dict(required=True, help='input unrooted tree file or folder')),
        ('-x', dict(required=False, default='treefile', help='tree file extension, default is: treefile')),
        ('-tax', dict(required=False, default='fna', help='leaf taxon')),
        ('-db', dict(required=True, help='GTDB database files')),
        ('-d', dict(required=False, default=None, help='domain, either ar or bac')),
        ('-add_root', dict(required=False, action='store_true', help='add the root branch')),
        ('-o', dict(required=True, help='output folder')),
        ('-f', dict(required=False, action="store_true", help='force overwrite')),
        ('-r', dict(required=True, help='GTDB release, e.g., r220, r226')),
    )

    RootTreeGTDB_parser = argparse.ArgumentParser(usage=RootTreeGTDB_usage)
    for option_flag, option_settings in cli_options:
        RootTreeGTDB_parser.add_argument(option_flag, **option_settings)

    # hand the parsed options over as a plain dict
    args = vars(RootTreeGTDB_parser.parse_args())
    RootTreeGTDB(args)
@@ -0,0 +1,288 @@
1
+ import os
2
+ import random
3
+ import dendropy
4
+ import argparse
5
+ from ete3 import Tree
6
+
7
+
8
+ RootTreeGTDB214_usage = '''
9
+ ========================================= RootTreeGTDB214 example command =========================================
10
+
11
+ TreeSAK RootTreeGTDB214 -tree ar53.unrooted.tree -tax ar53.summary.tsv -db db_dir -d ar -o ar53.rooted.tree
12
+ TreeSAK RootTreeGTDB214 -tree bac120.unrooted.tree -tax bac120.summary.tsv -db db_dir -d bac -o bac120.rooted.tree
13
+
14
+ # prepare GTDB database files
15
+ cd db_dir
16
+ wget https://data.ace.uq.edu.au/public/gtdb/data/releases/release214/214.1/ar53_r214.tree.tar.gz
17
+ wget https://data.ace.uq.edu.au/public/gtdb/data/releases/release214/214.1/bac120_r214.tree.tar.gz
18
+ wget https://data.ace.uq.edu.au/public/gtdb/data/releases/release214/214.1/ar53_metadata_r214.tsv.gz
19
+ wget https://data.ace.uq.edu.au/public/gtdb/data/releases/release214/214.1/bac120_metadata_r214.tsv.gz
20
+ tar -xzvf ar53_r214.tree.tar.gz
21
+ tar -xzvf bac120_r214.tree.tar.gz
22
+ gunzip ar53_metadata_r214.tsv.gz
23
+ gunzip bac120_metadata_r214.tsv.gz
24
+
25
+ ================================================================================================================
26
+ '''
27
+
28
+
29
def get_smallest_outgroup(tree_object):
    """
    Return the leaf names of the smallest clade directly below the root.

    tree_object:    a tree node exposing `children`, each child providing
                    `get_leaf_names()`.

    Returns the leaf name list of the root child with the fewest leaves. If
    several children share the smallest size, the last of them is returned.
    An empty list is returned when the node has no children.
    """

    out_group_leaf_list = []
    min_outgroup_leaf_num = None
    for each_root_child in tree_object.children:
        leaf_list = each_root_child.get_leaf_names()
        # no fixed upper bound on clade size; "<=" lets the last child win a tie
        if (min_outgroup_leaf_num is None) or (len(leaf_list) <= min_outgroup_leaf_num):
            min_outgroup_leaf_num = len(leaf_list)
            out_group_leaf_list = leaf_list

    return out_group_leaf_list
44
+
45
+
46
def sep_taxon_str(taxon_string):
    """
    Pull the phylum, class, order, family and genus fields out of a
    semicolon separated taxonomy string.

    The fields are taken by position (second to sixth); the first field is
    skipped. Returns them as a tuple of five strings.
    """

    rank_fields = taxon_string.strip().split(';')

    return rank_fields[1], rank_fields[2], rank_fields[3], rank_fields[4], rank_fields[5]
56
+
57
+
58
def subset_and_rename_tree(tree_file_in, to_keep_leaf_list, rename_dict):
    """
    Load a Newick tree, keep only the requested leaves and rename them.

    tree_file_in:       Newick tree file.
    to_keep_leaf_list:  names of the leaves to keep.
    rename_dict:        maps current leaf names to new names; leaves without
                        an entry keep their name.

    Returns the pruned and renamed tree.
    """

    # The tree is loaded here and not shared with anyone else, so it is pruned
    # in place instead of pruning a copy of the whole tree.
    subset_tree = Tree(tree_file_in, quoted_node_names=True, format=1)
    subset_tree.prune(to_keep_leaf_list, preserve_branch_length=True)

    # rename leaf
    for each_leaf in subset_tree:
        each_leaf.name = rename_dict.get(each_leaf.name, each_leaf.name)

    return subset_tree
72
+
73
+
74
def root_with_outgroup(input_tree, out_group_list, tree_file_rooted):
    """
    Reroot a tree on the given outgroup and write it to a file.
    modified based on: https://github.com/Ecogenomics/GTDBTk/blob/master/gtdbtk/reroot_tree.py

    input_tree:         file containing the Newick tree to be rerooted.
    out_group_list:     labels of the leaves forming the outgroup.
    tree_file_rooted:   name of the file the rerooted tree is written to.

    Raises ValueError if no outgroup leaf is found in the tree, or if every
    leaf of the tree belongs to the outgroup.
    """

    tree = dendropy.Tree.get_from_path(input_tree, schema='newick', rooting='force-rooted', preserve_underscores=True)

    outgroup_in_tree = set()
    ingroup_leaves = set()
    for n in tree.leaf_node_iter():
        if n.taxon.label in out_group_list:
            outgroup_in_tree.add(n.taxon)
        else:
            ingroup_leaves.add(n)

    # fail early with a clear message instead of deep inside the rerooting code
    if len(outgroup_in_tree) == 0:
        raise ValueError('None of the outgroup leaves was found in %s' % input_tree)
    if len(ingroup_leaves) == 0:
        raise ValueError('All leaves in %s belong to the outgroup' % input_tree)

    # Since finding the MRCA is a rooted tree operation, the tree is first rerooted on an ingroup taxa. This
    # ensures the MRCA of the outgroup can be identified so long as the outgroup is monophyletic. If the
    # outgroup is polyphyletic trying to root on it is ill-defined. To try and pick a "good" root for
    # polyphyletic outgroups, random ingroup taxa are selected until two of them give the same size
    # lineage. This will, likely, be the smallest bipartition possible for the given outgroup though
    # this is not guaranteed.

    ingroup_leaf_list = list(ingroup_leaves)  # built once, it does not change inside the loop
    mrca = tree.mrca(taxa=outgroup_in_tree)
    mrca_leaves = len(mrca.leaf_nodes())
    while True:
        rnd_ingroup = random.sample(ingroup_leaf_list, 1)[0]
        tree.reroot_at_edge(rnd_ingroup.edge, length1=0.5 * rnd_ingroup.edge_length, length2=0.5 * rnd_ingroup.edge_length)
        mrca = tree.mrca(taxa=outgroup_in_tree)
        if len(mrca.leaf_nodes()) == mrca_leaves:
            break
        mrca_leaves = len(mrca.leaf_nodes())

    if mrca.edge_length is not None:
        tree.reroot_at_edge(mrca.edge, length1=0.5 * mrca.edge_length, length2=0.5 * mrca.edge_length)

    # always write the result, also when the outgroup edge has no length and
    # the final rerooting step was skipped
    tree.write_to_path(tree_file_rooted, schema='newick', suppress_rooting=True, unquoted_underscores=True)
116
+
117
+
118
def RootTreeGTDB214(args):
    """
    Root a user tree, using the GTDB r214 reference tree to choose the outgroup.

    args is a dict with the keys:
        tree    Newick file with the user tree
        tax     tab separated file with the genome id in the first and the
                taxonomy string in the second column; a line starting with
                'user_genome' is skipped
        db      folder with the GTDB r214 tree and metadata files
        d       domain, 'ar' or 'bac'
        o       output file for the rooted tree

    The user genomes are grouped by taxon at the highest rank (phylum, class,
    order, family, genus) at which they fall into more than one taxon. One
    reference genome per such taxon is kept in the reference tree; the
    smallest clade below the root of that pruned tree splits the user taxa
    into two groups, and the group with fewer user genomes is used as outgroup.

    Prints a message and exits when the domain is not 'ar' or 'bac', or when
    all user genomes belong to the same genus.
    """

    input_unrooted_tree = args['tree']
    user_gnm_taxon = args['tax']
    db_dir = args['db']
    gnm_domain = args['d']
    rooted_tree = args['o']

    rank_order = ('p', 'c', 'o', 'f', 'g')
    domain_settings = {'bac': ('bac120', 'd__Bacteria'), 'ar': ('ar53', 'd__Archaea')}

    if gnm_domain not in domain_settings:
        print('please provide either "ar" or "bac" to -d')
        exit()
    file_prefix, domain_label = domain_settings[gnm_domain]

    # define file name
    gtdb_ref_tree = '%s/%s_r214.tree' % (db_dir, file_prefix)
    gtdb_gnm_metadata = '%s/%s_metadata_r214.tsv' % (db_dir, file_prefix)

    ref_tree_gnm_set = set(Tree(gtdb_ref_tree, quoted_node_names=True, format=1).get_leaf_names())

    # read in user_gnm_taxon: for every rank, taxon -> set of user genomes
    user_gnm_taxon_dicts = {rank: dict() for rank in rank_order}
    with open(user_gnm_taxon) as user_gnm_taxon_handle:
        for each_gnm in user_gnm_taxon_handle:
            if each_gnm.startswith('user_genome\t'):
                continue
            each_gnm_split = each_gnm.strip().split('\t')
            if len(each_gnm_split) < 2:
                continue  # blank or incomplete line
            gnm_id = each_gnm_split[0]
            gnm_taxon = each_gnm_split[1]
            if domain_label in gnm_taxon:
                for rank, taxon in zip(rank_order, sep_taxon_str(gnm_taxon)):
                    user_gnm_taxon_dicts[rank].setdefault(taxon, set()).add(gnm_id)

    # determine rooting rank, start from phylum
    rooting_rank = ''
    for rank in rank_order:
        if len(user_gnm_taxon_dicts[rank]) > 1:
            rooting_rank = rank
            break

    if rooting_rank == '':
        print('All user genomes are from the same genus, program exited!')
        exit()

    rooting_rank_taxon_dict = user_gnm_taxon_dicts[rooting_rank]
    rooting_rank_index = rank_order.index(rooting_rank)

    # pick the first reference genome found for each user taxon at the rooting rank
    col_index = {}
    canditate_gnms_rooting_rank = dict()
    counted_taxons_rooting_rank = set()
    with open(gtdb_gnm_metadata) as gtdb_gnm_metadata_handle:
        for each_ref in gtdb_gnm_metadata_handle:
            each_ref_split = each_ref.strip().split('\t')
            # the header is recognised by its first column only
            if each_ref_split[0] == 'accession':
                col_index = {key: i for i, key in enumerate(each_ref_split)}
                continue
            ref_accession = each_ref_split[0]
            if ref_accession not in ref_tree_gnm_set:
                continue
            gtdb_taxonomy = each_ref_split[col_index['gtdb_taxonomy']]
            gnm_rooting_rank = sep_taxon_str(gtdb_taxonomy)[rooting_rank_index]
            if (gnm_rooting_rank in rooting_rank_taxon_dict) and (gnm_rooting_rank not in counted_taxons_rooting_rank):
                counted_taxons_rooting_rank.add(gnm_rooting_rank)
                canditate_gnms_rooting_rank[ref_accession] = gnm_rooting_rank

    # reference tree reduced to one leaf per taxon, leaves named after the taxon
    ref_tree_rooting_rank = subset_and_rename_tree(gtdb_ref_tree, canditate_gnms_rooting_rank, canditate_gnms_rooting_rank)

    # get the smallest out group taxon set
    smallest_outgroup_taxon_set = set(get_smallest_outgroup(ref_tree_rooting_rank))

    # split the user genomes by whether their taxon is in that clade
    out_group_gnm_set_1 = set()
    out_group_gnm_set_2 = set()
    for each_rooting_rank_taxon, gnm_member_set in rooting_rank_taxon_dict.items():
        if each_rooting_rank_taxon in smallest_outgroup_taxon_set:
            out_group_gnm_set_1.update(gnm_member_set)
        else:
            out_group_gnm_set_2.update(gnm_member_set)

    # select the smaller one as outgroup
    if len(out_group_gnm_set_1) < len(out_group_gnm_set_2):
        out_group_gnm_set = out_group_gnm_set_1
    else:
        out_group_gnm_set = out_group_gnm_set_2

    # root user tree with identified out group genomes
    root_with_outgroup(input_unrooted_tree, out_group_gnm_set, rooted_tree)
276
+
277
+
278
if __name__ == '__main__':

    # command line options, in the order they are registered with the parser
    cli_options = (
        ('-tree', dict(required=True, help='input unrooted tree')),
        ('-tax', dict(required=False, default='fna', help='leaf taxon')),
        ('-db', dict(required=True, help='GTDB database files')),
        ('-d', dict(required=False, default=None, help='domain, either ar or bac')),
        ('-o', dict(required=True, help='output folder')),
    )

    RootTreeGTDB214_parser = argparse.ArgumentParser(usage=RootTreeGTDB214_usage)
    for option_flag, option_settings in cli_options:
        RootTreeGTDB214_parser.add_argument(option_flag, **option_settings)

    # hand the parsed options over as a plain dict
    args = vars(RootTreeGTDB214_parser.parse_args())
    RootTreeGTDB214(args)
288
+