treesak 1.51.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of treesak might be problematic. Click here for more details.

Files changed (125) hide show
  1. TreeSAK/ALE.py +63 -0
  2. TreeSAK/ALE1.py +268 -0
  3. TreeSAK/ALE2.py +168 -0
  4. TreeSAK/ALE2RTC.py +30 -0
  5. TreeSAK/ALE3.py +205 -0
  6. TreeSAK/ALE4.py +636 -0
  7. TreeSAK/ALE5.py +210 -0
  8. TreeSAK/ALE6.py +401 -0
  9. TreeSAK/ALE7.py +126 -0
  10. TreeSAK/ALE_backup.py +1081 -0
  11. TreeSAK/AssessCVG.py +128 -0
  12. TreeSAK/AssessMarker.py +306 -0
  13. TreeSAK/AssessMarkerDeltaLL.py +257 -0
  14. TreeSAK/AssessMarkerPA.py +317 -0
  15. TreeSAK/AssessPB.py +130 -0
  16. TreeSAK/BMGE.jar +0 -0
  17. TreeSAK/BMGE.py +49 -0
  18. TreeSAK/CompareMCMC.py +138 -0
  19. TreeSAK/ConcateMSA.py +111 -0
  20. TreeSAK/ConvertMSA.py +135 -0
  21. TreeSAK/Dir.rb +82 -0
  22. TreeSAK/ExtractMarkerSeq.py +263 -0
  23. TreeSAK/FastRoot.py +1175 -0
  24. TreeSAK/FastRoot_backup.py +1122 -0
  25. TreeSAK/FigTree.py +34 -0
  26. TreeSAK/GTDB_tree.py +76 -0
  27. TreeSAK/GeneTree.py +142 -0
  28. TreeSAK/KEGG_Luo17.py +807 -0
  29. TreeSAK/LcaToLeaves.py +66 -0
  30. TreeSAK/MarkerRef2Tree.py +616 -0
  31. TreeSAK/MarkerRef2Tree_backup.py +628 -0
  32. TreeSAK/MarkerSeq2Tree.py +290 -0
  33. TreeSAK/MarkerSeq2Tree_backup.py +259 -0
  34. TreeSAK/ModifyTopo.py +116 -0
  35. TreeSAK/Newick_tree_plotter.py +79 -0
  36. TreeSAK/OMA.py +170 -0
  37. TreeSAK/OMA2.py +212 -0
  38. TreeSAK/OneLineAln.py +50 -0
  39. TreeSAK/PB.py +155 -0
  40. TreeSAK/PMSF.py +106 -0
  41. TreeSAK/PhyloBiAssoc.R +84 -0
  42. TreeSAK/PhyloBiAssoc.py +167 -0
  43. TreeSAK/PlotMCMC.py +41 -0
  44. TreeSAK/PlotMcmcNode.py +152 -0
  45. TreeSAK/PlotMcmcNode_old.py +252 -0
  46. TreeSAK/RootTree.py +101 -0
  47. TreeSAK/RootTreeGTDB214.py +288 -0
  48. TreeSAK/RootTreeGTDB220.py +300 -0
  49. TreeSAK/RootTreeGTDB226.py +300 -0
  50. TreeSAK/SequentialDating.py +16 -0
  51. TreeSAK/SingleAleHGT.py +157 -0
  52. TreeSAK/SingleLinePhy.py +50 -0
  53. TreeSAK/SliceMSA.py +142 -0
  54. TreeSAK/SplitScore.py +19 -0
  55. TreeSAK/SplitScore1.py +178 -0
  56. TreeSAK/SplitScore1OMA.py +148 -0
  57. TreeSAK/SplitScore2.py +597 -0
  58. TreeSAK/TaxaCountStats.R +256 -0
  59. TreeSAK/TaxonTree.py +47 -0
  60. TreeSAK/TreeSAK_config.py +32 -0
  61. TreeSAK/VERSION +158 -0
  62. TreeSAK/VisHPD95.R +45 -0
  63. TreeSAK/VisHPD95.py +200 -0
  64. TreeSAK/__init__.py +0 -0
  65. TreeSAK/ale_parser.py +74 -0
  66. TreeSAK/ale_splitter.py +63 -0
  67. TreeSAK/alignment_pruner.pl +1471 -0
  68. TreeSAK/assessOG.py +45 -0
  69. TreeSAK/catfasta2phy.py +140 -0
  70. TreeSAK/cogTree.py +185 -0
  71. TreeSAK/compare_trees.R +30 -0
  72. TreeSAK/compare_trees.py +255 -0
  73. TreeSAK/dating.py +264 -0
  74. TreeSAK/dating_ss.py +361 -0
  75. TreeSAK/deltall.py +82 -0
  76. TreeSAK/do_rrtc.rb +464 -0
  77. TreeSAK/fa2phy.py +42 -0
  78. TreeSAK/format_leaf_name.py +70 -0
  79. TreeSAK/gap_stats.py +38 -0
  80. TreeSAK/get_SCG_tree.py +742 -0
  81. TreeSAK/get_arCOG_seq.py +97 -0
  82. TreeSAK/global_functions.py +222 -0
  83. TreeSAK/gnm_leaves.py +43 -0
  84. TreeSAK/iTOL.py +791 -0
  85. TreeSAK/iTOL_gene_tree.py +80 -0
  86. TreeSAK/itol_msa_stats.py +56 -0
  87. TreeSAK/keep_highest_rrtc.py +37 -0
  88. TreeSAK/koTree.py +194 -0
  89. TreeSAK/label_tree.R +75 -0
  90. TreeSAK/label_tree.py +121 -0
  91. TreeSAK/mad.py +708 -0
  92. TreeSAK/mcmc2tree.py +58 -0
  93. TreeSAK/mcmcTC copy.py +92 -0
  94. TreeSAK/mcmcTC.py +104 -0
  95. TreeSAK/mcmctree_vs_reltime.R +44 -0
  96. TreeSAK/mcmctree_vs_reltime.py +252 -0
  97. TreeSAK/merge_pdf.py +32 -0
  98. TreeSAK/pRTC.py +56 -0
  99. TreeSAK/parse_mcmctree.py +198 -0
  100. TreeSAK/parse_reltime.py +141 -0
  101. TreeSAK/phy2fa.py +37 -0
  102. TreeSAK/plot_distruibution_th.py +165 -0
  103. TreeSAK/prep_mcmctree_ctl.py +92 -0
  104. TreeSAK/print_leaves.py +32 -0
  105. TreeSAK/pruneMSA.py +63 -0
  106. TreeSAK/recode.py +73 -0
  107. TreeSAK/remove_bias.R +112 -0
  108. TreeSAK/rename_leaves.py +77 -0
  109. TreeSAK/replace_clade.py +55 -0
  110. TreeSAK/root_with_out_group.py +84 -0
  111. TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
  112. TreeSAK/subsample_drep_gnms.py +74 -0
  113. TreeSAK/subset.py +69 -0
  114. TreeSAK/subset_tree_stupid_old_way.py +193 -0
  115. TreeSAK/supertree.py +330 -0
  116. TreeSAK/tmp_1.py +19 -0
  117. TreeSAK/tmp_2.py +19 -0
  118. TreeSAK/tmp_3.py +120 -0
  119. TreeSAK/weighted_rand.rb +23 -0
  120. treesak-1.51.2.data/scripts/TreeSAK +950 -0
  121. treesak-1.51.2.dist-info/LICENSE +674 -0
  122. treesak-1.51.2.dist-info/METADATA +27 -0
  123. treesak-1.51.2.dist-info/RECORD +125 -0
  124. treesak-1.51.2.dist-info/WHEEL +5 -0
  125. treesak-1.51.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,300 @@
1
+ import random
2
+ import dendropy
3
+ import argparse
4
+ from ete3 import Tree
5
+
6
+
7
+ RootTreeGTDB220_usage = '''
8
+ ========================================== RootTreeGTDB220 example command ==========================================
9
+
10
+ TreeSAK RootTreeGTDB220 -add_root -d ar -tree ar53.tree -tax ar53.summary.tsv -db db_dir -o ar53.rooted.tree
11
+ TreeSAK RootTreeGTDB220 -add_root -d bac -tree bac120.tree -tax bac120.summary.tsv -db db_dir -o bac120.rooted.tree
12
+
13
+ # Need to download and decompress the following files to your database folder (provide with -db)
14
+ https://data.ace.uq.edu.au/public/gtdb/data/releases/release220/220.0/ar53_r220.tree.tar.gz
15
+ https://data.ace.uq.edu.au/public/gtdb/data/releases/release220/220.0/bac120_r220.tree.tar.gz
16
+ https://data.ace.uq.edu.au/public/gtdb/data/releases/release220/220.0/ar53_metadata_r220.tsv.gz
17
+ https://data.ace.uq.edu.au/public/gtdb/data/releases/release220/220.0/bac120_metadata_r220.tsv.gz
18
+
19
+ =====================================================================================================================
20
+ '''
21
+
22
+
23
def get_smallest_outgroup(tree_object):
    """
    Return the leaf names found under the root child that holds the fewest leaves.

    tree_object: node exposing a ``children`` sequence; every child has to provide
                 ``get_leaf_names()`` (for example an ete3 Tree).

    Returns a list of leaf names. If several children tie for the smallest size,
    the last one of them is returned. A node without children gives an empty list.
    """

    out_group_leaf_list = []
    min_outgroup_leaf_num = None  # no size seen yet; avoids an arbitrary upper bound on clade size
    for each_root_child in tree_object.children:
        leaf_list = each_root_child.get_leaf_names()
        # "<=" keeps the established tie-break: the last child with the smallest size wins
        if min_outgroup_leaf_num is None or len(leaf_list) <= min_outgroup_leaf_num:
            min_outgroup_leaf_num = len(leaf_list)
            out_group_leaf_list = leaf_list

    return out_group_leaf_list
38
+
39
+
40
def sep_taxon_str(taxon_string):
    """
    Split a ';'-delimited taxonomy string and return fields 1 to 5 as a tuple.

    The callers unpack the result as (phylum, class, order, family, genus);
    field 0 and anything after field 5 are ignored.
    Raises IndexError if the string has fewer than six ';'-separated fields.
    """

    rank_fields = taxon_string.strip().split(';')

    return tuple(rank_fields[field_index] for field_index in range(1, 6))
50
+
51
+
52
def subset_and_rename_tree(tree_file_in, to_keep_leaf_list, rename_dict):
    """
    Load a tree, keep only the requested leaves and rename them.

    tree_file_in:      tree handed to ete3 ``Tree`` (read with format=1 and quoted node names).
    to_keep_leaf_list: leaves to keep; iterated by ``prune``, so a dict keyed by leaf name works too.
    rename_dict:       maps current leaf names to new ones; leaves without an entry keep their name.

    Returns the pruned and renamed ete3 tree.
    """

    # The tree is freshly loaded and only referenced here, so it is pruned in place.
    # Deep-copying it first would only cost time and memory on large reference trees.
    subset_tree = Tree(tree_file_in, quoted_node_names=True, format=1)
    subset_tree.prune(to_keep_leaf_list, preserve_branch_length=True)

    # rename leaf
    for each_leaf in subset_tree:
        each_leaf.name = rename_dict.get(each_leaf.name, each_leaf.name)

    return subset_tree
66
+
67
+
68
def root_with_outgroup(input_tree, out_group_list, add_root_branch, tree_file_rooted):

    """
    Reroot a Newick tree on the given outgroup and write the rooted tree to a file.
    modified based on: https://github.com/Ecogenomics/GTDBTk/blob/master/gtdbtk/reroot_tree.py

    input_tree:       File containing the Newick tree to be rerooted.
    out_group_list:   Labels of the taxa in the outgroup (any container supporting "in").
    add_root_branch:  If True, wrap the rooted tree in one more clade carrying a 0.02 root branch.
    tree_file_rooted: Name of the file for the rerooted tree.

    Raises ValueError if no outgroup label is a leaf of the tree,
    or if every leaf of the tree belongs to the outgroup.
    """

    tree = dendropy.Tree.get_from_path(input_tree, schema='newick', rooting='force-rooted', preserve_underscores=True)

    outgroup_in_tree = set()
    ingroup_leaves = []  # a list, so a random leaf can be drawn without rebuilding it on every iteration
    for n in tree.leaf_node_iter():
        if n.taxon.label in out_group_list:
            outgroup_in_tree.add(n.taxon)
        else:
            ingroup_leaves.append(n)

    # fail early with a clear message instead of an opaque error from the MRCA search or the random draw
    if not outgroup_in_tree:
        raise ValueError('None of the outgroup taxa was found in %s' % input_tree)
    if not ingroup_leaves:
        raise ValueError('All leaves in %s belong to the outgroup, no ingroup left to root against' % input_tree)

    # Since finding the MRCA is a rooted tree operation, the tree is first rerooted on an ingroup taxa. This
    # ensures the MRCA of the outgroup can be identified so long as the outgroup is monophyletic. If the
    # outgroup is polyphyletic trying to root on it is ill-defined. To try and pick a "good" root for
    # polyphyletic outgroups, random ingroup taxa are selected until two of them give the same size
    # lineage. This will, likely, be the smallest bipartition possible for the given outgroup though
    # this is not guaranteed.

    mrca = tree.mrca(taxa=outgroup_in_tree)
    mrca_leaves = len(mrca.leaf_nodes())
    while True:
        rnd_ingroup = random.choice(ingroup_leaves)
        half_edge_length = 0.5 * rnd_ingroup.edge_length
        tree.reroot_at_edge(rnd_ingroup.edge, length1=half_edge_length, length2=half_edge_length)
        mrca = tree.mrca(taxa=outgroup_in_tree)
        current_mrca_leaves = len(mrca.leaf_nodes())
        if current_mrca_leaves == mrca_leaves:
            break
        mrca_leaves = current_mrca_leaves

    # no edge length on the MRCA: the tree is left as rooted by the last random draw
    if mrca.edge_length is not None:
        tree.reroot_at_edge(mrca.edge, length1=0.5 * mrca.edge_length, length2=0.5 * mrca.edge_length)

    tree_out_string = tree.as_string(schema='newick', suppress_rooting=True, unquoted_underscores=True)
    tree_out_string = tree_out_string.replace("'", "")

    # add the root bar
    if add_root_branch is True:
        tree_out_string = '(' + tree_out_string
        tree_out_string = tree_out_string.replace(');', '):0.02);')

    # write out tree string
    with open(tree_file_rooted, 'w') as tree_file_rooted_handle:
        tree_file_rooted_handle.write(tree_out_string)
122
+
123
+
124
def RootTreeGTDB220(args):
    """
    Root a user tree with an outgroup picked with the help of the GTDB r220 reference tree.

    args: dict with the keys
        tree      input unrooted tree
        tax       tab-separated file: genome id in column 1, ';'-delimited taxonomy in column 2;
                  a line starting with "user_genome<TAB>" is skipped as header
        db        folder holding ar53_r220.tree / bac120_r220.tree and the matching *_metadata_r220.tsv
        d         domain, either "ar" or "bac"
        add_root  True to add a root branch to the written tree
        o         file the rooted tree is written to

    Steps: group the user genomes by taxon at each rank (phylum to genus), take the highest rank with more
    than one taxon, keep one reference genome per such taxon in the GTDB tree, take the smallest clade under
    the root of that pruned tree, and root the user tree on the smaller of the two resulting genome groups.

    Prints a message and raises SystemExit(1) if the domain is invalid or if the user genomes
    do not span at least two taxa at any rank.
    """

    input_unrooted_tree = args['tree']
    user_gnm_taxon      = args['tax']
    db_dir              = args['db']
    gnm_domain          = args['d']
    add_root_branch     = args['add_root']
    rooted_tree         = args['o']

    # a set keeps the per-genome membership test below at O(1)
    leaf_set = {leaf.name for leaf in Tree(input_unrooted_tree, format=1)}

    # define file name
    if gnm_domain == 'bac':
        gtdb_ref_tree     = '%s/bac120_r220.tree' % db_dir
        gtdb_gnm_metadata = '%s/bac120_metadata_r220.tsv' % db_dir
        domain_tag        = 'd__Bacteria'
    elif gnm_domain == 'ar':
        gtdb_ref_tree     = '%s/ar53_r220.tree' % db_dir
        gtdb_gnm_metadata = '%s/ar53_metadata_r220.tsv' % db_dir
        domain_tag        = 'd__Archaea'
    else:
        print('please provide either "ar" or "bac" to -d')
        raise SystemExit(1)

    ref_tree_gnm_set = set(Tree(gtdb_ref_tree, quoted_node_names=True, format=1).get_leaf_names())

    # read in user_gnm_taxon: for every rank, taxon -> set of user genomes
    rank_order = ('p', 'c', 'o', 'f', 'g')  # same order as the tuple returned by sep_taxon_str
    user_gnm_taxon_dicts = {rank: dict() for rank in rank_order}
    with open(user_gnm_taxon) as user_gnm_taxon_handle:
        for each_gnm in user_gnm_taxon_handle:
            if each_gnm.startswith('user_genome\t'):
                continue
            each_gnm_split = each_gnm.strip().split('\t')
            if len(each_gnm_split) < 2:
                continue  # blank line or line without a taxonomy column
            gnm_id = each_gnm_split[0]
            gnm_taxon = each_gnm_split[1]

            # only genomes that are in the tree and belong to the requested domain are counted
            if (gnm_id in leaf_set) and (domain_tag in gnm_taxon):
                for rank, rank_taxon in zip(rank_order, sep_taxon_str(gnm_taxon)):
                    user_gnm_taxon_dicts[rank].setdefault(rank_taxon, set()).add(gnm_id)

    # determine rooting rank, start from phylum
    rooting_rank = ''
    for rank in rank_order:
        if len(user_gnm_taxon_dicts[rank]) > 1:
            rooting_rank = rank
            break

    if rooting_rank == '':
        print('All user genomes are from the same genus, program exited!')
        raise SystemExit(1)

    rooting_rank_taxon_dict = user_gnm_taxon_dicts[rooting_rank]
    rooting_rank_index = rank_order.index(rooting_rank)

    # pick one reference genome (the first one met in the metadata file) for each user taxon at the rooting rank
    col_index = {}
    canditate_gnms_rooting_rank = dict()
    counted_taxons_rooting_rank = set()
    with open(gtdb_gnm_metadata) as gtdb_gnm_metadata_handle:
        for each_ref in gtdb_gnm_metadata_handle:
            each_ref_split = each_ref.strip().split('\t')

            # header line: recognised by its first column name only, independent of the columns that follow
            if each_ref_split[0] == 'accession':
                col_index = {key: i for i, key in enumerate(each_ref_split)}
                continue

            ref_accession = each_ref_split[0]
            if ref_accession not in ref_tree_gnm_set:
                continue

            gtdb_taxonomy = each_ref_split[col_index['gtdb_taxonomy']]
            gnm_rooting_rank = sep_taxon_str(gtdb_taxonomy)[rooting_rank_index]
            if (gnm_rooting_rank in rooting_rank_taxon_dict) and (gnm_rooting_rank not in counted_taxons_rooting_rank):
                counted_taxons_rooting_rank.add(gnm_rooting_rank)
                canditate_gnms_rooting_rank[ref_accession] = gnm_rooting_rank

    # the dict serves twice: its keys are the leaves to keep, its values the new (taxon) leaf names
    ref_tree_rooting_rank = subset_and_rename_tree(gtdb_ref_tree, canditate_gnms_rooting_rank, canditate_gnms_rooting_rank)

    # get the smallest out group taxon set
    smallest_outgroup_taxon_list = get_smallest_outgroup(ref_tree_rooting_rank)

    # split the user genomes into those inside / outside the smallest reference clade
    out_group_gnm_set_1 = set()
    out_group_gnm_set_2 = set()
    for each_rooting_rank_taxon, gnm_member_set in rooting_rank_taxon_dict.items():
        if each_rooting_rank_taxon in smallest_outgroup_taxon_list:
            out_group_gnm_set_1.update(gnm_member_set)
        else:
            out_group_gnm_set_2.update(gnm_member_set)

    # select the smaller one as outgroup
    if len(out_group_gnm_set_1) < len(out_group_gnm_set_2):
        out_group_gnm_set = out_group_gnm_set_1
    else:
        out_group_gnm_set = out_group_gnm_set_2

    # root user tree with identified out group genomes
    root_with_outgroup(input_unrooted_tree, out_group_gnm_set, add_root_branch, rooted_tree)
288
+
289
+
290
if __name__ == '__main__':

    # command-line entry point: parse the options and pass them on as a plain dict
    RootTreeGTDB220_parser = argparse.ArgumentParser(usage=RootTreeGTDB220_usage)
    RootTreeGTDB220_parser.add_argument('-tree',     required=True,                         help='input unrooted tree')
    # -tax is opened as a file, so it has to be supplied (there is no meaningful default)
    RootTreeGTDB220_parser.add_argument('-tax',      required=True,                         help='leaf taxon file, e.g. ar53.summary.tsv')
    RootTreeGTDB220_parser.add_argument('-db',       required=True,                         help='GTDB database files')
    RootTreeGTDB220_parser.add_argument('-d',        required=True, choices=['ar', 'bac'],  help='domain, either ar or bac')
    RootTreeGTDB220_parser.add_argument('-add_root', required=False, action='store_true',   help='add the root branch')
    RootTreeGTDB220_parser.add_argument('-o',        required=True,                         help='output rooted tree file')
    args = vars(RootTreeGTDB220_parser.parse_args())
    RootTreeGTDB220(args)
@@ -0,0 +1,300 @@
1
+ import random
2
+ import dendropy
3
+ import argparse
4
+ from ete3 import Tree
5
+
6
+
7
+ RootTreeGTDB226_usage = '''
8
+ ========================================== RootTreeGTDB226 example command ==========================================
9
+
10
+ TreeSAK RootTreeGTDB226 -add_root -d ar -tree ar53.tree -tax ar53.summary.tsv -db db_dir -o ar53.rooted.tree
11
+ TreeSAK RootTreeGTDB226 -add_root -d bac -tree bac120.tree -tax bac120.summary.tsv -db db_dir -o bac120.rooted.tree
12
+
13
+ # Need to download and decompress the following files to your database folder (provide with -db)
14
+ https://data.ace.uq.edu.au/public/gtdb/data/releases/release226/226.0/ar53_r226.tree.tar.gz
15
+ https://data.ace.uq.edu.au/public/gtdb/data/releases/release226/226.0/bac120_r226.tree.tar.gz
16
+ https://data.ace.uq.edu.au/public/gtdb/data/releases/release226/226.0/ar53_metadata_r226.tsv.gz
17
+ https://data.ace.uq.edu.au/public/gtdb/data/releases/release226/226.0/bac120_metadata_r226.tsv.gz
18
+
19
+ =====================================================================================================================
20
+ '''
21
+
22
+
23
def get_smallest_outgroup(tree_object):
    """
    Return the leaf names found under the root child that holds the fewest leaves.

    tree_object: node exposing a ``children`` sequence; every child has to provide
                 ``get_leaf_names()`` (for example an ete3 Tree).

    Returns a list of leaf names. If several children tie for the smallest size,
    the last one of them is returned. A node without children gives an empty list.
    """

    out_group_leaf_list = []
    min_outgroup_leaf_num = None  # no size seen yet; avoids an arbitrary upper bound on clade size
    for each_root_child in tree_object.children:
        leaf_list = each_root_child.get_leaf_names()
        # "<=" keeps the established tie-break: the last child with the smallest size wins
        if min_outgroup_leaf_num is None or len(leaf_list) <= min_outgroup_leaf_num:
            min_outgroup_leaf_num = len(leaf_list)
            out_group_leaf_list = leaf_list

    return out_group_leaf_list
38
+
39
+
40
def sep_taxon_str(taxon_string):
    """
    Split a ';'-delimited taxonomy string and return fields 1 to 5 as a tuple.

    The callers unpack the result as (phylum, class, order, family, genus);
    field 0 and anything after field 5 are ignored.
    Raises IndexError if the string has fewer than six ';'-separated fields.
    """

    rank_fields = taxon_string.strip().split(';')

    return tuple(rank_fields[field_index] for field_index in range(1, 6))
50
+
51
+
52
def subset_and_rename_tree(tree_file_in, to_keep_leaf_list, rename_dict):
    """
    Load a tree, keep only the requested leaves and rename them.

    tree_file_in:      tree handed to ete3 ``Tree`` (read with format=1 and quoted node names).
    to_keep_leaf_list: leaves to keep; iterated by ``prune``, so a dict keyed by leaf name works too.
    rename_dict:       maps current leaf names to new ones; leaves without an entry keep their name.

    Returns the pruned and renamed ete3 tree.
    """

    # The tree is freshly loaded and only referenced here, so it is pruned in place.
    # Deep-copying it first would only cost time and memory on large reference trees.
    subset_tree = Tree(tree_file_in, quoted_node_names=True, format=1)
    subset_tree.prune(to_keep_leaf_list, preserve_branch_length=True)

    # rename leaf
    for each_leaf in subset_tree:
        each_leaf.name = rename_dict.get(each_leaf.name, each_leaf.name)

    return subset_tree
66
+
67
+
68
def root_with_outgroup(input_tree, out_group_list, add_root_branch, tree_file_rooted):

    """
    Reroot a Newick tree on the given outgroup and write the rooted tree to a file.
    modified based on: https://github.com/Ecogenomics/GTDBTk/blob/master/gtdbtk/reroot_tree.py

    input_tree:       File containing the Newick tree to be rerooted.
    out_group_list:   Labels of the taxa in the outgroup (any container supporting "in").
    add_root_branch:  If True, wrap the rooted tree in one more clade carrying a 0.02 root branch.
    tree_file_rooted: Name of the file for the rerooted tree.

    Raises ValueError if no outgroup label is a leaf of the tree,
    or if every leaf of the tree belongs to the outgroup.
    """

    tree = dendropy.Tree.get_from_path(input_tree, schema='newick', rooting='force-rooted', preserve_underscores=True)

    outgroup_in_tree = set()
    ingroup_leaves = []  # a list, so a random leaf can be drawn without rebuilding it on every iteration
    for n in tree.leaf_node_iter():
        if n.taxon.label in out_group_list:
            outgroup_in_tree.add(n.taxon)
        else:
            ingroup_leaves.append(n)

    # fail early with a clear message instead of an opaque error from the MRCA search or the random draw
    if not outgroup_in_tree:
        raise ValueError('None of the outgroup taxa was found in %s' % input_tree)
    if not ingroup_leaves:
        raise ValueError('All leaves in %s belong to the outgroup, no ingroup left to root against' % input_tree)

    # Since finding the MRCA is a rooted tree operation, the tree is first rerooted on an ingroup taxa. This
    # ensures the MRCA of the outgroup can be identified so long as the outgroup is monophyletic. If the
    # outgroup is polyphyletic trying to root on it is ill-defined. To try and pick a "good" root for
    # polyphyletic outgroups, random ingroup taxa are selected until two of them give the same size
    # lineage. This will, likely, be the smallest bipartition possible for the given outgroup though
    # this is not guaranteed.

    mrca = tree.mrca(taxa=outgroup_in_tree)
    mrca_leaves = len(mrca.leaf_nodes())
    while True:
        rnd_ingroup = random.choice(ingroup_leaves)
        half_edge_length = 0.5 * rnd_ingroup.edge_length
        tree.reroot_at_edge(rnd_ingroup.edge, length1=half_edge_length, length2=half_edge_length)
        mrca = tree.mrca(taxa=outgroup_in_tree)
        current_mrca_leaves = len(mrca.leaf_nodes())
        if current_mrca_leaves == mrca_leaves:
            break
        mrca_leaves = current_mrca_leaves

    # no edge length on the MRCA: the tree is left as rooted by the last random draw
    if mrca.edge_length is not None:
        tree.reroot_at_edge(mrca.edge, length1=0.5 * mrca.edge_length, length2=0.5 * mrca.edge_length)

    tree_out_string = tree.as_string(schema='newick', suppress_rooting=True, unquoted_underscores=True)
    tree_out_string = tree_out_string.replace("'", "")

    # add the root bar
    if add_root_branch is True:
        tree_out_string = '(' + tree_out_string
        tree_out_string = tree_out_string.replace(');', '):0.02);')

    # write out tree string
    with open(tree_file_rooted, 'w') as tree_file_rooted_handle:
        tree_file_rooted_handle.write(tree_out_string)
122
+
123
+
124
def RootTreeGTDB226(args):
    """
    Root a user tree with an outgroup picked with the help of the GTDB r226 reference tree.

    args: dict with the keys
        tree      input unrooted tree
        tax       tab-separated file: genome id in column 1, ';'-delimited taxonomy in column 2;
                  a line starting with "user_genome<TAB>" is skipped as header
        db        folder holding ar53_r226.tree / bac120_r226.tree and the matching *_metadata_r226.tsv
        d         domain, either "ar" or "bac"
        add_root  True to add a root branch to the written tree
        o         file the rooted tree is written to

    Steps: group the user genomes by taxon at each rank (phylum to genus), take the highest rank with more
    than one taxon, keep one reference genome per such taxon in the GTDB tree, take the smallest clade under
    the root of that pruned tree, and root the user tree on the smaller of the two resulting genome groups.

    Prints a message and raises SystemExit(1) if the domain is invalid or if the user genomes
    do not span at least two taxa at any rank.
    """

    input_unrooted_tree = args['tree']
    user_gnm_taxon      = args['tax']
    db_dir              = args['db']
    gnm_domain          = args['d']
    add_root_branch     = args['add_root']
    rooted_tree         = args['o']

    # a set keeps the per-genome membership test below at O(1)
    leaf_set = {leaf.name for leaf in Tree(input_unrooted_tree, format=1)}

    # define file name
    if gnm_domain == 'bac':
        gtdb_ref_tree     = '%s/bac120_r226.tree' % db_dir
        gtdb_gnm_metadata = '%s/bac120_metadata_r226.tsv' % db_dir
        domain_tag        = 'd__Bacteria'
    elif gnm_domain == 'ar':
        gtdb_ref_tree     = '%s/ar53_r226.tree' % db_dir
        gtdb_gnm_metadata = '%s/ar53_metadata_r226.tsv' % db_dir
        domain_tag        = 'd__Archaea'
    else:
        print('please provide either "ar" or "bac" to -d')
        raise SystemExit(1)

    ref_tree_gnm_set = set(Tree(gtdb_ref_tree, quoted_node_names=True, format=1).get_leaf_names())

    # read in user_gnm_taxon: for every rank, taxon -> set of user genomes
    rank_order = ('p', 'c', 'o', 'f', 'g')  # same order as the tuple returned by sep_taxon_str
    user_gnm_taxon_dicts = {rank: dict() for rank in rank_order}
    with open(user_gnm_taxon) as user_gnm_taxon_handle:
        for each_gnm in user_gnm_taxon_handle:
            if each_gnm.startswith('user_genome\t'):
                continue
            each_gnm_split = each_gnm.strip().split('\t')
            if len(each_gnm_split) < 2:
                continue  # blank line or line without a taxonomy column
            gnm_id = each_gnm_split[0]
            gnm_taxon = each_gnm_split[1]

            # only genomes that are in the tree and belong to the requested domain are counted
            if (gnm_id in leaf_set) and (domain_tag in gnm_taxon):
                for rank, rank_taxon in zip(rank_order, sep_taxon_str(gnm_taxon)):
                    user_gnm_taxon_dicts[rank].setdefault(rank_taxon, set()).add(gnm_id)

    # determine rooting rank, start from phylum
    rooting_rank = ''
    for rank in rank_order:
        if len(user_gnm_taxon_dicts[rank]) > 1:
            rooting_rank = rank
            break

    if rooting_rank == '':
        print('All user genomes are from the same genus, program exited!')
        raise SystemExit(1)

    rooting_rank_taxon_dict = user_gnm_taxon_dicts[rooting_rank]
    rooting_rank_index = rank_order.index(rooting_rank)

    # pick one reference genome (the first one met in the metadata file) for each user taxon at the rooting rank
    col_index = {}
    canditate_gnms_rooting_rank = dict()
    counted_taxons_rooting_rank = set()
    with open(gtdb_gnm_metadata) as gtdb_gnm_metadata_handle:
        for each_ref in gtdb_gnm_metadata_handle:
            each_ref_split = each_ref.strip().split('\t')

            # header line: recognised by its first column name only, independent of the columns that follow
            if each_ref_split[0] == 'accession':
                col_index = {key: i for i, key in enumerate(each_ref_split)}
                continue

            ref_accession = each_ref_split[0]
            if ref_accession not in ref_tree_gnm_set:
                continue

            gtdb_taxonomy = each_ref_split[col_index['gtdb_taxonomy']]
            gnm_rooting_rank = sep_taxon_str(gtdb_taxonomy)[rooting_rank_index]
            if (gnm_rooting_rank in rooting_rank_taxon_dict) and (gnm_rooting_rank not in counted_taxons_rooting_rank):
                counted_taxons_rooting_rank.add(gnm_rooting_rank)
                canditate_gnms_rooting_rank[ref_accession] = gnm_rooting_rank

    # the dict serves twice: its keys are the leaves to keep, its values the new (taxon) leaf names
    ref_tree_rooting_rank = subset_and_rename_tree(gtdb_ref_tree, canditate_gnms_rooting_rank, canditate_gnms_rooting_rank)

    # get the smallest out group taxon set
    smallest_outgroup_taxon_list = get_smallest_outgroup(ref_tree_rooting_rank)

    # split the user genomes into those inside / outside the smallest reference clade
    out_group_gnm_set_1 = set()
    out_group_gnm_set_2 = set()
    for each_rooting_rank_taxon, gnm_member_set in rooting_rank_taxon_dict.items():
        if each_rooting_rank_taxon in smallest_outgroup_taxon_list:
            out_group_gnm_set_1.update(gnm_member_set)
        else:
            out_group_gnm_set_2.update(gnm_member_set)

    # select the smaller one as outgroup
    if len(out_group_gnm_set_1) < len(out_group_gnm_set_2):
        out_group_gnm_set = out_group_gnm_set_1
    else:
        out_group_gnm_set = out_group_gnm_set_2

    # root user tree with identified out group genomes
    root_with_outgroup(input_unrooted_tree, out_group_gnm_set, add_root_branch, rooted_tree)
288
+
289
+
290
if __name__ == '__main__':

    # command-line entry point: parse the options and pass them on as a plain dict
    RootTreeGTDB226_parser = argparse.ArgumentParser(usage=RootTreeGTDB226_usage)
    RootTreeGTDB226_parser.add_argument('-tree',     required=True,                         help='input unrooted tree')
    # -tax is opened as a file, so it has to be supplied (there is no meaningful default)
    RootTreeGTDB226_parser.add_argument('-tax',      required=True,                         help='leaf taxon file, e.g. ar53.summary.tsv')
    RootTreeGTDB226_parser.add_argument('-db',       required=True,                         help='GTDB database files')
    RootTreeGTDB226_parser.add_argument('-d',        required=True, choices=['ar', 'bac'],  help='domain, either ar or bac')
    RootTreeGTDB226_parser.add_argument('-add_root', required=False, action='store_true',   help='add the root branch')
    RootTreeGTDB226_parser.add_argument('-o',        required=True,                         help='output rooted tree file')
    args = vars(RootTreeGTDB226_parser.parse_args())
    RootTreeGTDB226(args)
@@ -0,0 +1,16 @@
1
+
2
+ SequentialDating_usage = '''
3
+ ======================== SequentialDating example commands ========================
4
+
5
+ TreeSAK SequentialDating -h
6
+
7
+ ===================================================================================
8
+ '''
9
+
10
+
11
def SequentialDating():
    """Placeholder for the SequentialDating module: takes no arguments, does nothing and returns None."""


if __name__ == '__main__':
    # run only when executed as a script, so that importing the module triggers no call
    SequentialDating()