treesak 1.53.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. TreeSAK/ALE.py +63 -0
  2. TreeSAK/ALE1.py +268 -0
  3. TreeSAK/ALE2.py +168 -0
  4. TreeSAK/ALE2RTC.py +30 -0
  5. TreeSAK/ALE3.py +205 -0
  6. TreeSAK/ALE4.py +636 -0
  7. TreeSAK/ALE5.py +210 -0
  8. TreeSAK/ALE6.py +401 -0
  9. TreeSAK/ALE7.py +126 -0
  10. TreeSAK/ALE_backup.py +1081 -0
  11. TreeSAK/AssessCVG.py +128 -0
  12. TreeSAK/AssessMarker.py +306 -0
  13. TreeSAK/AssessMarkerDeltaLL.py +257 -0
  14. TreeSAK/AssessMarkerPA.py +317 -0
  15. TreeSAK/AssessPB.py +113 -0
  16. TreeSAK/BMGE.jar +0 -0
  17. TreeSAK/BMGE.py +49 -0
  18. TreeSAK/C60SR4.nex +127 -0
  19. TreeSAK/CompareMCMC.py +138 -0
  20. TreeSAK/ConcateMSA.py +111 -0
  21. TreeSAK/ConvertMSA.py +135 -0
  22. TreeSAK/Dir.rb +82 -0
  23. TreeSAK/ExtractMarkerSeq.py +263 -0
  24. TreeSAK/FastRoot.py +1175 -0
  25. TreeSAK/FastRoot_backup.py +1122 -0
  26. TreeSAK/FigTree.py +34 -0
  27. TreeSAK/GTDB_tree.py +76 -0
  28. TreeSAK/GeneTree.py +142 -0
  29. TreeSAK/KEGG_Luo17.py +807 -0
  30. TreeSAK/LcaToLeaves.py +66 -0
  31. TreeSAK/MarkerRef2Tree.py +616 -0
  32. TreeSAK/MarkerRef2Tree_backup.py +628 -0
  33. TreeSAK/MarkerSeq2Tree.py +299 -0
  34. TreeSAK/MarkerSeq2Tree_backup.py +259 -0
  35. TreeSAK/ModifyTopo.py +116 -0
  36. TreeSAK/Newick_tree_plotter.py +79 -0
  37. TreeSAK/OMA.py +170 -0
  38. TreeSAK/OMA2.py +212 -0
  39. TreeSAK/OneLineAln.py +50 -0
  40. TreeSAK/PB.py +155 -0
  41. TreeSAK/PMSF.py +115 -0
  42. TreeSAK/PhyloBiAssoc.R +84 -0
  43. TreeSAK/PhyloBiAssoc.py +167 -0
  44. TreeSAK/PlotMCMC.py +41 -0
  45. TreeSAK/PlotMcmcNode.py +152 -0
  46. TreeSAK/PlotMcmcNode_old.py +252 -0
  47. TreeSAK/RootTree.py +101 -0
  48. TreeSAK/RootTreeGTDB.py +371 -0
  49. TreeSAK/RootTreeGTDB214.py +288 -0
  50. TreeSAK/RootTreeGTDB220.py +300 -0
  51. TreeSAK/SequentialDating.py +16 -0
  52. TreeSAK/SingleAleHGT.py +157 -0
  53. TreeSAK/SingleLinePhy.py +50 -0
  54. TreeSAK/SliceMSA.py +142 -0
  55. TreeSAK/SplitScore.py +21 -0
  56. TreeSAK/SplitScore1.py +177 -0
  57. TreeSAK/SplitScore1OMA.py +148 -0
  58. TreeSAK/SplitScore2.py +608 -0
  59. TreeSAK/TaxaCountStats.R +256 -0
  60. TreeSAK/TaxonTree.py +47 -0
  61. TreeSAK/TreeSAK_config.py +32 -0
  62. TreeSAK/VERSION +164 -0
  63. TreeSAK/VisHPD95.R +45 -0
  64. TreeSAK/VisHPD95.py +200 -0
  65. TreeSAK/__init__.py +0 -0
  66. TreeSAK/ale_parser.py +74 -0
  67. TreeSAK/ale_splitter.py +63 -0
  68. TreeSAK/alignment_pruner.pl +1471 -0
  69. TreeSAK/assessOG.py +45 -0
  70. TreeSAK/batch_itol.py +171 -0
  71. TreeSAK/catfasta2phy.py +140 -0
  72. TreeSAK/cogTree.py +185 -0
  73. TreeSAK/compare_trees.R +30 -0
  74. TreeSAK/compare_trees.py +255 -0
  75. TreeSAK/dating.py +264 -0
  76. TreeSAK/dating_ss.py +361 -0
  77. TreeSAK/deltall.py +82 -0
  78. TreeSAK/do_rrtc.rb +464 -0
  79. TreeSAK/fa2phy.py +42 -0
  80. TreeSAK/filter_rename_ar53.py +118 -0
  81. TreeSAK/format_leaf_name.py +70 -0
  82. TreeSAK/gap_stats.py +38 -0
  83. TreeSAK/get_SCG_tree.py +742 -0
  84. TreeSAK/get_arCOG_seq.py +97 -0
  85. TreeSAK/global_functions.py +222 -0
  86. TreeSAK/gnm_leaves.py +43 -0
  87. TreeSAK/iTOL.py +791 -0
  88. TreeSAK/iTOL_gene_tree.py +80 -0
  89. TreeSAK/itol_msa_stats.py +56 -0
  90. TreeSAK/keep_highest_rrtc.py +37 -0
  91. TreeSAK/koTree.py +194 -0
  92. TreeSAK/label_gene_tree_by_gnm.py +34 -0
  93. TreeSAK/label_tree.R +75 -0
  94. TreeSAK/label_tree.py +121 -0
  95. TreeSAK/mad.py +708 -0
  96. TreeSAK/mcmc2tree.py +58 -0
  97. TreeSAK/mcmcTC copy.py +92 -0
  98. TreeSAK/mcmcTC.py +104 -0
  99. TreeSAK/mcmctree_vs_reltime.R +44 -0
  100. TreeSAK/mcmctree_vs_reltime.py +252 -0
  101. TreeSAK/merge_pdf.py +32 -0
  102. TreeSAK/pRTC.py +56 -0
  103. TreeSAK/parse_mcmctree.py +198 -0
  104. TreeSAK/parse_reltime.py +141 -0
  105. TreeSAK/phy2fa.py +37 -0
  106. TreeSAK/plot_distruibution_th.py +165 -0
  107. TreeSAK/prep_mcmctree_ctl.py +92 -0
  108. TreeSAK/print_leaves.py +32 -0
  109. TreeSAK/pruneMSA.py +63 -0
  110. TreeSAK/recode.py +73 -0
  111. TreeSAK/remove_bias.R +112 -0
  112. TreeSAK/rename_leaves.py +78 -0
  113. TreeSAK/replace_clade.py +55 -0
  114. TreeSAK/root_with_out_group.py +84 -0
  115. TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
  116. TreeSAK/subsample_drep_gnms.py +74 -0
  117. TreeSAK/subset.py +69 -0
  118. TreeSAK/subset_tree_stupid_old_way.py +193 -0
  119. TreeSAK/supertree.py +330 -0
  120. TreeSAK/tmp_1.py +19 -0
  121. TreeSAK/tmp_2.py +19 -0
  122. TreeSAK/tmp_3.py +120 -0
  123. TreeSAK/tmp_4.py +43 -0
  124. TreeSAK/tmp_5.py +12 -0
  125. TreeSAK/weighted_rand.rb +23 -0
  126. treesak-1.53.3.data/scripts/TreeSAK +955 -0
  127. treesak-1.53.3.dist-info/LICENSE +674 -0
  128. treesak-1.53.3.dist-info/METADATA +27 -0
  129. treesak-1.53.3.dist-info/RECORD +131 -0
  130. treesak-1.53.3.dist-info/WHEEL +5 -0
  131. treesak-1.53.3.dist-info/top_level.txt +1 -0
@@ -0,0 +1,300 @@
1
+ import random
2
+ import dendropy
3
+ import argparse
4
+ from ete3 import Tree
5
+
6
+
7
# Help/usage text for the RootTreeGTDB220 command; it is handed to argparse as
# the `usage` string when this module is run as a script.
RootTreeGTDB220_usage = '''
========================================== RootTreeGTDB220 example command ==========================================

TreeSAK RootTreeGTDB220 -add_root -d ar -tree ar53.tree -tax ar53.summary.tsv -db db_dir -o ar53.rooted.tree
TreeSAK RootTreeGTDB220 -add_root -d bac -tree bac120.tree -tax bac120.summary.tsv -db db_dir -o bac120.rooted.tree

# Need to download and decompress the following files to your database folder (provide with -db)
https://data.ace.uq.edu.au/public/gtdb/data/releases/release220/220.0/ar53_r220.tree.tar.gz
https://data.ace.uq.edu.au/public/gtdb/data/releases/release220/220.0/bac120_r220.tree.tar.gz
https://data.ace.uq.edu.au/public/gtdb/data/releases/release220/220.0/ar53_metadata_r220.tsv.gz
https://data.ace.uq.edu.au/public/gtdb/data/releases/release220/220.0/bac120_metadata_r220.tsv.gz

=====================================================================================================================
'''
21
+
22
+
23
def get_smallest_outgroup(tree_object):
    """Return the leaf names of the root child that has the fewest leaves.

    tree_object: any node exposing `.children`, where every child provides
                 `get_leaf_names()` (e.g. an ete3 tree).

    Returns the list produced by `get_leaf_names()` for the smallest child.
    When several children tie for the smallest size, the last one wins.
    An empty list is returned when the node has no children.
    """

    smallest_leaf_list = []
    smallest_leaf_num = None  # no arbitrary upper bound on the clade size
    for each_root_child in tree_object.children:
        leaf_list = each_root_child.get_leaf_names()
        # `<=` (rather than `<`) lets a later child of equal size replace an
        # earlier one, so ties resolve to the last child
        if (smallest_leaf_num is None) or (len(leaf_list) <= smallest_leaf_num):
            smallest_leaf_num = len(leaf_list)
            smallest_leaf_list = leaf_list

    return smallest_leaf_list
38
+
39
+
40
def sep_taxon_str(taxon_string):
    """Split a ';'-delimited taxonomy string and return its 2nd to 6th fields.

    The first field (index 0) is skipped. Presumably the input is a GTDB-style
    lineage (d__;p__;c__;o__;f__;g__;s__), in which case the returned tuple is
    (phylum, class, order, family, genus) - verify against the caller's data.

    Raises IndexError when the string has fewer than six fields.
    """

    rank_fields = taxon_string.strip().split(';')

    return rank_fields[1], rank_fields[2], rank_fields[3], rank_fields[4], rank_fields[5]
50
+
51
+
52
def subset_and_rename_tree(tree_file_in, to_keep_leaf_list, rename_dict):
    """Load a tree, keep only the requested leaves and rename them.

    tree_file_in:      tree file read with ete3 (format=1, quoted node names).
    to_keep_leaf_list: leaves to retain; passed straight to `prune`.
    rename_dict:       maps current leaf name -> new leaf name; leaves without
                       an entry keep their name.

    Returns the pruned and renamed tree object.
    """

    # prune a copy of the parsed tree, keeping branch lengths
    pruned_tree = Tree(tree_file_in, quoted_node_names=True, format=1).copy()
    pruned_tree.prune(to_keep_leaf_list, preserve_branch_length=True)

    # iterating the tree visits the nodes to rename (its leaves, per ete3)
    for leaf_node in pruned_tree:
        leaf_node.name = rename_dict.get(leaf_node.name, leaf_node.name)

    return pruned_tree
66
+
67
+
68
def root_with_outgroup(input_tree, out_group_list, add_root_branch, tree_file_rooted):

    """
    Reroot the tree using the given outgroup and write it to a file.
    modified based on: https://github.com/Ecogenomics/GTDBTk/blob/master/gtdbtk/reroot_tree.py

    input_tree:         File containing the Newick tree to be rerooted.
    out_group_list:     Labels of the leaves that make up the outgroup.
    add_root_branch:    If True, wrap the tree in an extra pair of parentheses
                        and give the new outer clade a branch of length 0.02.
    tree_file_rooted:   Name of the file the rerooted tree is written to.
    """

    tree = dendropy.Tree.get_from_path(input_tree, schema='newick', rooting='force-rooted', preserve_underscores=True)

    outgroup_in_tree = set()
    ingroup_leaves = set()
    for n in tree.leaf_node_iter():
        if n.taxon.label in out_group_list:
            outgroup_in_tree.add(n.taxon)
        else:
            ingroup_leaves.add(n)

    # Since finding the MRCA is a rooted tree operation, the tree is first rerooted on an ingroup taxa. This
    # ensures the MRCA of the outgroup can be identified so long as the outgroup is monophyletic. If the
    # outgroup is polyphyletic trying to root on it is ill-defined. To try and pick a "good" root for
    # polyphyletic outgroups, random ingroup taxa are selected until two of them give the same size
    # lineage. This will, likely, be the smallest bipartition possible for the given outgroup though
    # this is not guaranteed.

    # the ingroup does not change inside the loop, so build the list only once
    ingroup_leaf_list = list(ingroup_leaves)

    mrca = tree.mrca(taxa=outgroup_in_tree)
    mrca_leaves = len(mrca.leaf_nodes())
    while True:
        rnd_ingroup = random.sample(ingroup_leaf_list, 1)[0]
        tree.reroot_at_edge(rnd_ingroup.edge, length1=0.5 * rnd_ingroup.edge_length, length2=0.5 * rnd_ingroup.edge_length)
        mrca = tree.mrca(taxa=outgroup_in_tree)
        if len(mrca.leaf_nodes()) == mrca_leaves:
            break
        mrca_leaves = len(mrca.leaf_nodes())

    # final rerooting: split the branch leading to the outgroup MRCA in half
    if mrca.edge_length is not None:
        tree.reroot_at_edge(mrca.edge, length1=0.5 * mrca.edge_length, length2=0.5 * mrca.edge_length)

    # serialise to a string (instead of write_to_path) so quotes can be removed and the root bar added
    tree_out_string = tree.as_string(schema='newick', suppress_rooting=True, unquoted_underscores=True)
    tree_out_string = tree_out_string.replace("'", "")

    # add the root bar
    if add_root_branch is True:
        tree_out_string = '(' + tree_out_string
        tree_out_string = tree_out_string.replace(');', '):0.02);')

    # write out tree string; the context manager closes the file even if the write fails
    with open(tree_file_rooted, 'w') as tree_file_rooted_handle:
        tree_file_rooted_handle.write(tree_out_string)
122
+
123
+
124
def RootTreeGTDB220(args):
    """Root a user tree using an outgroup inferred from the GTDB r220 reference tree.

    args is a dict with the keys:
        'tree'      input unrooted tree file
        'tax'       tab-separated file, genome id in column 1 and taxonomy string in column 2
                    (a header line starting with 'user_genome<TAB>' is skipped)
        'db'        folder holding the r220 reference trees and metadata tables
        'd'         domain, either 'ar' or 'bac'
        'add_root'  whether to add a root branch to the output tree
        'o'         output file for the rooted tree

    Steps:
        1. group the user genomes found in the tree by taxon at five ranks
           (the five values returned by sep_taxon_str);
        2. pick the first of those ranks at which the genomes fall into more than one taxon;
        3. keep one reference genome per such taxon, prune the reference tree to
           them and take the smaller side of its root as the outgroup taxa;
        4. root the user tree on the smaller of the two resulting genome sets.

    Prints a message and exits when the domain is invalid or when no rank
    separates the user genomes.
    """

    input_unrooted_tree = args['tree']
    user_gnm_taxon      = args['tax']
    db_dir              = args['db']
    gnm_domain          = args['d']
    add_root_branch     = args['add_root']
    rooted_tree         = args['o']

    # a set gives constant-time membership tests while scanning the taxonomy file
    leaf_set = {leaf.name for leaf in Tree(input_unrooted_tree, format=1)}

    # define file name
    if gnm_domain == 'bac':
        gtdb_ref_tree     = '%s/bac120_r220.tree' % db_dir
        gtdb_gnm_metadata = '%s/bac120_metadata_r220.tsv' % db_dir
        domain_tag        = 'd__Bacteria'
    elif gnm_domain == 'ar':
        gtdb_ref_tree     = '%s/ar53_r220.tree' % db_dir
        gtdb_gnm_metadata = '%s/ar53_metadata_r220.tsv' % db_dir
        domain_tag        = 'd__Archaea'
    else:
        print('please provide either "ar" or "bac" to -d')
        exit()

    ref_tree_gnm_set = set(Tree(gtdb_ref_tree, quoted_node_names=True, format=1).get_leaf_names())

    # read in user_gnm_taxon: rank -> taxon -> set of user genome ids
    rank_order = ('p', 'c', 'o', 'f', 'g')  # same order as the tuple returned by sep_taxon_str
    user_gnm_taxon_dicts = {each_rank: dict() for each_rank in rank_order}
    with open(user_gnm_taxon) as user_gnm_taxon_handle:
        for each_gnm in user_gnm_taxon_handle:
            if each_gnm.startswith('user_genome\t'):
                continue
            each_gnm_split = each_gnm.strip().split('\t')
            if len(each_gnm_split) < 2:
                continue  # blank or incomplete line, nothing to record
            gnm_id    = each_gnm_split[0]
            gnm_taxon = each_gnm_split[1]

            # only count genomes that are in the tree and belong to the requested domain
            if (gnm_id in leaf_set) and (domain_tag in gnm_taxon):
                for each_rank, each_taxon in zip(rank_order, sep_taxon_str(gnm_taxon)):
                    user_gnm_taxon_dicts[each_rank].setdefault(each_taxon, set()).add(gnm_id)

    # determine rooting rank, start from phylum
    rooting_rank = ''
    for each_rank in rank_order:
        if len(user_gnm_taxon_dicts[each_rank]) > 1:
            rooting_rank = each_rank
            break

    if rooting_rank == '':
        print('All user genomes are from the same genus, program exited!')
        exit()

    rooting_rank_taxon_dict = user_gnm_taxon_dicts[rooting_rank]
    rooting_rank_index = rank_order.index(rooting_rank)

    # pick one reference genome for every user taxon at the rooting rank
    col_index = {}
    canditate_gnms_rooting_rank = dict()
    counted_taxons_rooting_rank = set()
    with open(gtdb_gnm_metadata) as gtdb_gnm_metadata_handle:
        for each_ref in gtdb_gnm_metadata_handle:
            each_ref_split = each_ref.strip().split('\t')

            # the header row is recognised by its first column name only, so the
            # check does not depend on the name of the second column
            if each_ref_split[0] == 'accession':
                col_index = {key: i for i, key in enumerate(each_ref_split)}
                continue

            ref_accession = each_ref_split[0]
            if ref_accession not in ref_tree_gnm_set:
                continue

            gtdb_taxonomy = each_ref_split[col_index['gtdb_taxonomy']]
            gnm_rooting_rank = sep_taxon_str(gtdb_taxonomy)[rooting_rank_index]

            # keep only the first reference genome seen for each taxon
            if gnm_rooting_rank in rooting_rank_taxon_dict:
                if gnm_rooting_rank not in counted_taxons_rooting_rank:
                    counted_taxons_rooting_rank.add(gnm_rooting_rank)
                    canditate_gnms_rooting_rank[ref_accession] = gnm_rooting_rank

    # the same dict serves as the list of leaves to keep (its keys) and as the accession -> taxon rename map
    ref_tree_rooting_rank = subset_and_rename_tree(gtdb_ref_tree, canditate_gnms_rooting_rank, canditate_gnms_rooting_rank)

    # get the smallest out group taxon set
    smallest_outgroup_taxon_list = get_smallest_outgroup(ref_tree_rooting_rank)

    # get the smallest out group genome set
    out_group_gnm_set_1 = set()
    out_group_gnm_set_2 = set()
    for each_rooting_rank_taxon, gnm_member_set in rooting_rank_taxon_dict.items():
        if each_rooting_rank_taxon in smallest_outgroup_taxon_list:
            out_group_gnm_set_1.update(gnm_member_set)
        else:
            out_group_gnm_set_2.update(gnm_member_set)

    # select the smaller one as outgroup (set 2 is used when both have the same size)
    if len(out_group_gnm_set_1) < len(out_group_gnm_set_2):
        out_group_gnm_set = out_group_gnm_set_1
    else:
        out_group_gnm_set = out_group_gnm_set_2

    # root user tree with identified out group genomes
    root_with_outgroup(input_unrooted_tree, out_group_gnm_set, add_root_branch, rooted_tree)
288
+
289
+
290
if __name__ == '__main__':

    # script entry point: parse the command line and pass the options on as a plain dict
    RootTreeGTDB220_parser = argparse.ArgumentParser(usage=RootTreeGTDB220_usage)
    RootTreeGTDB220_parser.add_argument('-tree',        required=True,                          help='input unrooted tree')
    RootTreeGTDB220_parser.add_argument('-tax',         required=False, default='fna',          help='leaf taxon')
    RootTreeGTDB220_parser.add_argument('-db',          required=True,                          help='GTDB database files')
    RootTreeGTDB220_parser.add_argument('-d',           required=False, default=None,           help='domain, either ar or bac')
    RootTreeGTDB220_parser.add_argument('-add_root',    required=False, action='store_true',    help='add the root branch')
    # -o is opened for writing as a single file, so it is described as a file rather than a folder
    RootTreeGTDB220_parser.add_argument('-o',           required=True,                          help='output rooted tree file')
    args = vars(RootTreeGTDB220_parser.parse_args())
    RootTreeGTDB220(args)
@@ -0,0 +1,16 @@
1
+
2
# Help/usage text for the SequentialDating command.
SequentialDating_usage = '''
======================== SequentialDating example commands ========================

TreeSAK SequentialDating -h

===================================================================================
'''


def SequentialDating():
    """Placeholder with an empty body: takes no arguments, does nothing, returns None."""
    return None


# the function is invoked whenever this module is executed or imported
SequentialDating()
@@ -0,0 +1,157 @@
1
+ import os
2
+ import argparse
3
+
4
+
5
def sep_path_basename_ext(file_in):
    """Break a file path into (folder, base name, extension).

    The folder is reported as '.' when the path has no directory part.
    The extension keeps its leading dot and is '' when there is none.
    """

    folder, file_name = os.path.split(file_in)
    base_name, extension = os.path.splitext(file_name)

    # a bare file name lives in the current folder
    folder = '.' if folder == '' else folder

    return folder, base_name, extension
16
+
17
+
18
# Help/usage text for the SingleAleHGT command.
# The example uses -msa (the input is given with -faa or -msa; there is no -i
# option) and a placeholder instead of a real iTOL API key.
SingleAleHGT_usage = '''
============================================ SingleAleHGT example commands ============================================

TreeSAK SingleAleHGT -msa concatenated.fasta -s genome.treefile -fc 0.3 -c genome_taxon.txt -color phylum_color.txt -api your_iTOL_API_key -t 9 -f -o demo_SingleAleHGT_wd

=======================================================================================================================
'''
25
+
26
def _log_and_run_cmd(cmd, log_txt):
    """Append a shell command to the log file, then execute it with os.system."""
    with open(log_txt, 'a') as log_txt_handle:
        log_txt_handle.write(cmd + '\n')
    os.system(cmd)


def SingleAleHGT(args):
    """Run alignment (optional), trimming (optional), iqtree2, ALE2 and ALE4 for one gene family.

    args is a dict with the keys:
        'faa'     input protein sequences (aligned with mafft-einsi), or None
        'msa'     input alignment (used as is), or None; exactly one of faa/msa must be given
        'o'       output folder; created by this function
        's'       rooted species tree, passed to ALE2
        'api'     iTOL API key, passed to ALE4
        'fc'      HGT frequency cutoff, passed to ALE4
        'color'   phylum colour code file, passed to ALE4
        'c'       genome taxonomy file, passed to ALE4
        'f'       overwrite an existing output folder
        'trim'    trim the alignment with trimal before tree inference
        'docker'  Docker image passed to ALE2, or None to run ALE2 without -docker
        't'       number of threads

    Every executed command is appended to <output folder>/log.txt.
    Prints a message and exits when the input options are inconsistent or when
    the output folder exists and 'f' is not set.
    """
    import shutil  # local import, only needed to remove an existing output folder

    faa_in                   = args['faa']
    msa_in                   = args['msa']
    op_dir                   = args['o']
    genome_tree_file_rooted  = args['s']
    API_key                  = args['api']
    hgt_freq_cutoff          = args['fc']
    ar_phylum_color_code_txt = args['color']
    genome_taxon_txt         = args['c']
    force_overwrite          = args['f']
    trim_msa                 = args['trim']
    docker_image             = args['docker']
    num_threads              = args['t']

    ######################################## check input files #######################################

    # exactly one of -faa and -msa has to be provided
    if (faa_in is None) == (msa_in is None):
        print('Please specify either -faa or -msa, program exited!')
        exit()

    input_file = faa_in if (faa_in is not None) else msa_in
    f_base = os.path.splitext(os.path.basename(input_file))[0]

    ######################################## define file name ########################################

    ale1_op_dir = '%s/ALE1_op_dir'    % op_dir
    ale2_op_dir = '%s/ALE2_op_dir'    % op_dir
    ale4_op_dir = '%s/ALE4_op_dir'    % op_dir
    log_txt     = '%s/log.txt'        % op_dir
    msa_file    = '%s/%s.aln'         % (ale1_op_dir, f_base)
    msa_trimmed = '%s/%s_trimmed.aln' % (ale1_op_dir, f_base)
    tree_prefix = '%s/%s'             % (ale1_op_dir, f_base)

    ###################################### create output folder ######################################

    if os.path.isdir(op_dir) is True:
        if force_overwrite is True:
            shutil.rmtree(op_dir)
        else:
            print('%s exist, program exited!' % op_dir)
            exit()
    os.mkdir(op_dir)
    os.mkdir(ale1_op_dir)

    ##################################################################################################

    # run mafft-einsi
    if faa_in is not None:
        mafft_cmd = 'mafft-einsi --thread %s --quiet %s > %s' % (num_threads, faa_in, msa_file)
        _log_and_run_cmd(mafft_cmd, log_txt)
        msa_file_for_next_step = msa_file
    else:
        msa_file_for_next_step = msa_in

    # run trimal; the tree is then inferred from the trimmed alignment
    if trim_msa is True:
        trimal_cmd = 'trimal -in %s -out %s -automated1' % (msa_file_for_next_step, msa_trimmed)
        _log_and_run_cmd(trimal_cmd, log_txt)
        msa_file_for_next_step = msa_trimmed

    # run iqtree2
    iqtree2_cmd = 'iqtree2 -m LG+G+I -bb 1000 --wbtl -nt %s -s %s -pre %s' % (num_threads, msa_file_for_next_step, tree_prefix)
    _log_and_run_cmd(iqtree2_cmd, log_txt)

    # run ALE2; -docker is only added when an image was given, otherwise the
    # literal string "None" would be passed on as the image name
    ale2_cmd = 'TreeSAK ALE2 -i %s -s %s -t %s -f -runALE' % (ale1_op_dir, genome_tree_file_rooted, num_threads)
    if docker_image is not None:
        ale2_cmd += ' -docker %s' % docker_image
    ale2_cmd += ' -o %s' % ale2_op_dir
    _log_and_run_cmd(ale2_cmd, log_txt)

    # run ALE4
    ale4_cmd = 'TreeSAK ALE4 -i1 %s -i2 %s -c %s -color %s -o %s -fc %s -f -api %s' % (ale1_op_dir, ale2_op_dir, genome_taxon_txt, ar_phylum_color_code_txt, ale4_op_dir, hgt_freq_cutoff, API_key)
    _log_and_run_cmd(ale4_cmd, log_txt)
113
+
114
+
115
if __name__ == '__main__':

    # script entry point: declare the options, parse them and run the workflow
    cli_parser = argparse.ArgumentParser()
    add_option = cli_parser.add_argument

    # input / output
    add_option('-faa', required=False, default=None, help='input aa file, e.g., OMA0001.faa')
    add_option('-msa', required=False, default=None, help='input MSA file, e.g., OMA0001.aln')
    add_option('-o', required=True, help='output dir, e.g., SingleAleHGT_wd')
    add_option('-s', required=True, help='rooted species tree')
    add_option('-c', required=True, help='genome_taxon, GTDB format')
    add_option('-color', required=True, help='phylum color code')

    # thresholds
    add_option('-fc', required=False, type=float, default=0.5, help='hgt_freq_cutoff, default: 0.5')
    add_option('-mld', required=False, type=int, default=5, help='donor_node_min_leaf_num, default: 5')
    add_option('-mlr', required=False, type=int, default=5, help='recipient_node_min_leaf_num, default: 5')

    # behaviour and external services
    add_option('-trim', required=False, action="store_true", help='trim MSA')
    add_option('-docker', required=False, default=None, help='Docker image, if ALE was installed with Docker, e.g., gregmich/alesuite_new')
    add_option('-itol', required=False, default='batch_access_tmp', help='iTOL project_name, default: batch_access_tmp')
    add_option('-api', required=True, help='iTOL API key')
    add_option('-t', required=False, type=int, default=6, help='number of threads, default: 6')
    add_option('-f', required=False, action="store_true", help='force overwrite')

    args = vars(cli_parser.parse_args())
    SingleAleHGT(args)
135
+
136
+
137
+ '''
138
+
139
+ cd /Users/songweizhi/Desktop/DateArTree/01_HGT_ALE_with_OMA/ALE1_op_dir_OMA05484_OMA07484_trimmed
140
+ trimal -in ../ALE1_op_dir_OMA05484_OMA07484/concatenated.fasta -out concatenated.fasta -automated1
141
+ iqtree2 -m LG+G+I -bb 1000 --wbtl -nt 10 -s concatenated.fasta -pre OMA05484_OMA07484
142
+ cd /Users/songweizhi/Desktop/DateArTree/01_HGT_ALE_with_OMA
143
+ TreeSAK ALE2 -i ALE1_op_dir_OMA05484_OMA07484_trimmed -s genome_tree.newick -t 10 -f -runALE -docker gregmich/alesuite_new -o ALE2_op_dir_OMA05484_OMA07484_trimmed
144
+ TreeSAK ALE4 -i1 ALE1_op_dir_OMA05484_OMA07484_trimmed -i2 ALE2_op_dir_OMA05484_OMA07484_trimmed -c genome_taxon.txt -color phylum_color.txt -o ALE4_op_dir_OMA05484_OMA07484_trimmed_0.01 -fc 0.01 -f -api S1kZZuDHc0d5M7J5vLnUNQ
145
+
146
+ cd /Users/songweizhi/Desktop/DateArTree/01_HGT_ALE_with_OMA
147
+ /usr/local/bin/python3.7 /Users/songweizhi/PycharmProjects/TreeSAK/TreeSAK/SingleAleHGT.py -msa ALE1_op_dir_OMA05484_OMA07484_trimmed/concatenated.fasta -s genome_tree_rooted_noEU.treefile -fc 0.3 -c genome_taxon.txt -color phylum_color.txt -api S1kZZuDHc0d5M7J5vLnUNQ -t 9 -f -o demo_SingleAleHGT_wd -trim
148
+
149
+ cd /Users/songweizhi/Desktop/DateArTree/01_HGT_ALE_with_OMA/demo_SingleAleHGT_wd
150
+ TreeSAK ALE2 -i ALE1_op_dir -s ../genome_tree.newick -t 10 -f -runALE -docker gregmich/alesuite_new -o ALE2_op_dir
151
+ TreeSAK ALE4 -i1 ALE1_op_dir_OMA05484_OMA07484_trimmed -i2 ALE2_op_dir_OMA05484_OMA07484_trimmed -c genome_taxon.txt -color phylum_color.txt -o ALE4_op_dir_OMA05484_OMA07484_trimmed_0.01 -fc 0.01 -f -api S1kZZuDHc0d5M7J5vLnUNQ
152
+
153
+ /usr/local/bin/python3.7 /Users/songweizhi/PycharmProjects/TreeSAK/TreeSAK/SingleAleHGT.py -o demo_SingleAleHGT_wd -msa ALE1_op_dir/OMA15312.aln -s genome_tree_rooted_noEU.treefile -fc 0.3 -c genome_taxon.txt -color phylum_color.txt -api S1kZZuDHc0d5M7J5vLnUNQ -t 10 -f -trim -docker gregmich/alesuite_new
154
+ /usr/local/bin/python3.7 /Users/songweizhi/PycharmProjects/TreeSAK/TreeSAK/SingleAleHGT.py -o OMA01402_ALE_HGT_wd -msa ALE1_op_dir/OMA01402.aln -s genome_tree_rooted_noEU.treefile -fc 0.3 -c genome_taxon.txt -color phylum_color.txt -api S1kZZuDHc0d5M7J5vLnUNQ -t 10 -f -trim -docker gregmich/alesuite_new
155
+ /usr/local/bin/python3.7 /Users/songweizhi/PycharmProjects/TreeSAK/TreeSAK/SingleAleHGT.py -o OMA01402_ALE_HGT_wd_no_trim -msa ALE1_op_dir/OMA01402.aln -s genome_tree_rooted_noEU.treefile -fc 0.3 -c genome_taxon.txt -color phylum_color.txt -api S1kZZuDHc0d5M7J5vLnUNQ -t 10 -f -docker gregmich/alesuite_new
156
+
157
+ '''
@@ -0,0 +1,50 @@
1
+ import os
2
+ import argparse
3
+ from Bio import AlignIO
4
+
5
+
6
# Help/usage text for the SingleLinePhy command (one example invocation).
SingleLinePhy_usage = '''
======== SingleLinePhy example commands ========

TreeSAK SingleLinePhy -i in.phy -o out.phy

================================================
'''
13
+
14
+
15
def SingleLinePhy(args):
    """Rewrite a relaxed-PHYLIP alignment with each sequence on a single line.

    args is a dict with the keys:
        'i'  input alignment, read as 'phylip-relaxed'
        'o'  output file

    The output starts with "<number of sequences> <alignment length>", followed by
    one line per sequence: the id padded with spaces to the longest id plus two,
    then the sequence. Prints a message and exits when the input file is missing.
    """

    phy_in, phy_out = args['i'], args['o']

    # check input file
    if not os.path.isfile(phy_in):
        print('input file not found, program exited!')
        exit()

    alignment = AlignIO.read(phy_in, 'phylip-relaxed')

    # width of the id column: longest id plus two separating spaces
    id_col_width = max((len(record.id) for record in alignment), default=0) + 2

    with open(phy_out, 'w') as msa_out_handle:
        msa_out_handle.write('%s %s\n' % (len(alignment), alignment.get_alignment_length()))
        for record in alignment:
            msa_out_handle.write('%s%s\n' % (record.id.ljust(id_col_width), str(record.seq)))

    print('Done!')
41
+
42
+
43
if __name__ == '__main__':

    # script entry point: both options are mandatory
    cli_parser = argparse.ArgumentParser()
    cli_parser.add_argument('-i', required=True, help='input file')
    cli_parser.add_argument('-o', required=True, help='output file')
    args = vars(cli_parser.parse_args())
    SingleLinePhy(args)
TreeSAK/SliceMSA.py ADDED
@@ -0,0 +1,142 @@
1
+ import os
2
+ import argparse
3
+ from Bio import AlignIO
4
+
5
+
6
# Help/usage text for the SliceMSA command.
# "-100" selects columns 1-100 (a section starting with '-' runs from column 1
# to the given end), so the example table states 1-100.
SliceMSA_usage = '''
========================= SliceMSA example commands =========================

TreeSAK SliceMSA -i 16S_aln.fasta -s 200-300 -o 16S_aln_200-300.fasta
TreeSAK SliceMSA -i 16S_aln.phylip -fi phylip-relaxed -s sections.txt -o SliceMSA_op -fo phylip-relaxed

# example
200-300     select columns 200-300
-100        select columns 1-100
500-        select columns from 500 to the end

# Example of sections.txt (one section per line):
200-300
-100
500-

# Examples of alignment format (https://biopython.org/wiki/AlignIO):
fasta, phylip, phylip-relaxed, phylip-sequential, clustal

=============================================================================
'''
27
+
28
+
29
def msa2fasta(msa_object, fasta_out):
    """Write an alignment to a FASTA file, one header line and one sequence line per record.

    msa_object: iterable of records exposing `.id` and `.seq`.
    fasta_out:  path of the file to write (overwritten if it exists).
    """

    with open(fasta_out, 'w') as fasta_out_handle:
        fasta_out_handle.writelines('>%s\n%s\n' % (record.id, str(record.seq)) for record in msa_object)
35
+
36
+
37
def msa2phylip(msa_object, phylip_out):
    """Write an alignment in a single-line-per-sequence PHYLIP layout.

    msa_object: alignment supporting iteration, `len()` and `get_alignment_length()`,
                whose records expose `.id` and `.seq`.
    phylip_out: path of the file to write (overwritten if it exists).

    The first line is "<number of sequences> <alignment length>"; every following
    line holds an id padded to the longest id plus two spaces, then the sequence.
    """

    # width of the id column: longest id plus two separating spaces
    id_col_width = max((len(record.id) for record in msa_object), default=0) + 2

    with open(phylip_out, 'w') as phylip_out_handle:
        phylip_out_handle.write('%s %s\n' % (len(msa_object), msa_object.get_alignment_length()))
        for record in msa_object:
            phylip_out_handle.write('%s%s\n' % (record.id.ljust(id_col_width), str(record.seq)))
51
+
52
+
53
def _parse_msa_section(section_str, aln_len):
    """Turn 'a-b', '-b', 'a-' or '-' into a [start, end] pair of strings (1-based, inclusive)."""
    section_str = section_str.strip()
    section_split = section_str.split('-')
    if section_str == '-':
        return ['1', str(aln_len)]
    if section_str.startswith('-'):
        return ['1', section_split[1]]
    if section_str.endswith('-'):
        return [section_split[0], str(aln_len)]
    return section_split


def SliceMSA(args):
    """Export one or more column ranges of a multiple sequence alignment.

    args is a dict with the keys:
        'i'      input alignment file
        'fi'     format of the input alignment, as understood by AlignIO.read
        's'      either a single section ('200-300', '-100', '500-', '-') or the
                 path of a text file with one section per line
        'o'      output file (single section) or output folder (several sections)
        'fo'     output format; 'phylip-relaxed' writes PHYLIP, anything else FASTA
        'force'  remove an existing output folder instead of exiting

    With several sections, each one is written to <o>/<start>-<end>.<fasta|phylip>.
    Prints a message and exits when the input alignment is missing, or when the
    output folder exists and 'force' is not set.
    """
    import shutil  # local import, only needed to remove an existing output folder

    msa_in_file       = args['i']
    aln_in_format     = args['fi']
    col_to_select_txt = args['s']
    op_dir            = args['o']
    aln_out_format    = args['fo']
    force_overwriting = args['force']

    aln_out_ext = 'phylip' if aln_out_format == 'phylip-relaxed' else 'fasta'

    if os.path.isfile(msa_in_file) is False:
        print('Input MSA not found, program exited!')
        exit()

    # read in msa
    msa_in = AlignIO.read(msa_in_file, aln_in_format)
    aln_len = msa_in.get_alignment_length()

    # parse provided sections: a value that is not an existing file is taken as one section
    if os.path.isfile(col_to_select_txt) is False:
        section_to_select_list = [_parse_msa_section(col_to_select_txt, aln_len)]
    else:
        with open(col_to_select_txt) as col_to_select_handle:
            # blank lines carry no section and are skipped
            section_to_select_list = [_parse_msa_section(each_section, aln_len)
                                      for each_section in col_to_select_handle
                                      if each_section.strip() != '']

    # check output folder (only needed when several files are written)
    if len(section_to_select_list) > 1:
        if os.path.isdir(op_dir) is True:
            if force_overwriting is True:
                shutil.rmtree(op_dir)
            else:
                print('Output folder already exist, program exited!')
                exit()
        os.mkdir(op_dir)

    # both writers take (alignment, output path)
    write_msa = msa2phylip if aln_out_ext == 'phylip' else msa2fasta

    # write out sections; start is converted to a 0-based index, end stays inclusive
    if len(section_to_select_list) == 1:
        only_section = section_to_select_list[0]
        write_msa(msa_in[:, (int(only_section[0]) - 1):int(only_section[1])], op_dir)
    else:
        for each_section in section_to_select_list:
            pwd_op_file = '%s/%s.%s' % (op_dir, '-'.join(each_section), aln_out_ext)
            write_msa(msa_in[:, (int(each_section[0]) - 1):int(each_section[1])], pwd_op_file)

    print('MSA subset(s) exported to %s, Done!' % op_dir)
128
+
129
+
130
if __name__ == '__main__':

    # script entry point: declare the options, parse them and slice the alignment
    cli_parser = argparse.ArgumentParser()
    add_option = cli_parser.add_argument
    add_option('-i', required=True, help='input MSA in fasta format')
    add_option('-fi', required=False, default='fasta', help='format (NOT file extension) of input MSA, default: fasta')
    add_option('-s', required=True, help='columns to export, e.g. 200-300, -100, 50-')
    add_option('-o', required=True, help='output file or folder')
    add_option('-fo', required=False, default='fasta', help='format of output MSA, select from fasta and phylip-relaxed, default: fasta')
    add_option('-force', required=False, action="store_true", help='force overwrite existing output folder')
    args = vars(cli_parser.parse_args())
    SliceMSA(args)
142
+