treesak 1.53.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. TreeSAK/ALE.py +63 -0
  2. TreeSAK/ALE1.py +268 -0
  3. TreeSAK/ALE2.py +168 -0
  4. TreeSAK/ALE2RTC.py +30 -0
  5. TreeSAK/ALE3.py +205 -0
  6. TreeSAK/ALE4.py +636 -0
  7. TreeSAK/ALE5.py +210 -0
  8. TreeSAK/ALE6.py +401 -0
  9. TreeSAK/ALE7.py +126 -0
  10. TreeSAK/ALE_backup.py +1081 -0
  11. TreeSAK/AssessCVG.py +128 -0
  12. TreeSAK/AssessMarker.py +306 -0
  13. TreeSAK/AssessMarkerDeltaLL.py +257 -0
  14. TreeSAK/AssessMarkerPA.py +317 -0
  15. TreeSAK/AssessPB.py +113 -0
  16. TreeSAK/BMGE.jar +0 -0
  17. TreeSAK/BMGE.py +49 -0
  18. TreeSAK/C60SR4.nex +127 -0
  19. TreeSAK/CompareMCMC.py +138 -0
  20. TreeSAK/ConcateMSA.py +111 -0
  21. TreeSAK/ConvertMSA.py +135 -0
  22. TreeSAK/Dir.rb +82 -0
  23. TreeSAK/ExtractMarkerSeq.py +263 -0
  24. TreeSAK/FastRoot.py +1175 -0
  25. TreeSAK/FastRoot_backup.py +1122 -0
  26. TreeSAK/FigTree.py +34 -0
  27. TreeSAK/GTDB_tree.py +76 -0
  28. TreeSAK/GeneTree.py +142 -0
  29. TreeSAK/KEGG_Luo17.py +807 -0
  30. TreeSAK/LcaToLeaves.py +66 -0
  31. TreeSAK/MarkerRef2Tree.py +616 -0
  32. TreeSAK/MarkerRef2Tree_backup.py +628 -0
  33. TreeSAK/MarkerSeq2Tree.py +299 -0
  34. TreeSAK/MarkerSeq2Tree_backup.py +259 -0
  35. TreeSAK/ModifyTopo.py +116 -0
  36. TreeSAK/Newick_tree_plotter.py +79 -0
  37. TreeSAK/OMA.py +170 -0
  38. TreeSAK/OMA2.py +212 -0
  39. TreeSAK/OneLineAln.py +50 -0
  40. TreeSAK/PB.py +155 -0
  41. TreeSAK/PMSF.py +115 -0
  42. TreeSAK/PhyloBiAssoc.R +84 -0
  43. TreeSAK/PhyloBiAssoc.py +167 -0
  44. TreeSAK/PlotMCMC.py +41 -0
  45. TreeSAK/PlotMcmcNode.py +152 -0
  46. TreeSAK/PlotMcmcNode_old.py +252 -0
  47. TreeSAK/RootTree.py +101 -0
  48. TreeSAK/RootTreeGTDB.py +371 -0
  49. TreeSAK/RootTreeGTDB214.py +288 -0
  50. TreeSAK/RootTreeGTDB220.py +300 -0
  51. TreeSAK/SequentialDating.py +16 -0
  52. TreeSAK/SingleAleHGT.py +157 -0
  53. TreeSAK/SingleLinePhy.py +50 -0
  54. TreeSAK/SliceMSA.py +142 -0
  55. TreeSAK/SplitScore.py +21 -0
  56. TreeSAK/SplitScore1.py +177 -0
  57. TreeSAK/SplitScore1OMA.py +148 -0
  58. TreeSAK/SplitScore2.py +608 -0
  59. TreeSAK/TaxaCountStats.R +256 -0
  60. TreeSAK/TaxonTree.py +47 -0
  61. TreeSAK/TreeSAK_config.py +32 -0
  62. TreeSAK/VERSION +164 -0
  63. TreeSAK/VisHPD95.R +45 -0
  64. TreeSAK/VisHPD95.py +200 -0
  65. TreeSAK/__init__.py +0 -0
  66. TreeSAK/ale_parser.py +74 -0
  67. TreeSAK/ale_splitter.py +63 -0
  68. TreeSAK/alignment_pruner.pl +1471 -0
  69. TreeSAK/assessOG.py +45 -0
  70. TreeSAK/batch_itol.py +171 -0
  71. TreeSAK/catfasta2phy.py +140 -0
  72. TreeSAK/cogTree.py +185 -0
  73. TreeSAK/compare_trees.R +30 -0
  74. TreeSAK/compare_trees.py +255 -0
  75. TreeSAK/dating.py +264 -0
  76. TreeSAK/dating_ss.py +361 -0
  77. TreeSAK/deltall.py +82 -0
  78. TreeSAK/do_rrtc.rb +464 -0
  79. TreeSAK/fa2phy.py +42 -0
  80. TreeSAK/filter_rename_ar53.py +118 -0
  81. TreeSAK/format_leaf_name.py +70 -0
  82. TreeSAK/gap_stats.py +38 -0
  83. TreeSAK/get_SCG_tree.py +742 -0
  84. TreeSAK/get_arCOG_seq.py +97 -0
  85. TreeSAK/global_functions.py +222 -0
  86. TreeSAK/gnm_leaves.py +43 -0
  87. TreeSAK/iTOL.py +791 -0
  88. TreeSAK/iTOL_gene_tree.py +80 -0
  89. TreeSAK/itol_msa_stats.py +56 -0
  90. TreeSAK/keep_highest_rrtc.py +37 -0
  91. TreeSAK/koTree.py +194 -0
  92. TreeSAK/label_gene_tree_by_gnm.py +34 -0
  93. TreeSAK/label_tree.R +75 -0
  94. TreeSAK/label_tree.py +121 -0
  95. TreeSAK/mad.py +708 -0
  96. TreeSAK/mcmc2tree.py +58 -0
  97. TreeSAK/mcmcTC copy.py +92 -0
  98. TreeSAK/mcmcTC.py +104 -0
  99. TreeSAK/mcmctree_vs_reltime.R +44 -0
  100. TreeSAK/mcmctree_vs_reltime.py +252 -0
  101. TreeSAK/merge_pdf.py +32 -0
  102. TreeSAK/pRTC.py +56 -0
  103. TreeSAK/parse_mcmctree.py +198 -0
  104. TreeSAK/parse_reltime.py +141 -0
  105. TreeSAK/phy2fa.py +37 -0
  106. TreeSAK/plot_distruibution_th.py +165 -0
  107. TreeSAK/prep_mcmctree_ctl.py +92 -0
  108. TreeSAK/print_leaves.py +32 -0
  109. TreeSAK/pruneMSA.py +63 -0
  110. TreeSAK/recode.py +73 -0
  111. TreeSAK/remove_bias.R +112 -0
  112. TreeSAK/rename_leaves.py +78 -0
  113. TreeSAK/replace_clade.py +55 -0
  114. TreeSAK/root_with_out_group.py +84 -0
  115. TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
  116. TreeSAK/subsample_drep_gnms.py +74 -0
  117. TreeSAK/subset.py +69 -0
  118. TreeSAK/subset_tree_stupid_old_way.py +193 -0
  119. TreeSAK/supertree.py +330 -0
  120. TreeSAK/tmp_1.py +19 -0
  121. TreeSAK/tmp_2.py +19 -0
  122. TreeSAK/tmp_3.py +120 -0
  123. TreeSAK/tmp_4.py +43 -0
  124. TreeSAK/tmp_5.py +12 -0
  125. TreeSAK/weighted_rand.rb +23 -0
  126. treesak-1.53.3.data/scripts/TreeSAK +955 -0
  127. treesak-1.53.3.dist-info/LICENSE +674 -0
  128. treesak-1.53.3.dist-info/METADATA +27 -0
  129. treesak-1.53.3.dist-info/RECORD +131 -0
  130. treesak-1.53.3.dist-info/WHEEL +5 -0
  131. treesak-1.53.3.dist-info/top_level.txt +1 -0
@@ -0,0 +1,299 @@
1
+ import os
2
+ import glob
3
+ import argparse
4
+ from Bio import SeqIO
5
+ from Bio import AlignIO
6
+ from distutils.spawn import find_executable
7
+
8
+
9
# Usage text for MarkerSeq2Tree: lists the external dependencies and one
# example command line, plus a note on what "-prune" triggers.
MarkerSeq2Tree_usage = '''
======================== MarkerSeq2Tree example commands ========================

Dependencies: mafft, trimal, bmge, perl and iqtree2

TreeSAK MarkerSeq2Tree -i best_25 -x fa -o op_dir -t 12 -f -bmge -prune 10,20,30

# Note
"chi2_prune" is performed if you specify "-prune".

=================================================================================
'''
21
+
22
+
23
def sep_path_basename_ext(file_in):
    """Split a file path into its directory, base name and extension.

    Returns a tuple ``(directory, basename, extension)``.  The directory is
    '.' when ``file_in`` has no directory component; the extension keeps its
    leading dot (and is '' when the file name has none).
    """
    directory, name = os.path.split(file_in)
    base, ext = os.path.splitext(name)
    return ('.' if directory == '' else directory), base, ext
31
+
32
+
33
def catfasta2phy(msa_dir, msa_ext, concatenated_msa_phy, partition_file):
    """Concatenate per-gene FASTA alignments into a single alignment.

    Every file matching ``<msa_dir>/*.<msa_ext>`` is read as a FASTA
    alignment.  Alignments are concatenated in sorted file-name order; a
    sequence id missing from an alignment is padded with '-' over that
    alignment's length.

    Files written:
        * ``<concatenated_msa_phy>.fasta``: concatenated alignment (FASTA).
        * ``partition_file``: one ``<file name> = <start>-<end>`` line per
          input alignment (1-based, inclusive coordinates).
        * ``concatenated_msa_phy``: the concatenated alignment converted to
          the 'phylip-relaxed' format.

    Prints a message and calls ``exit()`` when the sequences of an input
    alignment do not all share one length.
    """
    concatenated_msa_fasta = '%s.fasta' % concatenated_msa_phy
    msa_file_re = '%s/*.%s' % (msa_dir, msa_ext)
    msa_file_list_sorted = sorted(os.path.basename(file_name) for file_name in glob.glob(msa_file_re))

    # read every alignment exactly once; dicts are keyed by file name so two
    # files can never collide on a derived gene id
    complete_gnm_set = set()
    msa_seq_dict = dict()
    msa_len_dict = dict()
    for each_msa_file in msa_file_list_sorted:
        pwd_current_msa = '%s/%s' % (msa_dir, each_msa_file)
        current_msa_len = 0
        current_msa_len_set = set()
        current_msa_seq_dict = dict()
        for each_seq in SeqIO.parse(pwd_current_msa, 'fasta'):
            complete_gnm_set.add(each_seq.id)
            current_msa_seq_dict[each_seq.id] = str(each_seq.seq)
            current_msa_len_set.add(len(each_seq.seq))
            current_msa_len = len(each_seq.seq)

        if len(current_msa_len_set) != 1:
            print('Sequences with different length were found in %s, program exited!' % each_msa_file)
            exit()

        msa_seq_dict[each_msa_file] = current_msa_seq_dict
        msa_len_dict[each_msa_file] = current_msa_len

    complete_gnm_list_sorted = sorted(complete_gnm_set)

    # write out concatenated msa
    with open(concatenated_msa_fasta, 'w') as concatenated_msa_handle:
        for each_gnm in complete_gnm_list_sorted:
            concatenated_seq = ''.join(
                msa_seq_dict[each_m].get(each_gnm, msa_len_dict[each_m] * '-')
                for each_m in msa_file_list_sorted)
            concatenated_msa_handle.write('>%s\n' % each_gnm)
            concatenated_msa_handle.write('%s\n' % concatenated_seq)

    # write out partition file
    end_pos = 0
    with open(partition_file, 'w') as partition_file_handle:
        for each_m in msa_file_list_sorted:
            current_m_len = msa_len_dict[each_m]
            partition_file_handle.write('%s = %s-%s\n' % (each_m, (end_pos + 1), (end_pos + current_m_len)))
            end_pos += current_m_len

    # convert msa in fasta to phy
    AlignIO.convert(concatenated_msa_fasta, 'fasta', concatenated_msa_phy, 'phylip-relaxed')
95
+
96
+
97
def get_gap_stats(msa_in_fa, stats_txt):
    """Write the gap percentage of every sequence in a FASTA alignment.

    Args:
        msa_in_fa: path to the alignment in FASTA format.
        stats_txt: path of the tab-separated report to write.  It has the
            header ``Sequence<TAB>Gap`` followed by one line per sequence id,
            sorted by increasing gap percentage (rounded to two decimals).
    """
    gap_pct_dict = dict()
    for each_seq in SeqIO.parse(msa_in_fa, 'fasta'):
        seq_str = str(each_seq.seq)
        # a record without any residues has no gaps to report; avoid dividing by zero
        gap_pct = seq_str.count('-')*100/len(seq_str) if seq_str else 0.0
        gap_pct_dict[each_seq.id] = float("{0:.2f}".format(gap_pct))

    gap_pct_sorted = sorted(gap_pct_dict.items(), key=lambda x: x[1])

    with open(stats_txt, 'w') as stats_txt_handle:
        stats_txt_handle.write('Sequence\tGap\n')
        for seq_id, gap_pct in gap_pct_sorted:
            stats_txt_handle.write('%s\t%s\n' % (seq_id, gap_pct))
114
+
115
+
116
def BMGE(msa_in, op_prefix, trim_model, entropy_score_cutoff):
    """Trim an amino-acid alignment by running BMGE.jar through the shell.

    The jar is looked up in the directory of this source file.  Output is
    requested in four formats, written to ``<op_prefix>.BMGE.phylip``,
    ``.fasta``, ``.nexus`` and ``.html``.  The command is printed before it
    is executed; its exit status is not checked.
    """
    # locate BMGE.jar next to this source file
    script_dir = '/'.join(os.path.realpath(__file__).split('/')[:-1])
    jar_file = '%s/BMGE.jar' % script_dir

    # output files, one per requested format
    out_phylip = '%s.BMGE.phylip' % op_prefix
    out_fasta = '%s.BMGE.fasta' % op_prefix
    out_nexus = '%s.BMGE.nexus' % op_prefix
    out_html = '%s.BMGE.html' % op_prefix

    cmd_fields = (jar_file, msa_in, trim_model, entropy_score_cutoff,
                  out_phylip, out_fasta, out_nexus, out_html)
    bmge_cmd = 'java -jar %s -i %s -m %s -t AA -h %s -op %s -of %s -on %s -oh %s' % cmd_fields

    print('Running %s' % bmge_cmd)
    os.system(bmge_cmd)
132
+
133
+
134
def pruneMSA(msa_in, conserved_cutoffs):
    """Prune an alignment with alignment_pruner.pl at one or more cutoffs.

    alignment_pruner.pl is looked up in the directory of this source file and
    run once per cutoff through the shell, with its standard output
    redirected to the output file.  The exit status is not checked.

    Args:
        msa_in: path to the alignment to prune.
        conserved_cutoffs: comma-separated percentages, e.g. '10,20,30'.
            Each value is divided by 100 and passed to the script as
            ``--chi2_prune f<fraction>``.  Blank entries are ignored.

    Returns:
        The output paths, one per cutoff, in input order.  Each one is
        ``<dir of msa_in>/<base>_chi2p<cutoff><ext>``.
    """
    msa_path, msa_file = os.path.split(msa_in)
    if msa_path == '':
        msa_path = '.'
    msa_base, msa_ext = os.path.splitext(msa_file)

    current_file_path = '/'.join(os.path.realpath(__file__).split('/')[:-1])
    alignment_pruner_pl = '%s/alignment_pruner.pl' % current_file_path

    # strip whitespace so a value such as ' 20' cannot leak into a file name
    cutoff_list = [c.strip() for c in conserved_cutoffs.split(',') if c.strip()]

    op_file_list = []
    for each_cutoff in cutoff_list:
        cutoff_formatted = str(float(each_cutoff)/100)
        # the previous format string had three placeholders for four values
        # and raised TypeError; only the full output path is needed
        pwd_current_msa_out = '%s/%s_chi2p%s%s' % (msa_path, msa_base, each_cutoff, msa_ext)
        perl_cmd = 'perl %s --file %s --chi2_prune f%s > %s' % (alignment_pruner_pl, msa_in, cutoff_formatted, pwd_current_msa_out)
        # the reported command omits the install path of the script
        perl_cmd_for_report = 'perl %s --file %s --chi2_prune f%s > %s' % ('alignment_pruner.pl', msa_in, cutoff_formatted, pwd_current_msa_out)
        op_file_list.append(pwd_current_msa_out)
        print(perl_cmd_for_report)
        os.system(perl_cmd)

    # report
    print('Pruned MSA exported to:')
    print('\n'.join(op_file_list))

    return op_file_list
158
+
159
+
160
def MarkerSeq2Tree(args):
    """Rename, align, trim and concatenate marker sequences for tree inference.

    For every ``<args['i']>/*.<args['x']>`` FASTA file the sequences are
    renamed to their id minus the last '_'-separated field, aligned with
    mafft-einsi and trimmed with trimAl, or with BMGE when ``args['bmge']``
    is set.  The trimmed alignments are concatenated with catfasta2phy().
    The two iqtree2 commands are only written to ``cmds_3_iqtree2.txt``; they
    are not executed here.  When ``args['prune']`` is given, the concatenated
    FASTA alignment is additionally pruned with pruneMSA().

    Args:
        args: dict with the keys 'i' (marker sequence dir), 'x' (marker file
            extension), 'o' (output dir), 't' (number of threads), 'bmge'
            (bool), 'bmge_m' (BMGE model), 'bmge_esc' (BMGE entropy score
            cutoff), 'f' (force overwrite) and 'prune' (cutoffs or None).

    Prints a message and calls ``exit()`` when a required program is missing
    or when the output directory exists and 'f' is not set.
    """
    import shutil  # function-scope import: shutil is not imported at module level

    marker_seq_dir = args['i']
    marker_seq_ext = args['x']
    op_dir = args['o']
    num_of_threads = args['t']
    trim_with_bmge = args['bmge']
    bmge_trim_model = args['bmge_m']
    bmge_entropy_score_cutoff = args['bmge_esc']
    force_overwrite = args['f']
    alignment_pruner_cutoffs = args['prune']

    # specify path to BMGE.jar (expected next to this source file)
    current_file_path = '/'.join(os.path.realpath(__file__).split('/')[:-1])
    pwd_bmge_jar = '%s/BMGE.jar' % current_file_path

    # check dependencies, including the optional ones this run will invoke
    needed_programs = ['mafft-einsi', 'trimal', 'iqtree2']
    if trim_with_bmge:
        needed_programs.append('java')
    if alignment_pruner_cutoffs is not None:
        needed_programs.append('perl')
    not_detected_programs = [p for p in needed_programs if shutil.which(p) is None]
    if not_detected_programs:
        print('%s not detected, program exited!' % ', '.join(not_detected_programs))
        exit()

    # get marker sequence files
    marker_seq_re = '%s/*.%s' % (marker_seq_dir, marker_seq_ext)
    marker_seq_list = sorted(glob.glob(marker_seq_re))

    # define output files and dirs
    renamed_marker_seq_dir = '%s/renamed_markers' % op_dir
    renamed_marker_aln_dir = '%s/renamed_markers_aln' % op_dir
    if trim_with_bmge:
        cmds_2_trim_txt = '%s/cmds_2_BMGE.txt' % op_dir
        renamed_marker_aln_dir_trimmed = '%s/renamed_markers_aln_BMGE' % op_dir
    else:
        cmds_2_trim_txt = '%s/cmds_2_trimal.txt' % op_dir
        renamed_marker_aln_dir_trimmed = '%s/renamed_markers_aln_trimal' % op_dir
    concatenated_phy = '%s/concatenated.phy' % op_dir
    concatenated_phy_fasta = '%s/concatenated.phy.fasta' % op_dir
    concatenated_phy_partition = '%s/concatenated_partition.txt' % op_dir
    iqtree_dir = '%s/iqtree_wd' % op_dir
    cmds_1_mafft_txt = '%s/cmds_1_mafft.txt' % op_dir
    cmds_3_iqtree_txt = '%s/cmds_3_iqtree2.txt' % op_dir
    pwd_guide_tree = '%s/iqtree_wd/guide_tree.treefile' % op_dir

    # create output folder
    if os.path.isdir(op_dir):
        if force_overwrite:
            # remove without a shell so unusual characters in the path are safe
            shutil.rmtree(op_dir)
        else:
            print('%s exist, program exited!' % op_dir)
            exit()
    os.mkdir(op_dir)
    os.mkdir(renamed_marker_seq_dir)
    os.mkdir(renamed_marker_aln_dir)
    os.mkdir(renamed_marker_aln_dir_trimmed)

    # rename, align and trim each marker
    for marker_seq_file in marker_seq_list:

        f_base = sep_path_basename_ext(marker_seq_file)[1]
        pwd_renamed_marker_seq = '%s/%s.%s' % (renamed_marker_seq_dir, f_base, marker_seq_ext)
        pwd_renamed_marker_aln = '%s/%s.aln' % (renamed_marker_aln_dir, f_base)
        pwd_renamed_marker_aln_trimmed = '%s/%s.aln' % (renamed_marker_aln_dir_trimmed, f_base)

        # rename sequences: drop the last '_'-separated field of each id
        with open(pwd_renamed_marker_seq, 'w') as marker_hits_seq_renamed_handle:
            for each_seq in SeqIO.parse(marker_seq_file, 'fasta'):
                seq_gnm = '_'.join(each_seq.id.split('_')[:-1])
                marker_hits_seq_renamed_handle.write('>%s\n' % seq_gnm)
                marker_hits_seq_renamed_handle.write('%s\n' % str(each_seq.seq))

        # align
        mafft_cmd = 'mafft-einsi --thread %s --quiet %s > %s' % (num_of_threads, pwd_renamed_marker_seq, pwd_renamed_marker_aln)

        # trim
        if trim_with_bmge:
            trim_cmd = 'java -jar %s -i %s -m %s -t AA -h %s -of %s' % (pwd_bmge_jar, pwd_renamed_marker_aln, bmge_trim_model, bmge_entropy_score_cutoff, pwd_renamed_marker_aln_trimmed)
        else:
            trim_cmd = 'trimal -in %s -out %s -automated1' % (pwd_renamed_marker_aln, pwd_renamed_marker_aln_trimmed)

        # write out mafft cmds
        with open(cmds_1_mafft_txt, 'a') as cmds_1_mafft_txt_handle:
            cmds_1_mafft_txt_handle.write(mafft_cmd + '\n')

        # write out trim cmds
        with open(cmds_2_trim_txt, 'a') as cmds_2_trim_txt_handle:
            cmds_2_trim_txt_handle.write(trim_cmd + '\n')

        # run cmds
        os.system(mafft_cmd)
        os.system(trim_cmd)

    # concatenate alignments
    catfasta2phy(renamed_marker_aln_dir_trimmed, 'aln', concatenated_phy, concatenated_phy_partition)

    # prepare iqtree2 cmds
    os.mkdir(iqtree_dir)
    get_guide_tree_cmd = 'iqtree2 --seqtype AA -T %s -B 1000 --alrt 1000 --quiet -s %s --prefix %s/guide_tree -m LG ' % (num_of_threads, concatenated_phy, iqtree_dir, )
    get_c60_tree_cmd = 'iqtree2 --seqtype AA -T %s -B 1000 --alrt 1000 --quiet -s %s --prefix %s/concatenated -m LG+C60+G+F -ft %s' % (num_of_threads, concatenated_phy, iqtree_dir, pwd_guide_tree)

    # write out iqtree2 cmds
    with open(cmds_3_iqtree_txt, 'a') as cmds_3_iqtree_txt_handle:
        cmds_3_iqtree_txt_handle.write(get_guide_tree_cmd + '\n')
        cmds_3_iqtree_txt_handle.write(get_c60_tree_cmd + '\n')

    # run alignment_pruner.pl
    # NOTE(review): msa_file_list is collected but not used further in this function
    msa_file_list = [concatenated_phy_fasta]
    if alignment_pruner_cutoffs is not None:
        msa_file_list.extend(pruneMSA(concatenated_phy_fasta, alignment_pruner_cutoffs))

    # the iqtree2 commands are left for the user to run
    print('You may want to submit the following commands to infer tree')
    print('To be added...')
    print('Done!')
284
+
285
+
286
if __name__ == '__main__':

    # command-line entry point: parse the options and pass them to
    # MarkerSeq2Tree() as a plain dict keyed by option name
    MarkerSeq2Tree_parser = argparse.ArgumentParser()
    MarkerSeq2Tree_parser.add_argument('-i', required=True, help='marker seq dir')
    MarkerSeq2Tree_parser.add_argument('-x', required=True, help='marker seq ext')
    MarkerSeq2Tree_parser.add_argument('-o', required=True, help='output dir')
    MarkerSeq2Tree_parser.add_argument('-t', required=False, type=int, default=1, help='num of threads')
    # help text corrected: with -bmge each marker alignment is trimmed by BMGE in place of trimAl
    MarkerSeq2Tree_parser.add_argument('-bmge', required=False, action="store_true", help='trim marker alignments with BMGE instead of trimAl')
    MarkerSeq2Tree_parser.add_argument('-bmge_m', required=False, default='BLOSUM30', help='BMGE trim model, default: BLOSUM30')
    MarkerSeq2Tree_parser.add_argument('-bmge_esc', required=False, default='0.55', help='BMGE entropy score cutoff, default: 0.55')
    MarkerSeq2Tree_parser.add_argument('-prune', required=False, default=None, help='conservation cutoffs for alignment_pruner.pl')
    MarkerSeq2Tree_parser.add_argument('-f', required=False, action="store_true", help='force overwrite')
    args = vars(MarkerSeq2Tree_parser.parse_args())
    MarkerSeq2Tree(args)
@@ -0,0 +1,259 @@
1
+ import os
2
+ import glob
3
+ import argparse
4
+ from Bio import SeqIO
5
+ from Bio import AlignIO
6
+ from distutils.spawn import find_executable
7
+
8
+
9
# Usage text for MarkerSeq2Tree: lists the external dependencies and one
# example command line.
MarkerSeq2Tree_usage = '''
================= MarkerSeq2Tree example commands =================

Dependencies: mafft, trimal, bmge and iqtree2

TreeSAK MarkerSeq2Tree -i best_25 -x fa -o op_dir -t 12 -f -bmge

===================================================================
'''
18
+
19
+
20
def sep_path_basename_ext(file_in):
    """Return ``(directory, basename, extension)`` for a file path.

    The directory defaults to '.' when the path has no directory part; the
    extension includes its leading dot, or is '' when there is none.
    """
    head, tail = os.path.split(file_in)
    stem, suffix = os.path.splitext(tail)

    # a bare file name lives in the current directory
    if head == '':
        head = '.'

    return head, stem, suffix
31
+
32
+
33
def catfasta2phy(msa_dir, msa_ext, concatenated_msa_phy, partition_file):
    """Concatenate per-gene FASTA alignments into a single alignment.

    Every file matching ``<msa_dir>/*.<msa_ext>`` is read as a FASTA
    alignment.  Alignments are concatenated in sorted file-name order; a
    sequence id missing from an alignment is padded with '-' over that
    alignment's length.

    Files written:
        * ``<concatenated_msa_phy>.fasta``: concatenated alignment (FASTA).
        * ``partition_file``: one ``<file name> = <start>-<end>`` line per
          input alignment (1-based, inclusive coordinates).
        * ``concatenated_msa_phy``: the concatenated alignment converted to
          the 'phylip-relaxed' format.

    Prints a message and calls ``exit()`` when the sequences of an input
    alignment do not all share one length.
    """
    concatenated_msa_fasta = '%s.fasta' % concatenated_msa_phy
    msa_file_re = '%s/*.%s' % (msa_dir, msa_ext)
    msa_file_list_sorted = sorted(os.path.basename(file_name) for file_name in glob.glob(msa_file_re))

    # read every alignment exactly once; dicts are keyed by file name so two
    # files can never collide on a derived gene id
    complete_gnm_set = set()
    msa_seq_dict = dict()
    msa_len_dict = dict()
    for each_msa_file in msa_file_list_sorted:
        pwd_current_msa = '%s/%s' % (msa_dir, each_msa_file)
        current_msa_len = 0
        current_msa_len_set = set()
        current_msa_seq_dict = dict()
        for each_seq in SeqIO.parse(pwd_current_msa, 'fasta'):
            complete_gnm_set.add(each_seq.id)
            current_msa_seq_dict[each_seq.id] = str(each_seq.seq)
            current_msa_len_set.add(len(each_seq.seq))
            current_msa_len = len(each_seq.seq)

        if len(current_msa_len_set) != 1:
            print('Sequences with different length were found in %s, program exited!' % each_msa_file)
            exit()

        msa_seq_dict[each_msa_file] = current_msa_seq_dict
        msa_len_dict[each_msa_file] = current_msa_len

    complete_gnm_list_sorted = sorted(complete_gnm_set)

    # write out concatenated msa
    with open(concatenated_msa_fasta, 'w') as concatenated_msa_handle:
        for each_gnm in complete_gnm_list_sorted:
            concatenated_seq = ''.join(
                msa_seq_dict[each_m].get(each_gnm, msa_len_dict[each_m] * '-')
                for each_m in msa_file_list_sorted)
            concatenated_msa_handle.write('>%s\n' % each_gnm)
            concatenated_msa_handle.write('%s\n' % concatenated_seq)

    # write out partition file
    end_pos = 0
    with open(partition_file, 'w') as partition_file_handle:
        for each_m in msa_file_list_sorted:
            current_m_len = msa_len_dict[each_m]
            partition_file_handle.write('%s = %s-%s\n' % (each_m, (end_pos + 1), (end_pos + current_m_len)))
            end_pos += current_m_len

    # convert msa in fasta to phy
    AlignIO.convert(concatenated_msa_fasta, 'fasta', concatenated_msa_phy, 'phylip-relaxed')
95
+
96
+
97
def get_gap_stats(msa_in_fa, stats_txt):
    """Write the gap percentage of every sequence in a FASTA alignment.

    Args:
        msa_in_fa: path to the alignment in FASTA format.
        stats_txt: path of the tab-separated report to write.  It has the
            header ``Sequence<TAB>Gap`` followed by one line per sequence id,
            sorted by increasing gap percentage (rounded to two decimals).
    """
    gap_pct_dict = dict()
    for each_seq in SeqIO.parse(msa_in_fa, 'fasta'):
        seq_str = str(each_seq.seq)
        # a record without any residues has no gaps to report; avoid dividing by zero
        gap_pct = seq_str.count('-')*100/len(seq_str) if seq_str else 0.0
        gap_pct_dict[each_seq.id] = float("{0:.2f}".format(gap_pct))

    gap_pct_sorted = sorted(gap_pct_dict.items(), key=lambda x: x[1])

    with open(stats_txt, 'w') as stats_txt_handle:
        stats_txt_handle.write('Sequence\tGap\n')
        for seq_id, gap_pct in gap_pct_sorted:
            stats_txt_handle.write('%s\t%s\n' % (seq_id, gap_pct))
114
+
115
+
116
def BMGE(msa_in, op_prefix, trim_model, entropy_score_cutoff):
    """Trim an amino-acid alignment by running BMGE.jar through the shell.

    The jar is looked up in the directory of this source file.  Output is
    requested in four formats, written to ``<op_prefix>.BMGE.phylip``,
    ``.fasta``, ``.nexus`` and ``.html``.  The command is printed before it
    is executed; its exit status is not checked.
    """
    # locate BMGE.jar next to this source file
    script_dir = '/'.join(os.path.realpath(__file__).split('/')[:-1])
    jar_file = '%s/BMGE.jar' % script_dir

    # output files, one per requested format
    out_phylip = '%s.BMGE.phylip' % op_prefix
    out_fasta = '%s.BMGE.fasta' % op_prefix
    out_nexus = '%s.BMGE.nexus' % op_prefix
    out_html = '%s.BMGE.html' % op_prefix

    cmd_fields = (jar_file, msa_in, trim_model, entropy_score_cutoff,
                  out_phylip, out_fasta, out_nexus, out_html)
    bmge_cmd = 'java -jar %s -i %s -m %s -t AA -h %s -op %s -of %s -on %s -oh %s' % cmd_fields

    print('Running %s' % bmge_cmd)
    os.system(bmge_cmd)
132
+
133
+
134
def MarkerSeq2Tree(args):
    """Build a concatenated-marker tree from per-marker sequence files.

    For every ``<args['i']>/*.<args['x']>`` FASTA file the sequences are
    renamed to their id minus the last '_'-separated field, aligned with
    mafft-einsi and trimmed with trimAl.  The trimmed alignments are
    concatenated with catfasta2phy(); when ``args['bmge']`` is set the
    concatenated alignment is further trimmed with BMGE() and that result is
    used for tree inference.  Two iqtree2 runs follow: a guide tree under LG,
    then LG+C60+G+F with the guide tree passed through ``-ft``.

    Args:
        args: dict with the keys 'i' (marker sequence dir), 'x' (marker file
            extension), 'o' (output dir), 't' (number of threads), 'bmge'
            (bool), 'bmge_m' (BMGE model), 'bmge_esc' (BMGE entropy score
            cutoff) and 'f' (force overwrite).

    Prints a message and calls ``exit()`` when a required program is missing
    or when the output directory exists and 'f' is not set.
    """
    import shutil  # function-scope import: shutil is not imported at module level

    marker_seq_dir = args['i']
    marker_seq_ext = args['x']
    op_dir = args['o']
    num_of_threads = args['t']
    run_bmge = args['bmge']
    bmge_trim_model = args['bmge_m']
    bmge_entropy_score_cutoff = args['bmge_esc']
    force_overwrite = args['f']

    # check dependencies; java is only needed when BMGE is requested
    needed_programs = ['mafft-einsi', 'trimal', 'iqtree2']
    if run_bmge:
        needed_programs.append('java')
    not_detected_programs = [p for p in needed_programs if shutil.which(p) is None]
    if not_detected_programs:
        print('%s not detected, program exited!' % ', '.join(not_detected_programs))
        exit()

    # get marker sequence files
    marker_seq_re = '%s/*.%s' % (marker_seq_dir, marker_seq_ext)
    marker_seq_list = sorted(glob.glob(marker_seq_re))

    # define output files and dirs
    renamed_marker_seq_dir = '%s/renamed_markers' % op_dir
    renamed_marker_aln_dir = '%s/renamed_markers_aln' % op_dir
    renamed_marker_aln_dir_trimmed = '%s/renamed_markers_aln_trimmed' % op_dir
    concatenated_phy = '%s/concatenated.phy' % op_dir
    concatenated_phy_fasta = '%s/concatenated.phy.fasta' % op_dir
    concatenated_phy_fasta_bmge = '%s/concatenated.BMGE.fasta' % op_dir
    concatenated_phy_partition = '%s/concatenated_partition.txt' % op_dir
    bmge_op_prefix = '%s/concatenated' % op_dir
    iqtree_dir = '%s/iqtree_wd' % op_dir
    cmds_1_mafft_txt = '%s/cmds_1_mafft.txt' % op_dir
    cmds_2_trimal_txt = '%s/cmds_2_trimal.txt' % op_dir
    cmds_3_iqtree_txt = '%s/cmds_3_iqtree2.txt' % op_dir
    pwd_guide_tree = '%s/iqtree_wd/guide_tree.treefile' % op_dir

    # create output folder
    if os.path.isdir(op_dir):
        if force_overwrite:
            # remove without a shell so unusual characters in the path are safe
            shutil.rmtree(op_dir)
        else:
            print('%s exist, program exited!' % op_dir)
            exit()
    os.mkdir(op_dir)
    os.mkdir(renamed_marker_seq_dir)
    os.mkdir(renamed_marker_aln_dir)
    os.mkdir(renamed_marker_aln_dir_trimmed)

    # rename, align and trim each marker
    for marker_seq_file in marker_seq_list:

        f_base = sep_path_basename_ext(marker_seq_file)[1]
        pwd_renamed_marker_seq = '%s/%s.%s' % (renamed_marker_seq_dir, f_base, marker_seq_ext)
        pwd_renamed_marker_aln = '%s/%s.aln' % (renamed_marker_aln_dir, f_base)
        pwd_renamed_marker_aln_trimmed = '%s/%s.aln' % (renamed_marker_aln_dir_trimmed, f_base)

        # rename sequences: drop the last '_'-separated field of each id
        with open(pwd_renamed_marker_seq, 'w') as marker_hits_seq_renamed_handle:
            for each_seq in SeqIO.parse(marker_seq_file, 'fasta'):
                seq_gnm = '_'.join(each_seq.id.split('_')[:-1])
                marker_hits_seq_renamed_handle.write('>%s\n' % seq_gnm)
                marker_hits_seq_renamed_handle.write('%s\n' % str(each_seq.seq))

        # align and trim
        mafft_cmd = 'mafft-einsi --thread %s --quiet %s > %s' % (num_of_threads, pwd_renamed_marker_seq, pwd_renamed_marker_aln)
        trimal_cmd = 'trimal -in %s -out %s -automated1' % (pwd_renamed_marker_aln, pwd_renamed_marker_aln_trimmed)

        # write out mafft cmds
        with open(cmds_1_mafft_txt, 'a') as cmds_1_mafft_txt_handle:
            cmds_1_mafft_txt_handle.write(mafft_cmd + '\n')

        # write out trimal cmds
        with open(cmds_2_trimal_txt, 'a') as cmds_2_trimal_txt_handle:
            cmds_2_trimal_txt_handle.write(trimal_cmd + '\n')

        # run cmds
        os.system(mafft_cmd)
        os.system(trimal_cmd)

    # concatenate alignments
    catfasta2phy(renamed_marker_aln_dir_trimmed, 'aln', concatenated_phy, concatenated_phy_partition)

    # optionally trim the concatenated alignment with BMGE and use its FASTA output
    msa_to_use = concatenated_phy
    if run_bmge:
        BMGE(concatenated_phy_fasta, bmge_op_prefix, bmge_trim_model, bmge_entropy_score_cutoff)
        msa_to_use = concatenated_phy_fasta_bmge

    # prepare iqtree2 cmds
    os.mkdir(iqtree_dir)
    get_guide_tree_cmd = 'iqtree2 --seqtype AA -T %s -B 1000 --alrt 1000 --quiet -s %s --prefix %s/guide_tree -m LG ' % (num_of_threads, msa_to_use, iqtree_dir, )
    get_c60_tree_cmd = 'iqtree2 --seqtype AA -T %s -B 1000 --alrt 1000 --quiet -s %s --prefix %s/concatenated -m LG+C60+G+F -ft %s' % (num_of_threads, msa_to_use, iqtree_dir, pwd_guide_tree)

    # write out iqtree2 cmds
    with open(cmds_3_iqtree_txt, 'a') as cmds_3_iqtree_txt_handle:
        cmds_3_iqtree_txt_handle.write(get_guide_tree_cmd + '\n')
        cmds_3_iqtree_txt_handle.write(get_c60_tree_cmd + '\n')

    # run cmds
    print('Running iqtree')
    os.system(get_guide_tree_cmd)
    os.system(get_c60_tree_cmd)

    print('Done!')
245
+
246
+
247
if __name__ == '__main__':

    # command-line entry point: every option is declared in one table, then
    # the parsed values are passed to MarkerSeq2Tree() as a plain dict
    parser = argparse.ArgumentParser()
    option_table = [
        ('-i', dict(required=True, help='marker seq dir')),
        ('-x', dict(required=True, help='marker seq ext')),
        ('-o', dict(required=True, help='output dir')),
        ('-t', dict(required=False, type=int, default=1, help='num of threads')),
        ('-bmge', dict(required=False, action="store_true", help='perform BMGE trimming on concatenated MSA')),
        ('-bmge_m', dict(required=False, default='BLOSUM30', help='BMGE trim model, default: BLOSUM30')),
        ('-bmge_esc', dict(required=False, default='0.55', help='BMGE entropy score cutoff, default: 0.55')),
        ('-f', dict(required=False, action="store_true", help='force overwrite')),
    ]
    for option_flag, option_kwargs in option_table:
        parser.add_argument(option_flag, **option_kwargs)
    args = vars(parser.parse_args())
    MarkerSeq2Tree(args)
TreeSAK/ModifyTopo.py ADDED
@@ -0,0 +1,116 @@
1
+ from ete3 import Tree
2
+ from os.path import join, dirname, exists
3
+
4
+
5
def get_topology_without_alpha(st, two_nodes_1, two_nodes_2):
    """Return a copy of ``st`` with one clade removed, plus the attachment node.

    The clade is the common ancestor of the leaves in ``two_nodes_1``.

    * If that clade has exactly 10 leaves, it is detached from its parent and
      ``(tree_copy, grandparent, parent)`` is returned.
    * Otherwise every child of the clade is removed, the common ancestor of
      ``two_nodes_2`` is attached as its only child, and
      ``(tree_copy, clade, clade)`` is returned.

    ``st`` itself is not modified; all edits happen on the copy.
    """
    _st = st.copy()
    n_replaced = _st.get_common_ancestor(two_nodes_1)

    if len(n_replaced.get_leaf_names()) == 10:
        n_p = n_replaced.up
        n_p_p = n_p.up  # All proteobacteria
        n_p.remove_child(n_replaced)
        return _st, n_p_p, n_p

    # look the second clade up before the children are detached
    beta_gamma = _st.get_common_ancestor(two_nodes_2)
    # iterate over a copy: remove_child() edits the children list
    for child in n_replaced.children[::]:
        n_replaced.remove_child(child)
    n_replaced.add_child(beta_gamma)
    # The original returned an undefined name `all_pro` here (NameError).
    # NOTE(review): n_replaced is presumably the intended node -- confirm.
    all_pro = n_replaced
    return _st, all_pro, n_replaced
21
+
22
+
23
def read_tree(in_tree, format=None):
    """Return a Tree for ``in_tree``.

    Args:
        in_tree: path of an existing tree file, or an already built Tree
            (returned unchanged).
        format: value passed to ``Tree(..., format=...)`` for the file
            content, or 'auto' to try formats 0 to 5 in order and return the
            first one that parses.

    Raises:
        IOError: if ``in_tree`` is neither an existing path nor a Tree, or if
            'auto' was requested and none of the formats 0 to 5 could parse
            the file.
    """
    if isinstance(in_tree, Tree):
        return in_tree

    if not (isinstance(in_tree, str) and exists(in_tree)):
        raise IOError('unknown input')

    if format == 'auto':
        for f in [0, 1, 2, 3, 4, 5]:
            try:
                return Tree(in_tree, format=f)
            except Exception:
                # this format does not fit the file; try the next one
                continue
        # previously this fell through to an unbound local variable
        raise IOError('unable to parse %s with newick formats 0-5' % in_tree)

    with open(in_tree) as tree_handle:
        return Tree(tree_handle.read(), format=format)
40
+
41
+
42
def erase_name(in_tree_file, format=0):
    """Load a tree with read_tree() and blank the name of every non-leaf node.

    ``in_tree_file`` and ``format`` are passed to read_tree() unchanged.
    The tree is returned with leaf names left as they were.
    """
    tree = read_tree(in_tree_file, format=format)
    internal_nodes = (node for node in tree.traverse() if not node.is_leaf())
    for node in internal_nodes:
        node.name = ''
    return tree
49
+
50
+
51
def get_mito(dataset):
    """Build the 'Mito' subtree from the reference tree.

    The tree file named by the module-level ``euk_reference_tree`` is loaded
    (format 8), pruned to the entries of ``dataset`` and passed through
    erase_name().  The result is returned as ``{"Mito": tree}``.
    """
    reference = Tree(euk_reference_tree, 8)
    reference.prune(dataset)
    return {"Mito": erase_name(reference)}
57
+
58
+
59
+ ########################################################################################################################
60
+
61
topo_in_txt = ''

# input files and output location
intree = './dating/topology/mixture_models/deno100/phy/deno100.final_TP1.newick'
euk_reference_tree = '/mnt/home-backup/thliao/AOB/analysis/update_calibrations/mito_dating/phylo/manual_topology/euk.tre'
euk_list_txt = '/mnt/home-backup/thliao/AOB/analysis/update_calibrations/mito_dating/euk.list'
base_odir = './dating/topology'

# leaves used to locate clades: the common ancestor of each list is looked up
Rickettsiales_lineage = ['GCA_008189685.1', 'GCA_003015145.1']
Magneto_lineage = ['GCA_000014865.1', 'GCA_002109495.1']
remaining_alpha = ['GCA_000264455.2', 'GCA_002924445.1']
Holo = 'GCA_000469665.2'  # a single node name, not a pair
two_nodes_1 = ['GCA_000014865.1', 'GCA_000264455.2']
two_nodes_2 = ['GCA_018655245.1', 'GCA_002356115.1']

# tree skeletons; each clade label is later replaced by a newick string
TP_dict = {
    'TP1': '((((Holo,other_alpha),Rick),Mito),Magneto);',
    'TP2': '(((Holo,other_alpha),(Rick,Mito)),Magneto);',
    'TP3': '((((Holo,Rick),other_alpha),Mito),Magneto);',
    'TP4': '((((Mito,Rick),Holo),other_alpha),Magneto);',
}
79
+
80
+ # copy from '/home-user/sswang/project/Mito/results/euk_tree/euk.tre'. The Fig S2A in wang 2021 NC
81
+ # rephrase Porphyra purpurea into Porphyra umbilicalis
82
+ # add Polysphondylium pallidum manually
83
+
84
+ '''
85
+
86
+ 1. provide a input tree
87
+ 2. provide a set of tree skeleton
88
+ 3. use two leaves to determine a clade.
89
+
90
+ '''
91
+
92
+ ########################################################################################################################
93
+
94
+
95
# Read the leaf names once, outside the loop.  Blank lines are dropped: a
# trailing newline would otherwise add an empty name to the list handed to
# get_mito() (presumably not a valid leaf name -- confirm against prune()).
with open(euk_list_txt) as euk_list_handle:
    mito_usage = [line.strip() for line in euk_list_handle if line.strip()]

for tp_name, TP in TP_dict.items():

    print('%s\t%s' % (tp_name, TP))

    # load the input tree and get a copy with the target clade removed
    st = Tree(intree, 8)
    _st, all_pro, n_replaced = get_topology_without_alpha(st.copy(), two_nodes_1, two_nodes_2)

    m_dict = get_mito(mito_usage)

    # clade label -> node whose newick string replaces the label in the skeleton
    nodes_dict = {"Rick"        : st.get_common_ancestor(Rickettsiales_lineage),
                  "other_alpha" : st.get_common_ancestor(remaining_alpha),
                  "Magneto"     : st.get_common_ancestor(Magneto_lineage),
                  "Holo"        : [_ for _ in st.traverse() if _.name == Holo][0],
                  "Mito"        : m_dict['Mito']}

    for k, n in nodes_dict.items():
        TP = TP.replace(k, n.write(format=3).strip(';'))

    # attach the filled-in skeleton where the clade was removed
    # NOTE(review): the modified tree _st is not written out anywhere in this script
    n_replaced.add_child(Tree(TP, format=3))
116
+