treesak 1.51.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of treesak might be problematic. Click here for more details.

Files changed (125) hide show
  1. TreeSAK/ALE.py +63 -0
  2. TreeSAK/ALE1.py +268 -0
  3. TreeSAK/ALE2.py +168 -0
  4. TreeSAK/ALE2RTC.py +30 -0
  5. TreeSAK/ALE3.py +205 -0
  6. TreeSAK/ALE4.py +636 -0
  7. TreeSAK/ALE5.py +210 -0
  8. TreeSAK/ALE6.py +401 -0
  9. TreeSAK/ALE7.py +126 -0
  10. TreeSAK/ALE_backup.py +1081 -0
  11. TreeSAK/AssessCVG.py +128 -0
  12. TreeSAK/AssessMarker.py +306 -0
  13. TreeSAK/AssessMarkerDeltaLL.py +257 -0
  14. TreeSAK/AssessMarkerPA.py +317 -0
  15. TreeSAK/AssessPB.py +130 -0
  16. TreeSAK/BMGE.jar +0 -0
  17. TreeSAK/BMGE.py +49 -0
  18. TreeSAK/CompareMCMC.py +138 -0
  19. TreeSAK/ConcateMSA.py +111 -0
  20. TreeSAK/ConvertMSA.py +135 -0
  21. TreeSAK/Dir.rb +82 -0
  22. TreeSAK/ExtractMarkerSeq.py +263 -0
  23. TreeSAK/FastRoot.py +1175 -0
  24. TreeSAK/FastRoot_backup.py +1122 -0
  25. TreeSAK/FigTree.py +34 -0
  26. TreeSAK/GTDB_tree.py +76 -0
  27. TreeSAK/GeneTree.py +142 -0
  28. TreeSAK/KEGG_Luo17.py +807 -0
  29. TreeSAK/LcaToLeaves.py +66 -0
  30. TreeSAK/MarkerRef2Tree.py +616 -0
  31. TreeSAK/MarkerRef2Tree_backup.py +628 -0
  32. TreeSAK/MarkerSeq2Tree.py +290 -0
  33. TreeSAK/MarkerSeq2Tree_backup.py +259 -0
  34. TreeSAK/ModifyTopo.py +116 -0
  35. TreeSAK/Newick_tree_plotter.py +79 -0
  36. TreeSAK/OMA.py +170 -0
  37. TreeSAK/OMA2.py +212 -0
  38. TreeSAK/OneLineAln.py +50 -0
  39. TreeSAK/PB.py +155 -0
  40. TreeSAK/PMSF.py +106 -0
  41. TreeSAK/PhyloBiAssoc.R +84 -0
  42. TreeSAK/PhyloBiAssoc.py +167 -0
  43. TreeSAK/PlotMCMC.py +41 -0
  44. TreeSAK/PlotMcmcNode.py +152 -0
  45. TreeSAK/PlotMcmcNode_old.py +252 -0
  46. TreeSAK/RootTree.py +101 -0
  47. TreeSAK/RootTreeGTDB214.py +288 -0
  48. TreeSAK/RootTreeGTDB220.py +300 -0
  49. TreeSAK/RootTreeGTDB226.py +300 -0
  50. TreeSAK/SequentialDating.py +16 -0
  51. TreeSAK/SingleAleHGT.py +157 -0
  52. TreeSAK/SingleLinePhy.py +50 -0
  53. TreeSAK/SliceMSA.py +142 -0
  54. TreeSAK/SplitScore.py +19 -0
  55. TreeSAK/SplitScore1.py +178 -0
  56. TreeSAK/SplitScore1OMA.py +148 -0
  57. TreeSAK/SplitScore2.py +597 -0
  58. TreeSAK/TaxaCountStats.R +256 -0
  59. TreeSAK/TaxonTree.py +47 -0
  60. TreeSAK/TreeSAK_config.py +32 -0
  61. TreeSAK/VERSION +158 -0
  62. TreeSAK/VisHPD95.R +45 -0
  63. TreeSAK/VisHPD95.py +200 -0
  64. TreeSAK/__init__.py +0 -0
  65. TreeSAK/ale_parser.py +74 -0
  66. TreeSAK/ale_splitter.py +63 -0
  67. TreeSAK/alignment_pruner.pl +1471 -0
  68. TreeSAK/assessOG.py +45 -0
  69. TreeSAK/catfasta2phy.py +140 -0
  70. TreeSAK/cogTree.py +185 -0
  71. TreeSAK/compare_trees.R +30 -0
  72. TreeSAK/compare_trees.py +255 -0
  73. TreeSAK/dating.py +264 -0
  74. TreeSAK/dating_ss.py +361 -0
  75. TreeSAK/deltall.py +82 -0
  76. TreeSAK/do_rrtc.rb +464 -0
  77. TreeSAK/fa2phy.py +42 -0
  78. TreeSAK/format_leaf_name.py +70 -0
  79. TreeSAK/gap_stats.py +38 -0
  80. TreeSAK/get_SCG_tree.py +742 -0
  81. TreeSAK/get_arCOG_seq.py +97 -0
  82. TreeSAK/global_functions.py +222 -0
  83. TreeSAK/gnm_leaves.py +43 -0
  84. TreeSAK/iTOL.py +791 -0
  85. TreeSAK/iTOL_gene_tree.py +80 -0
  86. TreeSAK/itol_msa_stats.py +56 -0
  87. TreeSAK/keep_highest_rrtc.py +37 -0
  88. TreeSAK/koTree.py +194 -0
  89. TreeSAK/label_tree.R +75 -0
  90. TreeSAK/label_tree.py +121 -0
  91. TreeSAK/mad.py +708 -0
  92. TreeSAK/mcmc2tree.py +58 -0
  93. TreeSAK/mcmcTC copy.py +92 -0
  94. TreeSAK/mcmcTC.py +104 -0
  95. TreeSAK/mcmctree_vs_reltime.R +44 -0
  96. TreeSAK/mcmctree_vs_reltime.py +252 -0
  97. TreeSAK/merge_pdf.py +32 -0
  98. TreeSAK/pRTC.py +56 -0
  99. TreeSAK/parse_mcmctree.py +198 -0
  100. TreeSAK/parse_reltime.py +141 -0
  101. TreeSAK/phy2fa.py +37 -0
  102. TreeSAK/plot_distruibution_th.py +165 -0
  103. TreeSAK/prep_mcmctree_ctl.py +92 -0
  104. TreeSAK/print_leaves.py +32 -0
  105. TreeSAK/pruneMSA.py +63 -0
  106. TreeSAK/recode.py +73 -0
  107. TreeSAK/remove_bias.R +112 -0
  108. TreeSAK/rename_leaves.py +77 -0
  109. TreeSAK/replace_clade.py +55 -0
  110. TreeSAK/root_with_out_group.py +84 -0
  111. TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
  112. TreeSAK/subsample_drep_gnms.py +74 -0
  113. TreeSAK/subset.py +69 -0
  114. TreeSAK/subset_tree_stupid_old_way.py +193 -0
  115. TreeSAK/supertree.py +330 -0
  116. TreeSAK/tmp_1.py +19 -0
  117. TreeSAK/tmp_2.py +19 -0
  118. TreeSAK/tmp_3.py +120 -0
  119. TreeSAK/weighted_rand.rb +23 -0
  120. treesak-1.51.2.data/scripts/TreeSAK +950 -0
  121. treesak-1.51.2.dist-info/LICENSE +674 -0
  122. treesak-1.51.2.dist-info/METADATA +27 -0
  123. treesak-1.51.2.dist-info/RECORD +125 -0
  124. treesak-1.51.2.dist-info/WHEEL +5 -0
  125. treesak-1.51.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,290 @@
1
+ import os
2
+ import glob
3
+ import argparse
4
+ from Bio import SeqIO
5
+ from Bio import AlignIO
6
+ from distutils.spawn import find_executable
7
+
8
+
9
+ MarkerSeq2Tree_usage = '''
10
+ ======================== MarkerSeq2Tree example commands ========================
11
+
12
+ Dependencies: mafft, trimal, bmge, perl and iqtree2
13
+
14
+ TreeSAK MarkerSeq2Tree -i best_25 -x fa -o op_dir -t 12 -f -bmge -prune 10,20,30
15
+
16
+ =================================================================================
17
+ '''
18
+
19
+
20
def sep_path_basename_ext(file_in):
    """Split a file path into (directory, base name, extension).

    The directory is reported as '.' when ``file_in`` has no directory part.
    The extension keeps its leading dot and is '' when there is none.
    """
    directory, leaf = os.path.split(file_in)
    stem, suffix = os.path.splitext(leaf)
    return ('.' if directory == '' else directory), stem, suffix
28
+
29
+
30
def catfasta2phy(msa_dir, msa_ext, concatenated_msa_phy, partition_file):
    """Concatenate per-gene FASTA alignments into a single alignment.

    Every file matching ``<msa_dir>/*.<msa_ext>`` is parsed as FASTA and the
    alignments are joined in sorted file-name order. A sequence id that is
    absent from an alignment is padded with '-' over that alignment's length.

    Files written:
        <concatenated_msa_phy>.fasta  concatenated alignment in FASTA
        concatenated_msa_phy          the same alignment as relaxed PHYLIP
        partition_file                one '<file name> = <start>-<end>' line
                                      per alignment (1-based, inclusive)

    Raises SystemExit (after printing a message) when the records of an
    alignment file do not all share exactly one length.
    """
    concatenated_msa_fasta = '%s.fasta' % concatenated_msa_phy
    msa_file_re = '%s/*.%s' % (msa_dir, msa_ext)
    msa_file_list_sorted = sorted(os.path.basename(f) for f in glob.glob(msa_file_re))

    # first pass: collect every sequence id seen in any alignment
    complete_gnm_set = set()
    for each_msa_file in msa_file_list_sorted:
        for each_seq in SeqIO.parse('%s/%s' % (msa_dir, each_msa_file), 'fasta'):
            complete_gnm_set.add(each_seq.id)
    complete_gnm_list_sorted = sorted(complete_gnm_set)

    # second pass: validate each alignment and append its columns
    gnm_to_seq_parts = {gnm: [] for gnm in complete_gnm_list_sorted}
    # Keyed by the full file name: the former key, file.split('.' + ext)[0],
    # collides whenever '.<ext>' also occurs inside a file name.
    msa_len_dict = dict()
    for each_msa_file in msa_file_list_sorted:
        current_msa_len_set = set()
        current_msa_seq_dict = dict()
        for each_seq in SeqIO.parse('%s/%s' % (msa_dir, each_msa_file), 'fasta'):
            current_msa_seq_dict[each_seq.id] = str(each_seq.seq)
            current_msa_len_set.add(len(each_seq.seq))

        if len(current_msa_len_set) != 1:
            print('Sequences with different length were found in %s, program exited!' % each_msa_file)
            # same exit status as the previous exit() call, without relying
            # on the site-provided builtin
            raise SystemExit

        current_msa_len = next(iter(current_msa_len_set))
        msa_len_dict[each_msa_file] = current_msa_len

        gap_only = current_msa_len * '-'
        for each_gnm in complete_gnm_list_sorted:
            gnm_to_seq_parts[each_gnm].append(current_msa_seq_dict.get(each_gnm, gap_only))

    # write out concatenated msa
    with open(concatenated_msa_fasta, 'w') as concatenated_msa_handle:
        for each_gnm in complete_gnm_list_sorted:
            concatenated_msa_handle.write('>%s\n' % each_gnm)
            concatenated_msa_handle.write('%s\n' % ''.join(gnm_to_seq_parts[each_gnm]))

    # write out partition file
    end_pos = 0
    with open(partition_file, 'w') as partition_file_handle:
        for each_m in msa_file_list_sorted:
            current_m_len = msa_len_dict[each_m]
            partition_file_handle.write('%s = %s-%s\n' % (each_m, (end_pos + 1), (end_pos + current_m_len)))
            end_pos += current_m_len

    # convert msa in fasta to phy
    AlignIO.convert(concatenated_msa_fasta, 'fasta', concatenated_msa_phy, 'phylip-relaxed')
92
+
93
+
94
def get_gap_stats(msa_in_fa, stats_txt):
    """Write the gap percentage of every sequence in a FASTA alignment.

    ``stats_txt`` receives a 'Sequence<TAB>Gap' header followed by one line
    per sequence id, sorted by ascending gap percentage (rounded to two
    decimals). A zero-length record is reported as 0.0 instead of raising
    ZeroDivisionError.
    """
    gap_pct_dict = dict()
    for each_seq in SeqIO.parse(msa_in_fa, 'fasta'):
        seq_str = str(each_seq.seq)
        if seq_str:
            gap_pct = float("{0:.2f}".format(seq_str.count('-')*100/len(seq_str)))
        else:
            # an empty record has no columns, hence no gaps to report
            gap_pct = 0.0
        gap_pct_dict[each_seq.id] = gap_pct

    gap_pct_sorted = sorted(gap_pct_dict.items(), key=lambda x: x[1])

    with open(stats_txt, 'w') as stats_txt_handle:
        stats_txt_handle.write('Sequence\tGap\n')
        for seq_id, gap_pct in gap_pct_sorted:
            stats_txt_handle.write('%s\t%s\n' % (seq_id, gap_pct))
111
+
112
+
113
def BMGE(msa_in, op_prefix, trim_model, entropy_score_cutoff):
    """Run the BMGE.jar located next to this module on an amino-acid alignment.

    msa_in                input alignment (-i)
    op_prefix             prefix of the four outputs: <prefix>.BMGE.phylip,
                          .BMGE.fasta, .BMGE.nexus and .BMGE.html
    trim_model            value passed to -m
    entropy_score_cutoff  value passed to -h

    The command is printed and then executed through the shell; its exit
    status is not checked.
    """
    import shlex  # local import: keeps this block self-contained

    # define file name
    msa_out_phylip = '%s.BMGE.phylip' % op_prefix
    msa_out_fasta = '%s.BMGE.fasta' % op_prefix
    msa_out_nexus = '%s.BMGE.nexus' % op_prefix
    msa_out_html = '%s.BMGE.html' % op_prefix

    # specify path to BMGE.jar (os.path handles the separator; splitting on '/' did not)
    pwd_bmge_jar = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'BMGE.jar')

    # run BMGE; every value is shell-quoted so that paths containing spaces or
    # shell metacharacters reach java as single arguments
    cmd_values = (pwd_bmge_jar, msa_in, trim_model, entropy_score_cutoff,
                  msa_out_phylip, msa_out_fasta, msa_out_nexus, msa_out_html)
    bmge_cmd = 'java -jar %s -i %s -m %s -t AA -h %s -op %s -of %s -on %s -oh %s' % tuple(
        shlex.quote(str(value)) for value in cmd_values)
    print('Running %s' % bmge_cmd)
    os.system(bmge_cmd)
129
+
130
+
131
def pruneMSA(msa_in, conserved_cutoffs):
    """Run alignment_pruner.pl on ``msa_in`` once per conservation cutoff.

    ``conserved_cutoffs`` is a comma-separated string of percentages, e.g.
    '10,20,30'. Each cutoff is divided by 100 and passed to
    --conserved_threshold. The output goes next to the input as
    '<base>.pruner<cutoff><ext>'. Whitespace around a cutoff and empty entries
    (such as a trailing comma) are ignored: a space used to end up inside the
    output file name and break the shell redirection.
    """
    import shlex  # local import: keeps this block self-contained

    msa_path, msa_base, msa_ext = sep_path_basename_ext(msa_in)

    alignment_pruner_pl = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'alignment_pruner.pl')
    cutoff_list = [token.strip() for token in conserved_cutoffs.split(',') if token.strip()]

    op_file_list = []
    cmd_template = 'perl %s --file %s --conserved_threshold %s > %s'
    for each_cutoff in cutoff_list:
        # 20 -> '.2' (the leading zero is dropped, as before)
        cutoff_formatted = str(float(each_cutoff)/100).replace('0.', '.')
        current_msa_out = '%s/%s.pruner%s%s' % (msa_path, msa_base, each_cutoff, msa_ext)
        quoted_in = shlex.quote(msa_in)
        quoted_out = shlex.quote(current_msa_out)
        perl_cmd = cmd_template % (shlex.quote(alignment_pruner_pl), quoted_in, cutoff_formatted, quoted_out)
        # the reported command shows the bare script name rather than its full path
        perl_cmd_for_report = cmd_template % ('alignment_pruner.pl', quoted_in, cutoff_formatted, quoted_out)
        op_file_list.append(current_msa_out)
        print(perl_cmd_for_report)
        os.system(perl_cmd)

    # report
    print('Pruned MSA exported to:')
    print('\n'.join(op_file_list))
152
+
153
+
154
def MarkerSeq2Tree(args):
    """Align, trim and concatenate marker sequences and prepare iqtree2 commands.

    ``args`` is a dict with the keys:
        i         directory holding the marker sequence files
        x         extension of the marker sequence files
        o         output directory
        t         number of threads given to mafft-einsi and iqtree2
        bmge      True to trim each alignment with BMGE, otherwise trimal
        bmge_m    BMGE model (-m)
        bmge_esc  BMGE entropy score cutoff (-h)
        f         True to replace an existing output directory
        prune     comma-separated cutoffs for alignment_pruner.pl, or None

    For every marker file the sequences are renamed (sequence id with its
    last '_'-separated field removed), aligned with mafft-einsi and trimmed.
    The trimmed alignments are concatenated with catfasta2phy and optionally
    pruned with pruneMSA. The iqtree2 commands are written to
    cmds_3_iqtree2.txt and printed; they are not executed here.

    Raises SystemExit when a required program is missing, or when the output
    directory exists and overwriting was not requested.
    """
    import shlex   # local imports: keep this block self-contained
    import shutil

    marker_seq_dir = args['i']
    marker_seq_ext = args['x']
    op_dir = args['o']
    num_of_threads = args['t']
    # normalised to a strict bool so the directory naming and the command
    # selection below can never disagree
    trim_with_bmge = args['bmge'] is True
    bmge_trim_model = args['bmge_m']
    bmge_entropy_score_cutoff = args['bmge_esc']
    force_overwrite = args['f']
    alignment_pruner_cutoffs = args['prune']
    quote = shlex.quote

    # specify path to BMGE.jar
    pwd_bmge_jar = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'BMGE.jar')

    # check dependencies; java and perl are only needed for the optional steps
    needed_programs = ['mafft-einsi', 'trimal', 'iqtree2']
    if trim_with_bmge:
        needed_programs.append('java')
    if alignment_pruner_cutoffs is not None:
        needed_programs.append('perl')
    # shutil.which is the stdlib replacement for distutils' find_executable
    not_detected_programs = [p for p in needed_programs if shutil.which(p) is None]
    if not_detected_programs:
        print('%s not detected, program exited!' % ', '.join(not_detected_programs))
        raise SystemExit

    # get marker id set
    marker_seq_re = '%s/*.%s' % (marker_seq_dir, marker_seq_ext)
    marker_seq_list = sorted(glob.glob(marker_seq_re))

    # define output dir
    renamed_marker_seq_dir = '%s/renamed_markers' % op_dir
    renamed_marker_aln_dir = '%s/renamed_markers_aln' % op_dir
    if trim_with_bmge:
        cmds_2_trim_txt = '%s/cmds_2_BMGE.txt' % op_dir
        renamed_marker_aln_dir_trimmed = '%s/renamed_markers_aln_BMGE' % op_dir
    else:
        cmds_2_trim_txt = '%s/cmds_2_trimal.txt' % op_dir
        renamed_marker_aln_dir_trimmed = '%s/renamed_markers_aln_trimal' % op_dir
    concatenated_phy = '%s/concatenated.phy' % op_dir
    concatenated_phy_fasta = '%s/concatenated.phy.fasta' % op_dir
    concatenated_phy_partition = '%s/concatenated_partition.txt' % op_dir
    iqtree_dir = '%s/iqtree_wd' % op_dir
    cmds_1_mafft_txt = '%s/cmds_1_mafft.txt' % op_dir
    cmds_3_iqtree_txt = '%s/cmds_3_iqtree2.txt' % op_dir
    pwd_guide_tree = '%s/iqtree_wd/guide_tree.treefile' % op_dir

    # create output folder
    if os.path.isdir(op_dir):
        if force_overwrite is True:
            # rmtree instead of an unquoted 'rm -r <dir>' shell string
            shutil.rmtree(op_dir)
        else:
            print('%s exist, program exited!' % op_dir)
            raise SystemExit
    os.mkdir(op_dir)
    os.mkdir(renamed_marker_seq_dir)
    os.mkdir(renamed_marker_aln_dir)
    os.mkdir(renamed_marker_aln_dir_trimmed)

    # rename, align and trim each marker
    for marker_seq_file in marker_seq_list:

        f_path, f_base, f_ext = sep_path_basename_ext(marker_seq_file)
        pwd_renamed_marker_seq = '%s/%s.%s' % (renamed_marker_seq_dir, f_base, marker_seq_ext)
        pwd_renamed_marker_aln = '%s/%s.aln' % (renamed_marker_aln_dir, f_base)
        pwd_renamed_marker_aln_trimmed = '%s/%s.aln' % (renamed_marker_aln_dir_trimmed, f_base)

        # rename sequences: drop the last '_'-separated field of the id.
        # rsplit keeps an id without any '_' unchanged; it used to be written
        # as an empty header.
        with open(pwd_renamed_marker_seq, 'w') as marker_hits_seq_renamed_handle:
            for each_seq in SeqIO.parse(marker_seq_file, 'fasta'):
                seq_gnm = each_seq.id.rsplit('_', 1)[0]
                marker_hits_seq_renamed_handle.write('>%s\n' % seq_gnm)
                marker_hits_seq_renamed_handle.write('%s\n' % str(each_seq.seq))

        # align
        mafft_cmd = 'mafft-einsi --thread %s --quiet %s > %s' % (num_of_threads, quote(pwd_renamed_marker_seq), quote(pwd_renamed_marker_aln))

        # trim
        if trim_with_bmge:
            trim_cmd = 'java -jar %s -i %s -m %s -t AA -h %s -of %s' % (quote(pwd_bmge_jar), quote(pwd_renamed_marker_aln), quote(str(bmge_trim_model)), quote(str(bmge_entropy_score_cutoff)), quote(pwd_renamed_marker_aln_trimmed))
        else:
            trim_cmd = 'trimal -in %s -out %s -automated1' % (quote(pwd_renamed_marker_aln), quote(pwd_renamed_marker_aln_trimmed))

        # write out mafft cmds
        with open(cmds_1_mafft_txt, 'a') as cmds_1_mafft_txt_handle:
            cmds_1_mafft_txt_handle.write(mafft_cmd + '\n')

        # write out trimming cmds
        with open(cmds_2_trim_txt, 'a') as cmds_2_trim_txt_handle:
            cmds_2_trim_txt_handle.write(trim_cmd + '\n')

        # run cmds
        os.system(mafft_cmd)
        os.system(trim_cmd)

    # concatenate alignments
    catfasta2phy(renamed_marker_aln_dir_trimmed, 'aln', concatenated_phy, concatenated_phy_partition)

    # prepare iqtree2 cmds
    os.mkdir(iqtree_dir)
    get_guide_tree_cmd = 'iqtree2 --seqtype AA -T %s -B 1000 --alrt 1000 --quiet -s %s --prefix %s/guide_tree -m LG ' % (num_of_threads, quote(concatenated_phy), quote(iqtree_dir))
    get_c60_tree_cmd = 'iqtree2 --seqtype AA -T %s -B 1000 --alrt 1000 --quiet -s %s --prefix %s/concatenated -m LG+C60+G+F -ft %s' % (num_of_threads, quote(concatenated_phy), quote(iqtree_dir), quote(pwd_guide_tree))

    # write out iqtree2 cmds
    with open(cmds_3_iqtree_txt, 'a') as cmds_3_iqtree_txt_handle:
        cmds_3_iqtree_txt_handle.write(get_guide_tree_cmd + '\n')
        cmds_3_iqtree_txt_handle.write(get_c60_tree_cmd + '\n')

    # run alignment_pruner.pl
    if alignment_pruner_cutoffs is not None:
        pruneMSA(concatenated_phy_fasta, alignment_pruner_cutoffs)

    # iqtree2 is not run here, so show the commands instead of announcing a
    # run that does not happen
    print('You may want to submit the following commands to infer tree')
    print(get_guide_tree_cmd)
    print(get_c60_tree_cmd)
    print('Done!')
275
+
276
+
277
if __name__ == '__main__':

    # Command-line entry point: parse the options into a dict and run the
    # pipeline. The -bmge help now says what this version does: BMGE replaces
    # trimal on every marker alignment, not on the concatenated MSA.
    MarkerSeq2Tree_parser = argparse.ArgumentParser()
    MarkerSeq2Tree_parser.add_argument('-i', required=True, help='marker seq dir')
    MarkerSeq2Tree_parser.add_argument('-x', required=True, help='marker seq ext')
    MarkerSeq2Tree_parser.add_argument('-o', required=True, help='output dir')
    MarkerSeq2Tree_parser.add_argument('-t', required=False, type=int, default=1, help='num of threads')
    MarkerSeq2Tree_parser.add_argument('-bmge', required=False, action="store_true", help='trim each marker alignment with BMGE instead of trimal')
    MarkerSeq2Tree_parser.add_argument('-bmge_m', required=False, default='BLOSUM30', help='BMGE trim model, default: BLOSUM30')
    MarkerSeq2Tree_parser.add_argument('-bmge_esc', required=False, default='0.55', help='BMGE entropy score cutoff, default: 0.55')
    MarkerSeq2Tree_parser.add_argument('-prune', required=False, default=None, help='comma-separated conservation cutoffs for alignment_pruner.pl, e.g. 10,20,30')
    MarkerSeq2Tree_parser.add_argument('-f', required=False, action="store_true", help='force overwrite')
    args = vars(MarkerSeq2Tree_parser.parse_args())
    MarkerSeq2Tree(args)
@@ -0,0 +1,259 @@
1
+ import os
2
+ import glob
3
+ import argparse
4
+ from Bio import SeqIO
5
+ from Bio import AlignIO
6
+ from distutils.spawn import find_executable
7
+
8
+
9
+ MarkerSeq2Tree_usage = '''
10
+ ================= MarkerSeq2Tree example commands =================
11
+
12
+ Dependencies: mafft, trimal, bmge and iqtree2
13
+
14
+ TreeSAK MarkerSeq2Tree -i best_25 -x fa -o op_dir -t 12 -f -bmge
15
+
16
+ ===================================================================
17
+ '''
18
+
19
+
20
def sep_path_basename_ext(file_in):
    """Return (directory, base name, extension) for ``file_in``.

    A path without a directory part yields '.' as the directory; the
    extension includes its leading dot.
    """
    # directory part, defaulting to the current directory
    folder = os.path.dirname(file_in)
    if folder == '':
        folder = '.'

    # file name split at its last dot
    stem, suffix = os.path.splitext(os.path.basename(file_in))

    return folder, stem, suffix
31
+
32
+
33
def catfasta2phy(msa_dir, msa_ext, concatenated_msa_phy, partition_file):
    """Concatenate per-gene FASTA alignments into a single alignment.

    Every file matching ``<msa_dir>/*.<msa_ext>`` is parsed as FASTA and the
    alignments are joined in sorted file-name order. A sequence id that is
    absent from an alignment is padded with '-' over that alignment's length.

    Files written:
        <concatenated_msa_phy>.fasta  concatenated alignment in FASTA
        concatenated_msa_phy          the same alignment as relaxed PHYLIP
        partition_file                one '<file name> = <start>-<end>' line
                                      per alignment (1-based, inclusive)

    Raises SystemExit (after printing a message) when the records of an
    alignment file do not all share exactly one length.
    """
    concatenated_msa_fasta = '%s.fasta' % concatenated_msa_phy
    msa_file_re = '%s/*.%s' % (msa_dir, msa_ext)
    msa_file_list_sorted = sorted(os.path.basename(f) for f in glob.glob(msa_file_re))

    # first pass: collect every sequence id seen in any alignment
    complete_gnm_set = set()
    for each_msa_file in msa_file_list_sorted:
        for each_seq in SeqIO.parse('%s/%s' % (msa_dir, each_msa_file), 'fasta'):
            complete_gnm_set.add(each_seq.id)
    complete_gnm_list_sorted = sorted(complete_gnm_set)

    # second pass: validate each alignment and append its columns
    gnm_to_seq_parts = {gnm: [] for gnm in complete_gnm_list_sorted}
    # Keyed by the full file name: the former key, file.split('.' + ext)[0],
    # collides whenever '.<ext>' also occurs inside a file name.
    msa_len_dict = dict()
    for each_msa_file in msa_file_list_sorted:
        current_msa_len_set = set()
        current_msa_seq_dict = dict()
        for each_seq in SeqIO.parse('%s/%s' % (msa_dir, each_msa_file), 'fasta'):
            current_msa_seq_dict[each_seq.id] = str(each_seq.seq)
            current_msa_len_set.add(len(each_seq.seq))

        if len(current_msa_len_set) != 1:
            print('Sequences with different length were found in %s, program exited!' % each_msa_file)
            # same exit status as the previous exit() call, without relying
            # on the site-provided builtin
            raise SystemExit

        current_msa_len = next(iter(current_msa_len_set))
        msa_len_dict[each_msa_file] = current_msa_len

        gap_only = current_msa_len * '-'
        for each_gnm in complete_gnm_list_sorted:
            gnm_to_seq_parts[each_gnm].append(current_msa_seq_dict.get(each_gnm, gap_only))

    # write out concatenated msa
    with open(concatenated_msa_fasta, 'w') as concatenated_msa_handle:
        for each_gnm in complete_gnm_list_sorted:
            concatenated_msa_handle.write('>%s\n' % each_gnm)
            concatenated_msa_handle.write('%s\n' % ''.join(gnm_to_seq_parts[each_gnm]))

    # write out partition file
    end_pos = 0
    with open(partition_file, 'w') as partition_file_handle:
        for each_m in msa_file_list_sorted:
            current_m_len = msa_len_dict[each_m]
            partition_file_handle.write('%s = %s-%s\n' % (each_m, (end_pos + 1), (end_pos + current_m_len)))
            end_pos += current_m_len

    # convert msa in fasta to phy
    AlignIO.convert(concatenated_msa_fasta, 'fasta', concatenated_msa_phy, 'phylip-relaxed')
95
+
96
+
97
def get_gap_stats(msa_in_fa, stats_txt):
    """Write the gap percentage of every sequence in a FASTA alignment.

    ``stats_txt`` receives a 'Sequence<TAB>Gap' header followed by one line
    per sequence id, sorted by ascending gap percentage (rounded to two
    decimals). A zero-length record is reported as 0.0 instead of raising
    ZeroDivisionError.
    """
    gap_pct_dict = dict()
    for each_seq in SeqIO.parse(msa_in_fa, 'fasta'):
        seq_str = str(each_seq.seq)
        if seq_str:
            gap_pct = float("{0:.2f}".format(seq_str.count('-')*100/len(seq_str)))
        else:
            # an empty record has no columns, hence no gaps to report
            gap_pct = 0.0
        gap_pct_dict[each_seq.id] = gap_pct

    gap_pct_sorted = sorted(gap_pct_dict.items(), key=lambda x: x[1])

    with open(stats_txt, 'w') as stats_txt_handle:
        stats_txt_handle.write('Sequence\tGap\n')
        for seq_id, gap_pct in gap_pct_sorted:
            stats_txt_handle.write('%s\t%s\n' % (seq_id, gap_pct))
114
+
115
+
116
def BMGE(msa_in, op_prefix, trim_model, entropy_score_cutoff):
    """Run the BMGE.jar located next to this module on an amino-acid alignment.

    msa_in                input alignment (-i)
    op_prefix             prefix of the four outputs: <prefix>.BMGE.phylip,
                          .BMGE.fasta, .BMGE.nexus and .BMGE.html
    trim_model            value passed to -m
    entropy_score_cutoff  value passed to -h

    The command is printed and then executed through the shell; its exit
    status is not checked.
    """
    import shlex  # local import: keeps this block self-contained

    # define file name
    msa_out_phylip = '%s.BMGE.phylip' % op_prefix
    msa_out_fasta = '%s.BMGE.fasta' % op_prefix
    msa_out_nexus = '%s.BMGE.nexus' % op_prefix
    msa_out_html = '%s.BMGE.html' % op_prefix

    # specify path to BMGE.jar (os.path handles the separator; splitting on '/' did not)
    pwd_bmge_jar = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'BMGE.jar')

    # run BMGE; every value is shell-quoted so that paths containing spaces or
    # shell metacharacters reach java as single arguments
    cmd_values = (pwd_bmge_jar, msa_in, trim_model, entropy_score_cutoff,
                  msa_out_phylip, msa_out_fasta, msa_out_nexus, msa_out_html)
    bmge_cmd = 'java -jar %s -i %s -m %s -t AA -h %s -op %s -of %s -on %s -oh %s' % tuple(
        shlex.quote(str(value)) for value in cmd_values)
    print('Running %s' % bmge_cmd)
    os.system(bmge_cmd)
132
+
133
+
134
def MarkerSeq2Tree(args):
    """Align, trim and concatenate marker sequences, then infer trees with iqtree2.

    ``args`` is a dict with the keys:
        i         directory holding the marker sequence files
        x         extension of the marker sequence files
        o         output directory
        t         number of threads given to mafft-einsi and iqtree2
        bmge      True to run BMGE on the concatenated alignment
        bmge_m    BMGE model (-m)
        bmge_esc  BMGE entropy score cutoff (-h)
        f         True to replace an existing output directory

    For every marker file the sequences are renamed (sequence id with its
    last '_'-separated field removed), aligned with mafft-einsi and trimmed
    with trimal. The trimmed alignments are concatenated with catfasta2phy.
    A guide tree (LG) and a final tree (LG+C60+G+F with the guide tree passed
    via -ft) are then inferred with iqtree2.

    Raises SystemExit when a required program is missing, or when the output
    directory exists and overwriting was not requested.
    """
    import shlex   # local imports: keep this block self-contained
    import shutil

    marker_seq_dir = args['i']
    marker_seq_ext = args['x']
    op_dir = args['o']
    num_of_threads = args['t']
    run_bmge = args['bmge']
    bmge_trim_model = args['bmge_m']
    bmge_entropy_score_cutoff = args['bmge_esc']
    force_overwrite = args['f']
    quote = shlex.quote

    # check dependencies; java is only needed when BMGE was requested
    needed_programs = ['mafft-einsi', 'trimal', 'iqtree2']
    if run_bmge is True:
        needed_programs.append('java')
    # shutil.which is the stdlib replacement for distutils' find_executable
    not_detected_programs = [p for p in needed_programs if shutil.which(p) is None]
    if not_detected_programs:
        print('%s not detected, program exited!' % ', '.join(not_detected_programs))
        raise SystemExit

    # get marker id set
    marker_seq_re = '%s/*.%s' % (marker_seq_dir, marker_seq_ext)
    marker_seq_list = sorted(glob.glob(marker_seq_re))

    # define output dir
    renamed_marker_seq_dir = '%s/renamed_markers' % op_dir
    renamed_marker_aln_dir = '%s/renamed_markers_aln' % op_dir
    renamed_marker_aln_dir_trimmed = '%s/renamed_markers_aln_trimmed' % op_dir
    concatenated_phy = '%s/concatenated.phy' % op_dir
    concatenated_phy_fasta = '%s/concatenated.phy.fasta' % op_dir
    concatenated_phy_fasta_bmge = '%s/concatenated.BMGE.fasta' % op_dir
    concatenated_phy_partition = '%s/concatenated_partition.txt' % op_dir
    bmge_op_prefix = '%s/concatenated' % op_dir
    iqtree_dir = '%s/iqtree_wd' % op_dir
    cmds_1_mafft_txt = '%s/cmds_1_mafft.txt' % op_dir
    cmds_2_trimal_txt = '%s/cmds_2_trimal.txt' % op_dir
    cmds_3_iqtree_txt = '%s/cmds_3_iqtree2.txt' % op_dir
    pwd_guide_tree = '%s/iqtree_wd/guide_tree.treefile' % op_dir

    # create output folder
    if os.path.isdir(op_dir):
        if force_overwrite is True:
            # rmtree instead of an unquoted 'rm -r <dir>' shell string
            shutil.rmtree(op_dir)
        else:
            print('%s exist, program exited!' % op_dir)
            raise SystemExit
    os.mkdir(op_dir)
    os.mkdir(renamed_marker_seq_dir)
    os.mkdir(renamed_marker_aln_dir)
    os.mkdir(renamed_marker_aln_dir_trimmed)

    # rename, align and trim each marker
    for marker_seq_file in marker_seq_list:

        f_path, f_base, f_ext = sep_path_basename_ext(marker_seq_file)
        pwd_renamed_marker_seq = '%s/%s.%s' % (renamed_marker_seq_dir, f_base, marker_seq_ext)
        pwd_renamed_marker_aln = '%s/%s.aln' % (renamed_marker_aln_dir, f_base)
        pwd_renamed_marker_aln_trimmed = '%s/%s.aln' % (renamed_marker_aln_dir_trimmed, f_base)

        # rename sequences: drop the last '_'-separated field of the id.
        # rsplit keeps an id without any '_' unchanged; it used to be written
        # as an empty header.
        with open(pwd_renamed_marker_seq, 'w') as marker_hits_seq_renamed_handle:
            for each_seq in SeqIO.parse(marker_seq_file, 'fasta'):
                seq_gnm = each_seq.id.rsplit('_', 1)[0]
                marker_hits_seq_renamed_handle.write('>%s\n' % seq_gnm)
                marker_hits_seq_renamed_handle.write('%s\n' % str(each_seq.seq))

        # align and trim
        mafft_cmd = 'mafft-einsi --thread %s --quiet %s > %s' % (num_of_threads, quote(pwd_renamed_marker_seq), quote(pwd_renamed_marker_aln))
        trimal_cmd = 'trimal -in %s -out %s -automated1' % (quote(pwd_renamed_marker_aln), quote(pwd_renamed_marker_aln_trimmed))

        # write out mafft cmds
        with open(cmds_1_mafft_txt, 'a') as cmds_1_mafft_txt_handle:
            cmds_1_mafft_txt_handle.write(mafft_cmd + '\n')

        # write out trimal cmds
        with open(cmds_2_trimal_txt, 'a') as cmds_2_trimal_txt_handle:
            cmds_2_trimal_txt_handle.write(trimal_cmd + '\n')

        # run cmds
        os.system(mafft_cmd)
        os.system(trimal_cmd)

    # concatenate alignments
    catfasta2phy(renamed_marker_aln_dir_trimmed, 'aln', concatenated_phy, concatenated_phy_partition)

    # run BMGE on the concatenated alignment and use its FASTA output for
    # tree inference; otherwise use the concatenated PHYLIP file
    msa_to_use = concatenated_phy
    if run_bmge is True:
        BMGE(concatenated_phy_fasta, bmge_op_prefix, bmge_trim_model, bmge_entropy_score_cutoff)
        msa_to_use = concatenated_phy_fasta_bmge

    # run iqtree2
    os.mkdir(iqtree_dir)
    get_guide_tree_cmd = 'iqtree2 --seqtype AA -T %s -B 1000 --alrt 1000 --quiet -s %s --prefix %s/guide_tree -m LG ' % (num_of_threads, quote(msa_to_use), quote(iqtree_dir))
    get_c60_tree_cmd = 'iqtree2 --seqtype AA -T %s -B 1000 --alrt 1000 --quiet -s %s --prefix %s/concatenated -m LG+C60+G+F -ft %s' % (num_of_threads, quote(msa_to_use), quote(iqtree_dir), quote(pwd_guide_tree))

    # write out iqtree2 cmds
    with open(cmds_3_iqtree_txt, 'a') as cmds_3_iqtree_txt_handle:
        cmds_3_iqtree_txt_handle.write(get_guide_tree_cmd + '\n')
        cmds_3_iqtree_txt_handle.write(get_c60_tree_cmd + '\n')

    # run cmds
    print('Running iqtree')
    os.system(get_guide_tree_cmd)
    os.system(get_c60_tree_cmd)

    print('Done!')
245
+
246
+
247
if __name__ == '__main__':

    # Command-line options as (flag, add_argument keyword arguments), in the
    # order they are registered with the parser.
    option_specs = [
        ('-i', dict(required=True, help='marker seq dir')),
        ('-x', dict(required=True, help='marker seq ext')),
        ('-o', dict(required=True, help='output dir')),
        ('-t', dict(required=False, type=int, default=1, help='num of threads')),
        ('-bmge', dict(required=False, action="store_true", help='perform BMGE trimming on concatenated MSA')),
        ('-bmge_m', dict(required=False, default='BLOSUM30', help='BMGE trim model, default: BLOSUM30')),
        ('-bmge_esc', dict(required=False, default='0.55', help='BMGE entropy score cutoff, default: 0.55')),
        ('-f', dict(required=False, action="store_true", help='force overwrite')),
    ]

    parser = argparse.ArgumentParser()
    for option_flag, option_kwargs in option_specs:
        parser.add_argument(option_flag, **option_kwargs)

    # hand the parsed options to the pipeline as a plain dict
    args = vars(parser.parse_args())
    MarkerSeq2Tree(args)
TreeSAK/ModifyTopo.py ADDED
@@ -0,0 +1,116 @@
1
+ from ete3 import Tree
2
+ from os.path import join, dirname, exists
3
+
4
+
5
def get_topology_without_alpha(st, two_nodes_1, two_nodes_2):
    """Copy ``st`` and clear the clade defined by ``two_nodes_1``.

    The clade is the common ancestor of the leaves named in ``two_nodes_1``.

    * If that clade has exactly 10 leaves, it is detached from its parent and
      (tree copy, grandparent, parent) is returned.
    * Otherwise all children of the clade are removed, the common ancestor of
      ``two_nodes_2`` is attached as its only child, and
      (tree copy, clade node, clade node) is returned.

    The third element is the node a caller can attach a new subtree to.
    """
    _st = st.copy()
    n_replaced = _st.get_common_ancestor(two_nodes_1)

    if len(n_replaced.get_leaf_names()) == 10:
        n_p = n_replaced.up
        n_p_p = n_p.up  # All proteobacteria
        n_p.remove_child(n_replaced)
        return _st, n_p_p, n_p

    beta_gamma = _st.get_common_ancestor(two_nodes_2)
    # 'all_pro' was returned here without ever being assigned in this
    # function: a NameError, or a stale module-level value from an earlier
    # call.
    # NOTE(review): binding it to the emptied clade node is an assumption
    # (that node is the ancestor beta_gamma is re-attached to) - confirm.
    all_pro = n_replaced
    for child in n_replaced.children[::]:  # iterate a copy while removing
        n_replaced.remove_child(child)
    n_replaced.add_child(beta_gamma)
    return _st, all_pro, n_replaced
21
+
22
+
23
def read_tree(in_tree, format=None):
    """Return a Tree for ``in_tree``.

    in_tree  path of an existing newick file, or a Tree instance (returned
             as is)
    format   newick format passed to Tree, or 'auto' to try formats 0 to 5
             in order and return the first one that parses

    Raises IOError when ``in_tree`` is neither an existing path nor a Tree,
    or when 'auto' finds no format that parses the file. The latter case
    previously surfaced as an UnboundLocalError.
    """
    if isinstance(in_tree, str) and exists(in_tree):
        if format == 'auto':
            for f in [0, 1, 2, 3, 4, 5]:
                try:
                    return Tree(in_tree, format=f)
                except Exception:
                    # this format does not fit, try the next one
                    continue
            raise IOError('unable to parse %s with newick formats 0-5' % in_tree)
        with open(in_tree) as tree_handle:
            return Tree(tree_handle.read(), format=format)

    if isinstance(in_tree, Tree):
        return in_tree

    raise IOError('unknown input')
40
+
41
+
42
def erase_name(in_tree_file, format=0):
    """Load a tree with read_tree and blank the name of every non-leaf node.

    Returns the tree object that read_tree produced, modified in place.
    """
    tree = read_tree(in_tree_file, format=format)
    for node in tree.traverse():
        if node.is_leaf():
            continue
        node.name = ''
    return tree
49
+
50
+
51
def get_mito(dataset):
    """Return {"Mito": tree} for the leaves listed in ``dataset``.

    The tree is loaded from the module-level ``euk_reference_tree`` path
    (newick format 8), pruned to ``dataset`` and passed through erase_name
    so that internal nodes carry no name.
    """
    reference = Tree(euk_reference_tree, 8)
    reference.prune(dataset)
    return {"Mito": erase_name(reference)}
57
+
58
+
59
+ ########################################################################################################################
60
+
61
topo_in_txt = ''

# input tree, eukaryote reference tree / leaf list, and output base directory
intree = './dating/topology/mixture_models/deno100/phy/deno100.final_TP1.newick'
euk_reference_tree = '/mnt/home-backup/thliao/AOB/analysis/update_calibrations/mito_dating/phylo/manual_topology/euk.tre'
euk_list_txt = '/mnt/home-backup/thliao/AOB/analysis/update_calibrations/mito_dating/euk.list'
base_odir = "./dating/topology"

# each clade is identified by leaves whose common ancestor defines it
Rickettsiales_lineage = ['GCA_008189685.1', 'GCA_003015145.1']
Magneto_lineage = ["GCA_000014865.1", "GCA_002109495.1"]
remaining_alpha = ['GCA_000264455.2', 'GCA_002924445.1']
Holo = "GCA_000469665.2"
two_nodes_1 = ['GCA_000014865.1', 'GCA_000264455.2']
two_nodes_2 = ['GCA_018655245.1', 'GCA_002356115.1']

# tree skeletons; each placeholder name is replaced by the newick string of
# the matching clade in the loop below
TP_dict = {"TP1": "((((Holo,other_alpha),Rick),Mito),Magneto);",
           "TP2": "(((Holo,other_alpha),(Rick,Mito)),Magneto);",
           "TP3": "((((Holo,Rick),other_alpha),Mito),Magneto);",
           "TP4": "((((Mito,Rick),Holo),other_alpha),Magneto);"}

# copy from '/home-user/sswang/project/Mito/results/euk_tree/euk.tre'. The Fig S2A in wang 2021 NC
# rephrase Porphyra purpurea into Porphyra umbilicalis
# add Polysphondylium pallidum manually

'''

1. provide a input tree
2. provide a set of tree skeleton
3. use two leaves to determine a clade.

'''

########################################################################################################################

# Leaf names to keep in the eukaryote tree. Read once (the file does not
# change between iterations) and closed afterwards. Blank lines, such as the
# one produced by a trailing newline, are dropped so that no empty leaf name
# is handed to prune().
with open(euk_list_txt) as euk_list_handle:
    mito_usage = [name for name in euk_list_handle.read().split('\n') if name]

for tp_name, TP in TP_dict.items():

    print('%s\t%s' % (tp_name, TP))

    # fresh copy of the input tree for every skeleton
    st = Tree(intree, 8)
    _st, all_pro, n_replaced = get_topology_without_alpha(st.copy(), two_nodes_1, two_nodes_2)

    m_dict = get_mito(mito_usage)

    # placeholder name -> node whose subtree replaces it
    nodes_dict = {"Rick": st.get_common_ancestor(Rickettsiales_lineage),
                  "other_alpha": st.get_common_ancestor(remaining_alpha),
                  "Magneto": st.get_common_ancestor(Magneto_lineage),
                  "Holo": [_ for _ in st.traverse() if _.name == Holo][0],
                  "Mito": m_dict['Mito']}

    for k, n in nodes_dict.items():
        TP = TP.replace(k, n.write(format=3).strip(';'))

    # attach the assembled topology to the node left open in the copied tree
    # NOTE(review): the modified tree (_st) is not written or returned
    # anywhere in the visible code - confirm whether an output step is missing.
    n_replaced.add_child(Tree(TP, format=3))
116
+