treesak-1.53.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. TreeSAK/ALE.py +63 -0
  2. TreeSAK/ALE1.py +268 -0
  3. TreeSAK/ALE2.py +168 -0
  4. TreeSAK/ALE2RTC.py +30 -0
  5. TreeSAK/ALE3.py +205 -0
  6. TreeSAK/ALE4.py +636 -0
  7. TreeSAK/ALE5.py +210 -0
  8. TreeSAK/ALE6.py +401 -0
  9. TreeSAK/ALE7.py +126 -0
  10. TreeSAK/ALE_backup.py +1081 -0
  11. TreeSAK/AssessCVG.py +128 -0
  12. TreeSAK/AssessMarker.py +306 -0
  13. TreeSAK/AssessMarkerDeltaLL.py +257 -0
  14. TreeSAK/AssessMarkerPA.py +317 -0
  15. TreeSAK/AssessPB.py +113 -0
  16. TreeSAK/BMGE.jar +0 -0
  17. TreeSAK/BMGE.py +49 -0
  18. TreeSAK/C60SR4.nex +127 -0
  19. TreeSAK/CompareMCMC.py +138 -0
  20. TreeSAK/ConcateMSA.py +111 -0
  21. TreeSAK/ConvertMSA.py +135 -0
  22. TreeSAK/Dir.rb +82 -0
  23. TreeSAK/ExtractMarkerSeq.py +263 -0
  24. TreeSAK/FastRoot.py +1175 -0
  25. TreeSAK/FastRoot_backup.py +1122 -0
  26. TreeSAK/FigTree.py +34 -0
  27. TreeSAK/GTDB_tree.py +76 -0
  28. TreeSAK/GeneTree.py +142 -0
  29. TreeSAK/KEGG_Luo17.py +807 -0
  30. TreeSAK/LcaToLeaves.py +66 -0
  31. TreeSAK/MarkerRef2Tree.py +616 -0
  32. TreeSAK/MarkerRef2Tree_backup.py +628 -0
  33. TreeSAK/MarkerSeq2Tree.py +299 -0
  34. TreeSAK/MarkerSeq2Tree_backup.py +259 -0
  35. TreeSAK/ModifyTopo.py +116 -0
  36. TreeSAK/Newick_tree_plotter.py +79 -0
  37. TreeSAK/OMA.py +170 -0
  38. TreeSAK/OMA2.py +212 -0
  39. TreeSAK/OneLineAln.py +50 -0
  40. TreeSAK/PB.py +155 -0
  41. TreeSAK/PMSF.py +115 -0
  42. TreeSAK/PhyloBiAssoc.R +84 -0
  43. TreeSAK/PhyloBiAssoc.py +167 -0
  44. TreeSAK/PlotMCMC.py +41 -0
  45. TreeSAK/PlotMcmcNode.py +152 -0
  46. TreeSAK/PlotMcmcNode_old.py +252 -0
  47. TreeSAK/RootTree.py +101 -0
  48. TreeSAK/RootTreeGTDB.py +371 -0
  49. TreeSAK/RootTreeGTDB214.py +288 -0
  50. TreeSAK/RootTreeGTDB220.py +300 -0
  51. TreeSAK/SequentialDating.py +16 -0
  52. TreeSAK/SingleAleHGT.py +157 -0
  53. TreeSAK/SingleLinePhy.py +50 -0
  54. TreeSAK/SliceMSA.py +142 -0
  55. TreeSAK/SplitScore.py +21 -0
  56. TreeSAK/SplitScore1.py +177 -0
  57. TreeSAK/SplitScore1OMA.py +148 -0
  58. TreeSAK/SplitScore2.py +608 -0
  59. TreeSAK/TaxaCountStats.R +256 -0
  60. TreeSAK/TaxonTree.py +47 -0
  61. TreeSAK/TreeSAK_config.py +32 -0
  62. TreeSAK/VERSION +164 -0
  63. TreeSAK/VisHPD95.R +45 -0
  64. TreeSAK/VisHPD95.py +200 -0
  65. TreeSAK/__init__.py +0 -0
  66. TreeSAK/ale_parser.py +74 -0
  67. TreeSAK/ale_splitter.py +63 -0
  68. TreeSAK/alignment_pruner.pl +1471 -0
  69. TreeSAK/assessOG.py +45 -0
  70. TreeSAK/batch_itol.py +171 -0
  71. TreeSAK/catfasta2phy.py +140 -0
  72. TreeSAK/cogTree.py +185 -0
  73. TreeSAK/compare_trees.R +30 -0
  74. TreeSAK/compare_trees.py +255 -0
  75. TreeSAK/dating.py +264 -0
  76. TreeSAK/dating_ss.py +361 -0
  77. TreeSAK/deltall.py +82 -0
  78. TreeSAK/do_rrtc.rb +464 -0
  79. TreeSAK/fa2phy.py +42 -0
  80. TreeSAK/filter_rename_ar53.py +118 -0
  81. TreeSAK/format_leaf_name.py +70 -0
  82. TreeSAK/gap_stats.py +38 -0
  83. TreeSAK/get_SCG_tree.py +742 -0
  84. TreeSAK/get_arCOG_seq.py +97 -0
  85. TreeSAK/global_functions.py +222 -0
  86. TreeSAK/gnm_leaves.py +43 -0
  87. TreeSAK/iTOL.py +791 -0
  88. TreeSAK/iTOL_gene_tree.py +80 -0
  89. TreeSAK/itol_msa_stats.py +56 -0
  90. TreeSAK/keep_highest_rrtc.py +37 -0
  91. TreeSAK/koTree.py +194 -0
  92. TreeSAK/label_gene_tree_by_gnm.py +34 -0
  93. TreeSAK/label_tree.R +75 -0
  94. TreeSAK/label_tree.py +121 -0
  95. TreeSAK/mad.py +708 -0
  96. TreeSAK/mcmc2tree.py +58 -0
  97. TreeSAK/mcmcTC copy.py +92 -0
  98. TreeSAK/mcmcTC.py +104 -0
  99. TreeSAK/mcmctree_vs_reltime.R +44 -0
  100. TreeSAK/mcmctree_vs_reltime.py +252 -0
  101. TreeSAK/merge_pdf.py +32 -0
  102. TreeSAK/pRTC.py +56 -0
  103. TreeSAK/parse_mcmctree.py +198 -0
  104. TreeSAK/parse_reltime.py +141 -0
  105. TreeSAK/phy2fa.py +37 -0
  106. TreeSAK/plot_distruibution_th.py +165 -0
  107. TreeSAK/prep_mcmctree_ctl.py +92 -0
  108. TreeSAK/print_leaves.py +32 -0
  109. TreeSAK/pruneMSA.py +63 -0
  110. TreeSAK/recode.py +73 -0
  111. TreeSAK/remove_bias.R +112 -0
  112. TreeSAK/rename_leaves.py +78 -0
  113. TreeSAK/replace_clade.py +55 -0
  114. TreeSAK/root_with_out_group.py +84 -0
  115. TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
  116. TreeSAK/subsample_drep_gnms.py +74 -0
  117. TreeSAK/subset.py +69 -0
  118. TreeSAK/subset_tree_stupid_old_way.py +193 -0
  119. TreeSAK/supertree.py +330 -0
  120. TreeSAK/tmp_1.py +19 -0
  121. TreeSAK/tmp_2.py +19 -0
  122. TreeSAK/tmp_3.py +120 -0
  123. TreeSAK/tmp_4.py +43 -0
  124. TreeSAK/tmp_5.py +12 -0
  125. TreeSAK/weighted_rand.rb +23 -0
  126. treesak-1.53.3.data/scripts/TreeSAK +955 -0
  127. treesak-1.53.3.dist-info/LICENSE +674 -0
  128. treesak-1.53.3.dist-info/METADATA +27 -0
  129. treesak-1.53.3.dist-info/RECORD +131 -0
  130. treesak-1.53.3.dist-info/WHEEL +5 -0
  131. treesak-1.53.3.dist-info/top_level.txt +1 -0
TreeSAK/LcaToLeaves.py ADDED
@@ -0,0 +1,66 @@
+ import os
+ import argparse
+ from ete3 import Tree
+
+
+ LcaToLeaves_usage = '''
+ ==================== LcaToLeaves example commands ====================
+
+ BioSAK LcaToLeaves -s species_tree.ale.stree -n 123
+ BioSAK LcaToLeaves -s species_tree.ale.stree -n internal_nodes.txt
+
+ ======================================================================
+ '''
+
+
+ def lca_to_two_leaves(species_tree_from_ale, internal_node_id):
+
+     # read in ale species tree
+     stree_ale = Tree(species_tree_from_ale, format=1)
+
+     # get all leaves of the internal node
+     internal_node = stree_ale.search_nodes(name=internal_node_id)[0]
+     internal_node_leaf_object = internal_node.get_leaves()
+     internal_node_leaf_set = set()
+     for each_leaf in internal_node_leaf_object:
+         internal_node_leaf_set.add(each_leaf.name)
+
+     # get the two leaves needed
+     targeted_two_leaves = []
+     leaves_found = False
+     for leaf_1 in internal_node_leaf_set:
+         for leaf_2 in internal_node_leaf_set:
+             if leaf_1 != leaf_2:
+                 if leaves_found is False:
+                     current_lca_id = stree_ale.get_common_ancestor(leaf_1, leaf_2).name
+                     if current_lca_id == internal_node_id:
+                         targeted_two_leaves.append(leaf_1)
+                         targeted_two_leaves.append(leaf_2)
+                         leaves_found = True
+
+     return targeted_two_leaves[0], targeted_two_leaves[1]
+
+
+ def LcaToLeaves(args):
+
+     species_tree_from_ale = args['s']
+     internal_node = args['n']
+
+     internal_node_set = set()
+     if os.path.isfile(internal_node) is False:
+         internal_node_set.add(internal_node)
+     else:
+         for each_node in open(internal_node):
+             internal_node_set.add(each_node.strip())
+
+     for each_internal_node in internal_node_set:
+         leaf_1, leaf_2 = lca_to_two_leaves(species_tree_from_ale, each_internal_node)
+         print('%s\t%s\t%s' % (each_internal_node, leaf_1, leaf_2))
+
+ if __name__ == '__main__':
+
+     parser = argparse.ArgumentParser()
+     parser.add_argument('-s', required=True, help='the .stree file from ALE')
+     parser.add_argument('-n', required=True, help='internal node(s)')
+     args = vars(parser.parse_args())
+     LcaToLeaves(args)
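Usage note (a minimal sketch, not part of the package): the lca_to_two_leaves helper defined above can also be called directly after installing the wheel. This assumes ete3 is available; the tree file name and the internal-node label are hypothetical placeholders.

from TreeSAK.LcaToLeaves import lca_to_two_leaves

# map a named internal node of an ALE .stree file to a pair of leaves whose LCA it is
leaf_a, leaf_b = lca_to_two_leaves('species_tree.ale.stree', '12')  # hypothetical inputs
print('%s\t%s' % (leaf_a, leaf_b))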
TreeSAK/MarkerRef2Tree.py ADDED
@@ -0,0 +1,616 @@
+ import os
+ import glob
+ import argparse
+ from Bio import SeqIO
+ from Bio import AlignIO
+ import multiprocessing as mp
+ from distutils.spawn import find_executable
+
+
+ MarkerRef2Tree_usage = '''
+ ============================= MarkerRef2Tree example commands =============================
+
+ Dependencies: java, blastp, mafft-einsi, trimal, iqtree2
+
+ # example commands
+ TreeSAK MarkerRef2Tree -i faa_files -x faa -m marker_seq -mx fa -o output_dir -bmge -e 10 -t 6 -c 85
+ TreeSAK MarkerRef2Tree -i faa_files -x faa -m marker_seq -mx fa -o output_dir -bmge -e 10 -t 6 -c 75,100
+
+ # file extension need to be faa
+
+ ===========================================================================================
+ '''
+
+
+ def check_dependencies(program_list):
+
+     # check whether executables exist
+     not_detected_programs = []
+     for needed_program in program_list:
+         if find_executable(needed_program) is None:
+             not_detected_programs.append(needed_program)
+
+     # report
+     if not_detected_programs != []:
+         print('%s not found, program exited!' % ','.join(not_detected_programs))
+         exit()
+
+
+ def exe_cmds(cmd_list, num_threads):
+     print('Running %s commands with %s cores' % (len(cmd_list), num_threads))
+     pool = mp.Pool(processes=num_threads)
+     pool.map(os.system, cmd_list)
+     pool.close()
+     pool.join()
+
+
+ def sep_path_basename_ext(file_in):
+     file_path, file_name = os.path.split(file_in)
+     if file_path == '':
+         file_path = '.'
+     file_basename, file_extension = os.path.splitext(file_name)
+     return file_path, file_basename, file_extension
+
+
+ def catfasta2phy(msa_dir, msa_ext, concatenated_msa_phy, concatenated_msa_fasta, partition_file):
+
+     msa_file_re = '%s/*.%s' % (msa_dir, msa_ext)
+     msa_file_list = [os.path.basename(file_name) for file_name in glob.glob(msa_file_re)]
+     msa_file_list_sorted = sorted(msa_file_list)
+
+     complete_gnm_set = set()
+     for each_msa_file in msa_file_list:
+         pwd_msa = '%s/%s' % (msa_dir, each_msa_file)
+         for each_seq in SeqIO.parse(pwd_msa, 'fasta'):
+             complete_gnm_set.add(each_seq.id)
+
+     complete_gnm_list_sorted = sorted([i for i in complete_gnm_set])
+
+     # initialize concatenated msa dict
+     gnm_to_seq_dict = {i: '' for i in complete_gnm_list_sorted}
+     msa_len_dict = dict()
+     for each_msa_file in msa_file_list_sorted:
+         gene_id = each_msa_file.split('.' + msa_ext)[0]
+
+         # read in msa
+         current_msa_len = 0
+         current_msa_len_set = set()
+         pwd_current_msa = '%s/%s' % (msa_dir, each_msa_file)
+         current_msa_seq_dict = dict()
+         for each_seq in SeqIO.parse(pwd_current_msa, 'fasta'):
+             complete_gnm_set.add(each_seq.id)
+             current_msa_seq_dict[each_seq.id] = str(each_seq.seq)
+             current_msa_len_set.add(len(each_seq.seq))
+             current_msa_len = len(each_seq.seq)
+
+         if len(current_msa_len_set) != 1:
+             print('Sequences with different length were found in %s, program exited!' % each_msa_file)
+             exit()
+
+         msa_len_dict[gene_id] = current_msa_len
+
+         # add sequence to concatenated msa dict
+         for each_gnm in complete_gnm_list_sorted:
+             msa_seq = current_msa_seq_dict.get(each_gnm, current_msa_len*'-')
+             gnm_to_seq_dict[each_gnm] += msa_seq
+
+     # write out concatenated msa
+     concatenated_msa_handle = open(concatenated_msa_fasta, 'w')
+     for each_gnm in complete_gnm_list_sorted:
+         concatenated_msa_handle.write('>%s\n' % each_gnm)
+         concatenated_msa_handle.write('%s\n' % gnm_to_seq_dict[each_gnm])
+     concatenated_msa_handle.close()
+
+     # write out partition file
+     end_pos = 0
+     partition_file_handle = open(partition_file, 'w')
+     for each_m in msa_file_list_sorted:
+         gene_id = each_m.split('.' + msa_ext)[0]
+         current_m_len = msa_len_dict[gene_id]
+         partition_file_handle.write('%s = %s-%s\n' % (each_m, (end_pos + 1), (end_pos + current_m_len)))
+         end_pos += current_m_len
+     partition_file_handle.close()
+
+     # convert fasta to phy
+     AlignIO.convert(concatenated_msa_fasta, 'fasta', concatenated_msa_phy, 'phylip-relaxed')
+
+
+ def select_seq(seq_file, id_file,select_option, output_file, one_line, in_fastq):
+
+     # get provided id list
+     seq_id_list = set()
+     for seq_id in open(id_file):
+         seq_id_list.add(seq_id.strip())
+
+     seq_in_format = 'fasta'
+     if in_fastq is True:
+         seq_in_format = 'fastq'
+
+     # extract sequences
+     output_file_handle = open(output_file, 'w')
+     for seq_record in SeqIO.parse(seq_file, seq_in_format):
+         seq_id = seq_record.id
+         if select_option == 1:
+             if seq_id in seq_id_list:
+
+                 if in_fastq is False:
+                     if one_line is False:
+                         SeqIO.write(seq_record, output_file_handle, 'fasta')
+                     else:
+                         SeqIO.write(seq_record, output_file_handle, 'fasta-2line')
+                 else:
+                     SeqIO.write(seq_record, output_file_handle, 'fastq')
+
+         if select_option == 0:
+             if seq_id not in seq_id_list:
+
+                 if in_fastq is False:
+                     if one_line is False:
+                         SeqIO.write(seq_record, output_file_handle, 'fasta')
+                     else:
+                         SeqIO.write(seq_record, output_file_handle, 'fasta-2line')
+                 else:
+                     SeqIO.write(seq_record, output_file_handle, 'fastq')
+     output_file_handle.close()
+
+
+ def AssessMarkerPA(trimmed_aln_dir, gnm_set, group_to_gnm_dict, present_pct_cutoff_list, op_dir):
+
+     group_to_gnm_num_dict = dict()
+     gnm_to_group_dict = dict()
+     for each_g in group_to_gnm_dict:
+         gnm_member_list = group_to_gnm_dict[each_g]
+         group_to_gnm_num_dict[each_g] = len(gnm_member_list)
+         for each_gnm in gnm_member_list:
+             gnm_to_group_dict[each_gnm] = each_g
+     group_id_list_sorted = sorted(list(group_to_gnm_dict.keys()))
+
+     # exit program if group information is missing
+     gnms_without_group_info = set()
+     for gnm in gnm_set:
+         if gnm not in gnm_to_group_dict:
+             gnms_without_group_info.add(gnm)
+
+     if len(gnms_without_group_info) > 0:
+         print('Group information for the following genomes are missing, program exited!')
+         print(','.join(gnms_without_group_info))
+         print('Group information for the above genomes are missing, program exited!')
+         exit()
+
+     # read in provided cutoffs
+     assess_summary_1_txt = '%s/assess_by_PA.txt' % op_dir
+     assess_summary_2_txt = '%s/assess_by_PA_summary.txt' % op_dir
+     itol_binary_txt = '%s/assess_by_PA_iTOL_binary.txt' % op_dir
+
+     trimmed_aln_file_re = '%s/*.aln' % (trimmed_aln_dir)
+     trimmed_aln_file_list = [os.path.basename(file_name) for file_name in glob.glob(trimmed_aln_file_re)]
+
+     assess_summary_1_txt_handle = open(assess_summary_1_txt, 'w')
+     assess_summary_1_txt_handle.write('Marker\t%s\n' % '\t'.join([str(i) for i in group_id_list_sorted]))
+     assess_summary_2_txt_handle = open(assess_summary_2_txt, 'w')
+     assess_summary_2_txt_handle.write('Marker\t%s\n' % '\t'.join([str(i) for i in present_pct_cutoff_list]))
+     cutoff_to_qualified_marker_dict = dict()
+     gnm_to_identified_marker_dict = dict()
+     marker_id_list = []
+     for each_aln in trimmed_aln_file_list:
+
+         marker_id = each_aln.split(('.aln'))[0]
+         marker_id_list.append(marker_id)
+         pwd_aln = '%s/%s' % (trimmed_aln_dir, each_aln)
+
+         current_marker_num_by_group_dict = dict()
+         for each_seq in SeqIO.parse(pwd_aln, 'fasta'):
+             gnm_id = each_seq.id
+
+             # get genome to marker dist
+             if gnm_id not in gnm_to_identified_marker_dict:
+                 gnm_to_identified_marker_dict[gnm_id] = {marker_id}
+             else:
+                 gnm_to_identified_marker_dict[gnm_id].add(marker_id)
+
+             if gnm_id in gnm_to_group_dict:
+                 gnm_group = gnm_to_group_dict[gnm_id]
+                 if gnm_group not in current_marker_num_by_group_dict:
+                     current_marker_num_by_group_dict[gnm_group] = 1
+                 else:
+                     current_marker_num_by_group_dict[gnm_group] += 1
+             else:
+                 print('Not all genomes used to generate the MSA being found in -aa, program exited!')
+                 exit()
+
+         # write out assess_summary_1_txt
+         pct_list = []
+         for each_grp in group_id_list_sorted:
+             grp_pct = current_marker_num_by_group_dict.get(each_grp, 0)*100/group_to_gnm_num_dict[each_grp]
+             grp_pct = float("{0:.2f}".format(grp_pct))
+             pct_list.append(grp_pct)
+         assess_summary_1_txt_handle.write('%s\t%s\n' % (marker_id, '\t'.join([str(i) for i in pct_list])))
+
+         # write out assess_summary_2_txt
+         assess_list = []
+         for each_cutoff in present_pct_cutoff_list:
+
+             good_marker = True
+             for each_pct in pct_list:
+                 if each_pct < each_cutoff:
+                     good_marker = False
+
+             if each_cutoff not in cutoff_to_qualified_marker_dict:
+                 cutoff_to_qualified_marker_dict[each_cutoff] = {marker_id}
+
+             if good_marker is True:
+                 assess_list.append('1')
+                 cutoff_to_qualified_marker_dict[each_cutoff].add(marker_id)
+             else:
+                 assess_list.append('0')
+         assess_summary_2_txt_handle.write('%s\t%s\n' % (marker_id, '\t'.join(assess_list)))
+
+     # write out total in assess_summary_2_txt
+     total_stats_list = [str(len(cutoff_to_qualified_marker_dict[each_c])) for each_c in present_pct_cutoff_list]
+     assess_summary_2_txt_handle.write('Total\t%s\n' % ('\t'.join(total_stats_list)))
+     assess_summary_1_txt_handle.close()
+     assess_summary_2_txt_handle.close()
+
+     # copy alignments of qualified marker to corresponding folders
+     for each_cutoff in cutoff_to_qualified_marker_dict:
+         qualified_marker_set = cutoff_to_qualified_marker_dict[each_cutoff]
+         pwd_qualified_marker_dir = '%s/marker_PA%s' % (op_dir, each_cutoff)
+         pwd_qualified_marker_id_txt = '%s/marker_PA%s_id.txt' % (op_dir, each_cutoff)
+
+         os.system('mkdir %s' % pwd_qualified_marker_dir)
+         for each_marker in qualified_marker_set:
+             pwd_marker_aln = '%s/%s.aln' % (trimmed_aln_dir, each_marker)
+             cp_cmd = 'cp %s %s/' % (pwd_marker_aln, pwd_qualified_marker_dir)
+             os.system(cp_cmd)
+
+         # write out id
+         with open(pwd_qualified_marker_id_txt, 'w') as pwd_qualified_marker_id_txt_handle:
+             pwd_qualified_marker_id_txt_handle.write('%s\n' % '\n'.join(qualified_marker_set))
+
+     # write out iTOL file
+     itol_binary_txt_handle = open(itol_binary_txt, 'w')
+     itol_binary_txt_handle.write('DATASET_BINARY\n\nSEPARATOR TAB\nDATASET_LABEL\tlabel1\nCOLOR\t#85C1E9\n')
+     itol_binary_txt_handle.write('SHOW_LABELS\t1\nLABEL_ROTATION\t45\nLABEL_SHIFT\t5\n')
+     itol_binary_txt_handle.write('FIELD_LABELS\t%s\n' % '\t'.join(sorted(marker_id_list)))
+     itol_binary_txt_handle.write('FIELD_SHAPES\t%s\n' % '\t'.join(['1']*len(marker_id_list)))
+     itol_binary_txt_handle.write('\nDATA\n')
+     for each_g in gnm_to_identified_marker_dict:
+         g_identified_marker_set = gnm_to_identified_marker_dict[each_g]
+
+         pa_list = []
+         for each_m in sorted(marker_id_list):
+             if each_m in g_identified_marker_set:
+                 pa_list.append('1')
+             else:
+                 pa_list.append('-1')
+         itol_binary_txt_handle.write('%s\t%s\n' % (each_g, '\t'.join(pa_list)))
+     itol_binary_txt_handle.close()
+
+     print('Assessment results exported to:\n%s\n%s' % (assess_summary_1_txt, assess_summary_2_txt))
+
+
+ def get_gap_stats(msa_in_fa, stats_txt):
+
+     gap_pct_dict = dict()
+     for each_seq in SeqIO.parse(msa_in_fa, 'fasta'):
+         seq_id = each_seq.id
+         seq_str = str(each_seq.seq)
+         gap_pct = seq_str.count('-')*100/len(seq_str)
+         gap_pct = float("{0:.2f}".format(gap_pct))
+         gap_pct_dict[seq_id] = gap_pct
+
+     gap_pct_sorted = sorted(gap_pct_dict.items(), key=lambda x:x[1])
+
+     stats_txt_handle = open(stats_txt, 'w')
+     stats_txt_handle.write('Sequence\tGap\n')
+     for each_seq in gap_pct_sorted:
+         stats_txt_handle.write('%s\t%s\n' % (each_seq[0], each_seq[1]))
+     stats_txt_handle.close()
+
+
+ def BMGE(msa_in, op_prefix, trim_model, entropy_score_cutoff):
+
+     # define file name
+     msa_out_phylip = '%s.BMGE.phylip' % op_prefix
+     msa_out_fasta = '%s.BMGE.fasta' % op_prefix
+     msa_out_nexus = '%s.BMGE.nexus' % op_prefix
+     msa_out_html = '%s.BMGE.html' % op_prefix
+
+     # specify path to BMGE.jar
+     current_file_path = '/'.join(os.path.realpath(__file__).split('/')[:-1])
+     pwd_bmge_jar = '%s/BMGE.jar' % current_file_path
+
+     # run BMGE
+     bmge_cmd = 'java -jar %s -i %s -m %s -t AA -h %s -op %s -of %s -on %s -oh %s' % (pwd_bmge_jar, msa_in, trim_model, entropy_score_cutoff, msa_out_phylip, msa_out_fasta, msa_out_nexus, msa_out_html)
+     print('Running %s' % bmge_cmd)
+     os.system(bmge_cmd)
+
+
+ def MarkerRef2Tree(args):
+
+     faa_file_dir = args['i']
+     faa_file_ext = args['x']
+     marker_seq_dir = args['m']
+     marker_seq_ext = args['mx']
+     gnm_group_txt = args['g']
+     op_dir = args['o']
+     e_value = args['e']
+     num_of_threads = args['t']
+     force_overwrite = args['f']
+     pa_cutoff_str = args['c']
+     minimal_marker_number = args['mmn']
+     run_psiblast = args['psiblast']
+     trim_with_bmge = args['bmge']
+     bmge_trim_model = args['bmge_m']
+     bmge_entropy_score_cutoff = args['bmge_esc']
+
+     # check dependencies
+     check_dependencies(['java', 'psiblast', 'blastp', 'mafft-einsi', 'trimal', 'iqtree'])
+
+     # specify path to BMGE.jar
+     current_file_path = '/'.join(os.path.realpath(__file__).split('/')[:-1])
+     pwd_bmge_jar = '%s/BMGE.jar' % current_file_path
+
+     # get marker id set
+     marker_seq_re = '%s/*.%s' % (marker_seq_dir, marker_seq_ext)
+     marker_seq_list = [os.path.basename(file_name) for file_name in glob.glob(marker_seq_re)]
+     marker_id_set = set()
+     for each_marker_seq_file in marker_seq_list:
+         _, marker_seq_basename, _ = sep_path_basename_ext(each_marker_seq_file)
+         marker_id_set.add(marker_seq_basename)
+
+     # get gnm id list
+     faa_file_re = '%s/*.%s' % (faa_file_dir, faa_file_ext)
+     faa_file_list = [os.path.basename(file_name) for file_name in glob.glob(faa_file_re)]
+     gnm_set = set()
+     for each_faa_file in faa_file_list:
+         faa_path, faa_basename, faa_ext = sep_path_basename_ext(each_faa_file)
+         gnm_set.add(faa_basename)
+
+     gnm_id_list_sorted = sorted([i for i in gnm_set])
+
+     #################### check genome grouping files ####################
+
+     group_to_gnm_dict = dict()
+     if gnm_group_txt is None:
+         group_to_gnm_dict['group_1'] = set()
+         for each_gnm in gnm_id_list_sorted:
+             group_to_gnm_dict['group_1'].add(each_gnm)
+     else:
+         if os.path.isfile(gnm_group_txt) is False:
+             print('Specified %s not found, program exited!' % gnm_group_txt)
+             exit()
+         else:
+             for each_gnm in open(gnm_group_txt):
+                 each_gnm_split = each_gnm.strip().split('\t')
+                 gnm_id = each_gnm_split[0]
+                 domain_name = each_gnm_split[1]
+                 if gnm_id in gnm_set:
+                     if domain_name not in group_to_gnm_dict:
+                         group_to_gnm_dict[domain_name] = {gnm_id}
+                     else:
+                         group_to_gnm_dict[domain_name].add(gnm_id)
+
+     ############################################## check file/folder name ##############################################
+
+     blastp_cmd_txt = '%s/s01_blast_cmds_%s.txt' % (op_dir, (len(marker_seq_list)*len(faa_file_list)))
+     pwd_combined_protein = '%s/combined.faa' % op_dir
+     blast_op_dir = '%s/s01_blast' % op_dir
+     best_hit_id_by_marker_dir = '%s/s02_marker_id' % op_dir
+     best_hit_seq_by_marker_dir = '%s/s03_marker_seq' % op_dir
+     best_hit_seq_by_marker_dir_renamed = '%s/s04_marker_seq_by_genome_name' % op_dir
+     best_hit_aln_by_marker_dir = '%s/s05_marker_aln' % op_dir
+     best_hit_aln_by_marker_dir_trimmed = '%s/s06_marker_aln_trimal' % op_dir
+     if trim_with_bmge is True:
+         best_hit_aln_by_marker_dir_trimmed = '%s/s06_marker_aln_BMGE' % op_dir
+     assess_marker_pa_dir = '%s/s07_assess_marker_PA' % op_dir
+     trimmed_msa_PA_concatenated_dir = '%s/s08_iqtree_wd' % op_dir
+     iqtree_dir = '%s/s08_iqtree_wd' % op_dir
+     iqtree_cmd_txt = '%s/iqtree_cmds.txt' % iqtree_dir
+
+     ####################################################################################################################
+
+     # create folder
+     if os.path.isdir(op_dir) is True:
+         if force_overwrite is True:
+             os.system('rm -r %s' % op_dir)
+         else:
+             print('Output folder detected, program exited!')
+             exit()
+
+     os.system('mkdir %s' % op_dir)
+     os.system('mkdir %s' % blast_op_dir)
+
+     # get blastp command
+     blast_cmd_list = []
+     blast_op_to_cmd_dict = dict()
+     blastp_cmd_txt_handle = open(blastp_cmd_txt, 'w')
+     for gnm_id in gnm_id_list_sorted:
+         for each_cog in marker_id_set:
+             pwd_blast_op = '%s/%s_vs_%s_blastp.txt' % (blast_op_dir, gnm_id, each_cog)
+
+             blast_cmd = 'blastp -subject %s/%s.%s -evalue %s -outfmt 6 -query %s/%s.%s -out %s' % (marker_seq_dir, each_cog, marker_seq_ext, e_value, faa_file_dir, gnm_id, faa_file_ext, pwd_blast_op)
+             if run_psiblast is True:
+                 blast_cmd = 'psiblast -subject %s/%s.%s -evalue %s -outfmt 6 -query %s/%s.%s -out %s' % (marker_seq_dir, each_cog, marker_seq_ext, e_value, faa_file_dir, gnm_id, faa_file_ext, pwd_blast_op)
+
+             blast_op_to_cmd_dict[pwd_blast_op] = blast_cmd
+             blastp_cmd_txt_handle.write(blast_cmd + '\n')
+             blast_cmd_list.append(blast_cmd)
+     blastp_cmd_txt_handle.close()
+
+     # run blastp
+     if force_overwrite is True:
+         exe_cmds(blast_cmd_list, num_of_threads)
+     else:
+         cmds_to_rerun = []
+         num_of_good_ones = 0
+         for each_blast_op in blast_op_to_cmd_dict:
+
+             look_good = False
+             if os.path.isfile(each_blast_op) is True:
+                 look_good = True
+                 num_of_good_ones += 1
+
+             if look_good is False:
+                 cmds_to_rerun.append(blast_op_to_cmd_dict[each_blast_op])
+
+         print('Detected blastp outputs: %s' % num_of_good_ones)
+         exe_cmds(cmds_to_rerun, num_of_threads)
+
+     # get best_hit_dict_by_marker
+     best_hit_to_gnm_dict = dict()
+     best_hit_dict_by_marker = dict()
+     for gnm_id in gnm_id_list_sorted:
+         for each_cog in marker_id_set:
+             current_blastp_op = '%s/%s_vs_%s_blastp.txt' % (blast_op_dir, gnm_id, each_cog)
+             # get best hit
+             if os.path.isfile(current_blastp_op) is True:
+                 best_hit_gene = ''
+                 best_hit_score = 0
+                 for each_line in open(current_blastp_op):
+                     each_line_split = each_line.strip().split('\t')
+                     query_id = each_line_split[0]
+                     bit_score = float(each_line_split[11])
+                     if bit_score > best_hit_score:
+                         best_hit_score = bit_score
+                         best_hit_gene = query_id
+
+                 if best_hit_gene != '':
+                     best_hit_to_gnm_dict[best_hit_gene] = gnm_id
+
+                     if each_cog not in best_hit_dict_by_marker:
+                         best_hit_dict_by_marker[each_cog] = [best_hit_gene]
+                     else:
+                         best_hit_dict_by_marker[each_cog].append(best_hit_gene)
+
+     # create dir
+     if os.path.isdir(best_hit_id_by_marker_dir) is False:
+         os.system('mkdir %s' % best_hit_id_by_marker_dir)
+     if os.path.isdir(best_hit_seq_by_marker_dir) is False:
+         os.system('mkdir %s' % best_hit_seq_by_marker_dir)
+     if os.path.isdir(best_hit_seq_by_marker_dir_renamed) is False:
+         os.system('mkdir %s' % best_hit_seq_by_marker_dir_renamed)
+     if os.path.isdir(best_hit_aln_by_marker_dir) is False:
+         os.system('mkdir %s' % best_hit_aln_by_marker_dir)
+     if os.path.isdir(best_hit_aln_by_marker_dir_trimmed) is False:
+         os.system('mkdir %s' % best_hit_aln_by_marker_dir_trimmed)
+
+     os.system('cat %s/*.%s > %s' % (faa_file_dir, faa_file_ext, pwd_combined_protein))
+
+     processing_index = 1
+     for each_marker in best_hit_dict_by_marker:
+         print('Processing (extract sequence, align and trim) marker %s/%s: %s' % (processing_index, len(best_hit_dict_by_marker), each_marker))
+         processing_index += 1
+
+         current_m_hit_list = best_hit_dict_by_marker[each_marker]
+         marker_hits_txt = ('%s/%s.txt' % (best_hit_id_by_marker_dir, each_marker)).replace(':', '')
+         marker_hits_seq = ('%s/%s.fa' % (best_hit_seq_by_marker_dir, each_marker)).replace(':', '')
+         marker_hits_seq_renamed = ('%s/%s.fa' % (best_hit_seq_by_marker_dir_renamed, each_marker)).replace(':', '')
+         marker_hits_aln = ('%s/%s.aln' % (best_hit_aln_by_marker_dir, each_marker)).replace(':', '')
+         marker_hits_aln_trimmed = ('%s/%s.aln' % (best_hit_aln_by_marker_dir_trimmed, each_marker)).replace(':', '')
+
+         with open(marker_hits_txt, 'w') as marker_hits_txt_handle:
+             marker_hits_txt_handle.write('\n'.join(current_m_hit_list))
+
+         # extract sequences
+         select_seq(pwd_combined_protein, marker_hits_txt, 1, marker_hits_seq, True, False)
+
+         # rename sequences
+         marker_hits_seq_renamed_handle = open(marker_hits_seq_renamed, 'w')
+         for each_seq in SeqIO.parse(marker_hits_seq, 'fasta'):
+             seq_id = each_seq.id
+             seq_gnm = best_hit_to_gnm_dict[seq_id]
+             marker_hits_seq_renamed_handle.write('>%s\n' % seq_gnm)
+             marker_hits_seq_renamed_handle.write('%s\n' % str(each_seq.seq))
+         marker_hits_seq_renamed_handle.close()
+
+         # run mafft-einsi
+         mafft_cmd = 'mafft-einsi --thread %s --quiet %s > %s' % (num_of_threads, marker_hits_seq_renamed, marker_hits_aln)
+         os.system(mafft_cmd)
+
+         # trim msa
+         trim_cmd = 'trimal -in %s -out %s -automated1' % (marker_hits_aln, marker_hits_aln_trimmed)
+         if trim_with_bmge is True:
+             trim_cmd = 'java -jar %s -i %s -m %s -t AA -h %s -of %s' % (pwd_bmge_jar, marker_hits_aln, bmge_trim_model, bmge_entropy_score_cutoff, marker_hits_aln_trimmed)
+         os.system(trim_cmd)
+
+     ########## Assess marker by PA ##########
+
+     present_pct_cutoff_list = [int(i) for i in pa_cutoff_str.split(',')]
+
+     # create output dir
+     if os.path.isdir(assess_marker_pa_dir) is True:
+         os.system('rm -r %s' % assess_marker_pa_dir)
+     os.system('mkdir %s' % assess_marker_pa_dir)
+
+     AssessMarkerPA(best_hit_aln_by_marker_dir_trimmed, gnm_set, group_to_gnm_dict, present_pct_cutoff_list, assess_marker_pa_dir)
+
+     ########## concatenate marker ##########
+
+     # create output dir
+     if os.path.isdir(trimmed_msa_PA_concatenated_dir) is True:
+         os.system('rm -r %s' % trimmed_msa_PA_concatenated_dir)
+     os.system('mkdir %s' % trimmed_msa_PA_concatenated_dir)
+
+     qualified_cutoff_list = []
+     for each_c in present_pct_cutoff_list:
+         current_cutoff_trimmed_msa_dir = '%s/marker_PA%s' % (assess_marker_pa_dir, each_c)
+         trimmed_msa_re = '%s/*.aln' % current_cutoff_trimmed_msa_dir
+         trimmed_msa_list = [os.path.basename(file_name) for file_name in glob.glob(trimmed_msa_re)]
+
+         if len(trimmed_msa_list) < minimal_marker_number:
+             print('The number of qualified marker under PA cutoff %s: %s, skipped!' % (each_c, len(trimmed_msa_list)))
+         else:
+             qualified_cutoff_list.append(each_c)
+             pwd_concatenated_marker_phy = '%s/marker_pa%s.phy' % (trimmed_msa_PA_concatenated_dir, each_c)
+             pwd_concatenated_marker_fasta = '%s/marker_pa%s.fasta' % (trimmed_msa_PA_concatenated_dir, each_c)
+             pwd_concatenated_marker_partition = '%s/marker_pa%s_partition.txt' % (trimmed_msa_PA_concatenated_dir, each_c)
+             catfasta2phy(current_cutoff_trimmed_msa_dir, 'aln', pwd_concatenated_marker_phy, pwd_concatenated_marker_fasta, pwd_concatenated_marker_partition)
+
+     ########## get guide tree and C60+PMSF tree for each set of marker set ##########
+
+     iqtree_cmd_txt_handle = open(iqtree_cmd_txt, 'w')
+     iqtree_cmd_list = []
+     for each_c in qualified_cutoff_list:
+         os.system('mkdir %s/PA%s_guide_tree' % ((iqtree_dir, each_c)))
+         os.system('mkdir %s/PA%s_PMSF_C60_tree' % ((iqtree_dir, each_c)))
+
+         msa_to_use = 'marker_pa%s.fasta' % each_c
+
+         get_guide_tree_cmd = 'iqtree2 --seqtype AA -B 1000 --alrt 1000 --quiet -T %s -s %s --prefix PA%s_guide_tree/PA%s_guide_tree -m LG' % (num_of_threads, msa_to_use, each_c, each_c)
+         get_c60_tree_cmd = 'iqtree2 --seqtype AA -B 1000 --alrt 1000 --quiet -T %s -s %s --prefix PA%s_PMSF_C60_tree/PA%s_PMSF_C60 -m LG+C60+F+G -ft PA%s_guide_tree/PA%s_guide_tree.treefile' % (num_of_threads, msa_to_use, each_c, each_c, each_c, each_c)
+         cmds_in_one_line = '%s; %s' % (get_guide_tree_cmd, get_c60_tree_cmd)
+         iqtree_cmd_txt_handle.write('%s\n' % cmds_in_one_line)
+         iqtree_cmd_list.append(cmds_in_one_line)
+     iqtree_cmd_txt_handle.close()
+
+     # run iqtree
+     os.chdir(iqtree_dir)
+     for each_cmd in iqtree_cmd_list:
+         print('Running: %s' % each_cmd)
+         os.system(each_cmd)
+
+     print('Done!')
+
+
+ if __name__ == '__main__':
+
+     # initialize the options parser
+     parser = argparse.ArgumentParser()
+     parser.add_argument('-i', required=True, help='faa dir')
+     parser.add_argument('-x', required=False, default='faa', help='faa file extension, default: faa')
+     parser.add_argument('-m', required=True, help='marker seq dir, file extension need to be faa')
+     parser.add_argument('-mx', required=False, default='faa', help='marker seq file extension, default: faa')
+     parser.add_argument('-g', required=False, default=None, help='genome group')
+     parser.add_argument('-c', required=False, default='85', help='presence-absence cutoffs, default: 85')
+     parser.add_argument('-o', required=True, help='output dir')
+     parser.add_argument('-e', required=False, default='1e-30', help='e-value cutoff, default: 1e-30')
+     parser.add_argument('-t', required=True, type=int, help='num of threads')
+     parser.add_argument('-mmn', required=False, default=1, type=int, help='minimal marker number, default: 1')
+     parser.add_argument('-psiblast', required=False, action="store_true", help='run psiblast')
+     parser.add_argument('-bmge', required=False, action="store_true", help='trim MSA with BMGE, default is trimal')
+     parser.add_argument('-bmge_m', required=False, default='BLOSUM30', help='BMGE trim model, default: BLOSUM30')
+     parser.add_argument('-bmge_esc', required=False, default='0.55', help='BMGE entropy score cutoff, default: 0.55')
+     parser.add_argument('-f', required=False, action="store_true", help='force overwrite')
+     args = vars(parser.parse_args())
+     MarkerRef2Tree(args)
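Usage note (a minimal sketch, not shipped with the package): the catfasta2phy helper defined above can also be called on its own to concatenate per-marker alignments into a supermatrix. This assumes Biopython is installed; the input directory and output file names are hypothetical placeholders.

from TreeSAK.MarkerRef2Tree import catfasta2phy

# concatenate all *.aln FASTA alignments in 'trimmed_aln' into one supermatrix
catfasta2phy(
    'trimmed_aln',          # folder with one trimmed alignment per marker (hypothetical)
    'aln',                  # alignment file extension
    'concatenated.phy',     # supermatrix in relaxed PHYLIP format
    'concatenated.fasta',   # the same supermatrix in FASTA format
    'partitions.txt')       # per-marker coordinate ranges within the supermatrix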