treesak 1.53.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. TreeSAK/ALE.py +63 -0
  2. TreeSAK/ALE1.py +268 -0
  3. TreeSAK/ALE2.py +168 -0
  4. TreeSAK/ALE2RTC.py +30 -0
  5. TreeSAK/ALE3.py +205 -0
  6. TreeSAK/ALE4.py +636 -0
  7. TreeSAK/ALE5.py +210 -0
  8. TreeSAK/ALE6.py +401 -0
  9. TreeSAK/ALE7.py +126 -0
  10. TreeSAK/ALE_backup.py +1081 -0
  11. TreeSAK/AssessCVG.py +128 -0
  12. TreeSAK/AssessMarker.py +306 -0
  13. TreeSAK/AssessMarkerDeltaLL.py +257 -0
  14. TreeSAK/AssessMarkerPA.py +317 -0
  15. TreeSAK/AssessPB.py +113 -0
  16. TreeSAK/BMGE.jar +0 -0
  17. TreeSAK/BMGE.py +49 -0
  18. TreeSAK/C60SR4.nex +127 -0
  19. TreeSAK/CompareMCMC.py +138 -0
  20. TreeSAK/ConcateMSA.py +111 -0
  21. TreeSAK/ConvertMSA.py +135 -0
  22. TreeSAK/Dir.rb +82 -0
  23. TreeSAK/ExtractMarkerSeq.py +263 -0
  24. TreeSAK/FastRoot.py +1175 -0
  25. TreeSAK/FastRoot_backup.py +1122 -0
  26. TreeSAK/FigTree.py +34 -0
  27. TreeSAK/GTDB_tree.py +76 -0
  28. TreeSAK/GeneTree.py +142 -0
  29. TreeSAK/KEGG_Luo17.py +807 -0
  30. TreeSAK/LcaToLeaves.py +66 -0
  31. TreeSAK/MarkerRef2Tree.py +616 -0
  32. TreeSAK/MarkerRef2Tree_backup.py +628 -0
  33. TreeSAK/MarkerSeq2Tree.py +299 -0
  34. TreeSAK/MarkerSeq2Tree_backup.py +259 -0
  35. TreeSAK/ModifyTopo.py +116 -0
  36. TreeSAK/Newick_tree_plotter.py +79 -0
  37. TreeSAK/OMA.py +170 -0
  38. TreeSAK/OMA2.py +212 -0
  39. TreeSAK/OneLineAln.py +50 -0
  40. TreeSAK/PB.py +155 -0
  41. TreeSAK/PMSF.py +115 -0
  42. TreeSAK/PhyloBiAssoc.R +84 -0
  43. TreeSAK/PhyloBiAssoc.py +167 -0
  44. TreeSAK/PlotMCMC.py +41 -0
  45. TreeSAK/PlotMcmcNode.py +152 -0
  46. TreeSAK/PlotMcmcNode_old.py +252 -0
  47. TreeSAK/RootTree.py +101 -0
  48. TreeSAK/RootTreeGTDB.py +371 -0
  49. TreeSAK/RootTreeGTDB214.py +288 -0
  50. TreeSAK/RootTreeGTDB220.py +300 -0
  51. TreeSAK/SequentialDating.py +16 -0
  52. TreeSAK/SingleAleHGT.py +157 -0
  53. TreeSAK/SingleLinePhy.py +50 -0
  54. TreeSAK/SliceMSA.py +142 -0
  55. TreeSAK/SplitScore.py +21 -0
  56. TreeSAK/SplitScore1.py +177 -0
  57. TreeSAK/SplitScore1OMA.py +148 -0
  58. TreeSAK/SplitScore2.py +608 -0
  59. TreeSAK/TaxaCountStats.R +256 -0
  60. TreeSAK/TaxonTree.py +47 -0
  61. TreeSAK/TreeSAK_config.py +32 -0
  62. TreeSAK/VERSION +164 -0
  63. TreeSAK/VisHPD95.R +45 -0
  64. TreeSAK/VisHPD95.py +200 -0
  65. TreeSAK/__init__.py +0 -0
  66. TreeSAK/ale_parser.py +74 -0
  67. TreeSAK/ale_splitter.py +63 -0
  68. TreeSAK/alignment_pruner.pl +1471 -0
  69. TreeSAK/assessOG.py +45 -0
  70. TreeSAK/batch_itol.py +171 -0
  71. TreeSAK/catfasta2phy.py +140 -0
  72. TreeSAK/cogTree.py +185 -0
  73. TreeSAK/compare_trees.R +30 -0
  74. TreeSAK/compare_trees.py +255 -0
  75. TreeSAK/dating.py +264 -0
  76. TreeSAK/dating_ss.py +361 -0
  77. TreeSAK/deltall.py +82 -0
  78. TreeSAK/do_rrtc.rb +464 -0
  79. TreeSAK/fa2phy.py +42 -0
  80. TreeSAK/filter_rename_ar53.py +118 -0
  81. TreeSAK/format_leaf_name.py +70 -0
  82. TreeSAK/gap_stats.py +38 -0
  83. TreeSAK/get_SCG_tree.py +742 -0
  84. TreeSAK/get_arCOG_seq.py +97 -0
  85. TreeSAK/global_functions.py +222 -0
  86. TreeSAK/gnm_leaves.py +43 -0
  87. TreeSAK/iTOL.py +791 -0
  88. TreeSAK/iTOL_gene_tree.py +80 -0
  89. TreeSAK/itol_msa_stats.py +56 -0
  90. TreeSAK/keep_highest_rrtc.py +37 -0
  91. TreeSAK/koTree.py +194 -0
  92. TreeSAK/label_gene_tree_by_gnm.py +34 -0
  93. TreeSAK/label_tree.R +75 -0
  94. TreeSAK/label_tree.py +121 -0
  95. TreeSAK/mad.py +708 -0
  96. TreeSAK/mcmc2tree.py +58 -0
  97. TreeSAK/mcmcTC copy.py +92 -0
  98. TreeSAK/mcmcTC.py +104 -0
  99. TreeSAK/mcmctree_vs_reltime.R +44 -0
  100. TreeSAK/mcmctree_vs_reltime.py +252 -0
  101. TreeSAK/merge_pdf.py +32 -0
  102. TreeSAK/pRTC.py +56 -0
  103. TreeSAK/parse_mcmctree.py +198 -0
  104. TreeSAK/parse_reltime.py +141 -0
  105. TreeSAK/phy2fa.py +37 -0
  106. TreeSAK/plot_distruibution_th.py +165 -0
  107. TreeSAK/prep_mcmctree_ctl.py +92 -0
  108. TreeSAK/print_leaves.py +32 -0
  109. TreeSAK/pruneMSA.py +63 -0
  110. TreeSAK/recode.py +73 -0
  111. TreeSAK/remove_bias.R +112 -0
  112. TreeSAK/rename_leaves.py +78 -0
  113. TreeSAK/replace_clade.py +55 -0
  114. TreeSAK/root_with_out_group.py +84 -0
  115. TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
  116. TreeSAK/subsample_drep_gnms.py +74 -0
  117. TreeSAK/subset.py +69 -0
  118. TreeSAK/subset_tree_stupid_old_way.py +193 -0
  119. TreeSAK/supertree.py +330 -0
  120. TreeSAK/tmp_1.py +19 -0
  121. TreeSAK/tmp_2.py +19 -0
  122. TreeSAK/tmp_3.py +120 -0
  123. TreeSAK/tmp_4.py +43 -0
  124. TreeSAK/tmp_5.py +12 -0
  125. TreeSAK/weighted_rand.rb +23 -0
  126. treesak-1.53.3.data/scripts/TreeSAK +955 -0
  127. treesak-1.53.3.dist-info/LICENSE +674 -0
  128. treesak-1.53.3.dist-info/METADATA +27 -0
  129. treesak-1.53.3.dist-info/RECORD +131 -0
  130. treesak-1.53.3.dist-info/WHEEL +5 -0
  131. treesak-1.53.3.dist-info/top_level.txt +1 -0
@@ -0,0 +1,628 @@
1
+ import os
2
+ import glob
3
+ import argparse
4
+ from Bio import SeqIO
5
+ from Bio import AlignIO
6
+ import multiprocessing as mp
7
+ from distutils.spawn import find_executable
8
+
9
+
10
# Usage text shown to the user.
# The value of -e is passed verbatim to `blastp -evalue`, so the example has to
# spell out the E-value itself (1e-30); `-e 30` would mean an E-value of 30.
# The genome-to-group file (-g) is checked for existence by MarkerRef2Tree, so
# it is included in the example as well.
MarkerRef2Tree_usage = '''
============================= MarkerRef2Tree example commands =============================

Dependencies: blastp, mafft-einsi, trimal, iqtree

# example commands
TreeSAK MarkerRef2Tree -m marker_seq -mx fa -aa gnm_faa_files -aax faa -g gnm_group.txt -o op_dir -e 1e-30 -t 6

===========================================================================================
'''
20
+
21
+
22
def check_dependencies(program_list):
    """Exit the program unless every required executable can be found on PATH.

    Args:
        program_list: iterable of executable names (e.g. ``['blastp', 'iqtree']``).

    Raises:
        SystemExit: with a non-zero status if at least one program is missing;
            the names of the missing programs are printed first.
    """
    # shutil.which is the supported replacement for
    # distutils.spawn.find_executable (distutils was removed in Python 3.12).
    from shutil import which

    not_detected_programs = [p for p in program_list if which(p) is None]

    if not_detected_programs:
        print('%s not found, program exited!' % ','.join(not_detected_programs))
        # a missing dependency is an error, so do not report success (status 0)
        raise SystemExit(1)
32
+
33
+
34
def exe_cmds(cmd_list, num_threads):
    """Run shell commands in parallel with a pool of worker processes.

    Args:
        cmd_list: list of shell command strings, each executed via os.system.
        num_threads: number of worker processes to start.

    Commands that return a non-zero status are reported on stdout; they do not
    abort the run.
    """
    print('Running %s commands with %s cores' % (len(cmd_list), num_threads))

    # nothing to do: avoid starting worker processes for an empty list
    if not cmd_list:
        return

    # the context manager guarantees the workers are cleaned up even if map() raises
    with mp.Pool(processes=num_threads) as pool:
        exit_status_list = pool.map(os.system, cmd_list)
        pool.close()
        pool.join()

    # os.system returns the command's exit status; surface failures instead of dropping them
    failed_cmd_list = [cmd for cmd, status in zip(cmd_list, exit_status_list) if status != 0]
    if failed_cmd_list:
        print('Warning: %s of %s commands returned a non-zero exit status:' % (len(failed_cmd_list), len(cmd_list)))
        for each_failed_cmd in failed_cmd_list:
            print(each_failed_cmd)
40
+
41
+
42
def sep_path_basename_ext(file_in):
    """Split a path into its directory, base name and extension.

    Returns:
        A ``(directory, basename, extension)`` tuple. The directory is ``'.'``
        when the path has no directory part; the extension keeps its leading dot.
    """
    directory = os.path.dirname(file_in)
    if directory == '':
        directory = '.'

    stem, suffix = os.path.splitext(os.path.basename(file_in))
    return directory, stem, suffix
48
+
49
+
50
def catfasta2phy(msa_dir, msa_ext, concatenated_msa_phy, partition_file):
    """Concatenate per-marker alignments into one alignment plus a partition file.

    Args:
        msa_dir: directory holding the individual alignments (FASTA format).
        msa_ext: file extension (without the dot) of the alignments to use.
        concatenated_msa_phy: output path of the concatenated alignment in
            relaxed PHYLIP format; a FASTA copy is written to
            ``<concatenated_msa_phy>.fasta``.
        partition_file: output path of the partition file, one line per
            alignment in the form ``<file name> = <start>-<end>`` (1-based).

    Sequences are matched between alignments by their sequence id. An id that
    is absent from an alignment is filled with gaps for that alignment.

    Raises:
        SystemExit: if the sequences within one alignment file do not all have
            the same length.
    """
    concatenated_msa_fasta = '%s.fasta' % concatenated_msa_phy
    msa_file_re = '%s/*.%s' % (msa_dir, msa_ext)
    msa_file_list_sorted = sorted(os.path.basename(file_name) for file_name in glob.glob(msa_file_re))

    # read every alignment once, keeping its sequences and its length;
    # both dicts are keyed by file name, which is unique within msa_dir
    complete_gnm_set = set()
    msa_len_dict = dict()
    msa_seq_dict = dict()
    for each_msa_file in msa_file_list_sorted:
        pwd_current_msa = '%s/%s' % (msa_dir, each_msa_file)

        current_msa_len = 0
        current_msa_len_set = set()
        current_msa_seq_dict = dict()
        for each_seq in SeqIO.parse(pwd_current_msa, 'fasta'):
            complete_gnm_set.add(each_seq.id)
            current_msa_seq_dict[each_seq.id] = str(each_seq.seq)
            current_msa_len_set.add(len(each_seq.seq))
            current_msa_len = len(each_seq.seq)

        # an alignment must have exactly one sequence length
        if len(current_msa_len_set) != 1:
            print('Sequences with different length were found in %s, program exited!' % each_msa_file)
            raise SystemExit(1)

        msa_len_dict[each_msa_file] = current_msa_len
        msa_seq_dict[each_msa_file] = current_msa_seq_dict

    complete_gnm_list_sorted = sorted(complete_gnm_set)

    # write out concatenated msa, padding missing sequences with gaps
    with open(concatenated_msa_fasta, 'w') as concatenated_msa_handle:
        for each_gnm in complete_gnm_list_sorted:
            concatenated_seq = ''.join(
                msa_seq_dict[each_m].get(each_gnm, msa_len_dict[each_m] * '-')
                for each_m in msa_file_list_sorted)
            concatenated_msa_handle.write('>%s\n' % each_gnm)
            concatenated_msa_handle.write('%s\n' % concatenated_seq)

    # write out partition file
    end_pos = 0
    with open(partition_file, 'w') as partition_file_handle:
        for each_m in msa_file_list_sorted:
            current_m_len = msa_len_dict[each_m]
            partition_file_handle.write('%s = %s-%s\n' % (each_m, (end_pos + 1), (end_pos + current_m_len)))
            end_pos += current_m_len

    # convert msa in fasta to phy
    AlignIO.convert(concatenated_msa_fasta, 'fasta', concatenated_msa_phy, 'phylip-relaxed')
112
+
113
+
114
def select_seq(seq_file, id_file, select_option, output_file, one_line, in_fastq):
    """Write the sequences of seq_file that are (or are not) listed in id_file.

    Args:
        seq_file: input sequence file (FASTA, or FASTQ if ``in_fastq`` is True).
        id_file: text file with one sequence id per line.
        select_option: 1 keeps the sequences whose id is listed in id_file,
            0 keeps the sequences whose id is not listed; any other value
            produces an empty output file.
        output_file: path of the file to write.
        one_line: for FASTA output, False writes Biopython's 'fasta' format and
            anything else writes 'fasta-2line'.
        in_fastq: True if the input is FASTQ; output is FASTA only when this is
            False.
    """
    # get provided id list
    with open(id_file) as id_file_handle:
        seq_id_set = {each_line.strip() for each_line in id_file_handle}

    # the identity checks mirror how the flags were originally interpreted
    seq_in_format = 'fastq' if in_fastq is True else 'fasta'
    if in_fastq is not False:
        seq_out_format = 'fastq'
    elif one_line is False:
        seq_out_format = 'fasta'
    else:
        seq_out_format = 'fasta-2line'

    # extract sequences
    with open(output_file, 'w') as output_file_handle:
        for seq_record in SeqIO.parse(seq_file, seq_in_format):
            is_listed = seq_record.id in seq_id_set
            keep_listed = (select_option == 1) and is_listed
            keep_unlisted = (select_option == 0) and (not is_listed)
            if keep_listed or keep_unlisted:
                SeqIO.write(seq_record, output_file_handle, seq_out_format)
151
+
152
+
153
def AssessMarkerPA(trimmed_aln_dir, gnm_set, gnm_group_txt, present_pct_cutoff_list, op_dir):
    """Assess markers by their presence/absence (PA) across genome groups.

    Args:
        trimmed_aln_dir: directory with one ``<marker>.aln`` FASTA alignment per
            marker; sequence ids are genome ids.
        gnm_set: set of genome ids included in the analysis.
        gnm_group_txt: tab-separated file, genome id in column 1 and group name
            in column 2. Blank lines and lines with fewer than two columns are
            skipped.
        present_pct_cutoff_list: list of percentage cutoffs. A marker qualifies
            for a cutoff when its presence percentage is not below the cutoff
            in every group.
        op_dir: existing directory that receives all outputs.

    Outputs written to op_dir:
        assessment_PA.txt: per-marker presence percentage in each group.
        assessment_PA_summary.txt: per-marker 1/0 flag for each cutoff and a
            final 'Total' line with the number of qualified markers per cutoff.
        qualified_marker_PA_<cutoff>/: copies of the qualified alignments.
        qualified_marker_PA_<cutoff>_id.txt: ids of the qualified markers.
        assessment_PA_iTOL_binary.txt: iTOL binary dataset of marker presence.

    Raises:
        SystemExit: if a genome in gnm_set has no group, or an alignment
            contains a sequence id that has no group.
    """
    import shutil

    aln_suffix = '.aln'

    # read in genome metadata, restricted to the genomes under analysis
    group_to_gnm_dict = dict()
    gnm_to_group_dict = dict()
    with open(gnm_group_txt) as gnm_group_txt_handle:
        for each_gnm in gnm_group_txt_handle:
            each_gnm_split = each_gnm.strip().split('\t')
            if len(each_gnm_split) < 2:
                continue
            gnm_id = each_gnm_split[0]
            domain_name = each_gnm_split[1]
            if gnm_id in gnm_set:
                gnm_to_group_dict[gnm_id] = domain_name
                group_to_gnm_dict.setdefault(domain_name, set()).add(gnm_id)

    group_id_list_sorted = sorted(group_to_gnm_dict)

    # exit program if group information is missing
    gnms_without_group_info = {gnm for gnm in gnm_set if gnm not in gnm_to_group_dict}
    if len(gnms_without_group_info) > 0:
        print('Group information for the following genomes are missing from %s, program exited!' % gnm_group_txt)
        print(','.join(sorted(gnms_without_group_info)))
        print('Group information for the above genomes are missing from %s, program exited!' % gnm_group_txt)
        raise SystemExit(1)

    # define output files
    assess_summary_1_txt = '%s/assessment_PA.txt' % op_dir
    assess_summary_2_txt = '%s/assessment_PA_summary.txt' % op_dir
    itol_binary_txt = '%s/assessment_PA_iTOL_binary.txt' % op_dir

    # sorted, so the row order of the reports is reproducible
    trimmed_aln_file_re = '%s/*.aln' % (trimmed_aln_dir)
    trimmed_aln_file_list = sorted(os.path.basename(file_name) for file_name in glob.glob(trimmed_aln_file_re))

    # every cutoff starts with an EMPTY set of qualified markers; seeding the
    # set with the current marker would count the first processed marker as
    # qualified no matter how it scored
    cutoff_to_qualified_marker_dict = {each_cutoff: set() for each_cutoff in present_pct_cutoff_list}
    gnm_to_identified_marker_dict = dict()
    marker_id_list = []
    with open(assess_summary_1_txt, 'w') as assess_summary_1_txt_handle, \
            open(assess_summary_2_txt, 'w') as assess_summary_2_txt_handle:
        assess_summary_1_txt_handle.write('Marker\t%s\n' % '\t'.join([str(i) for i in group_id_list_sorted]))
        assess_summary_2_txt_handle.write('Marker\t%s\n' % '\t'.join([str(i) for i in present_pct_cutoff_list]))

        for each_aln in trimmed_aln_file_list:

            # strip only the trailing '.aln' so that the id maps back to the file name
            marker_id = each_aln[:-len(aln_suffix)]
            marker_id_list.append(marker_id)
            pwd_aln = '%s/%s' % (trimmed_aln_dir, each_aln)

            current_marker_num_by_group_dict = dict()
            for each_seq in SeqIO.parse(pwd_aln, 'fasta'):
                gnm_id = each_seq.id

                # get genome to marker dict
                gnm_to_identified_marker_dict.setdefault(gnm_id, set()).add(marker_id)

                if gnm_id not in gnm_to_group_dict:
                    print('Not all genomes used to generate the MSA being found in -aa, program exited!')
                    raise SystemExit(1)

                gnm_group = gnm_to_group_dict[gnm_id]
                current_marker_num_by_group_dict[gnm_group] = current_marker_num_by_group_dict.get(gnm_group, 0) + 1

            # write out assess_summary_1_txt
            pct_list = []
            for each_grp in group_id_list_sorted:
                grp_pct = current_marker_num_by_group_dict.get(each_grp, 0)*100/len(group_to_gnm_dict[each_grp])
                grp_pct = float("{0:.2f}".format(grp_pct))
                pct_list.append(grp_pct)
            assess_summary_1_txt_handle.write('%s\t%s\n' % (marker_id, '\t'.join([str(i) for i in pct_list])))

            # write out assess_summary_2_txt
            assess_list = []
            for each_cutoff in present_pct_cutoff_list:
                good_marker = all(each_pct >= each_cutoff for each_pct in pct_list)
                if good_marker is True:
                    assess_list.append('1')
                    cutoff_to_qualified_marker_dict[each_cutoff].add(marker_id)
                else:
                    assess_list.append('0')
            assess_summary_2_txt_handle.write('%s\t%s\n' % (marker_id, '\t'.join(assess_list)))

        # write out total in assess_summary_2_txt
        total_stats_list = [str(len(cutoff_to_qualified_marker_dict[each_c])) for each_c in present_pct_cutoff_list]
        assess_summary_2_txt_handle.write('Total\t%s\n' % ('\t'.join(total_stats_list)))

    # copy alignments of qualified marker to corresponding folders
    for each_cutoff in cutoff_to_qualified_marker_dict:
        qualified_marker_list = sorted(cutoff_to_qualified_marker_dict[each_cutoff])
        pwd_qualified_marker_dir = '%s/qualified_marker_PA_%s' % (op_dir, each_cutoff)
        pwd_qualified_marker_id_txt = '%s/qualified_marker_PA_%s_id.txt' % (op_dir, each_cutoff)

        os.makedirs(pwd_qualified_marker_dir, exist_ok=True)
        for each_marker in qualified_marker_list:
            pwd_marker_aln = '%s/%s.aln' % (trimmed_aln_dir, each_marker)
            shutil.copy(pwd_marker_aln, pwd_qualified_marker_dir)

        # write out id
        with open(pwd_qualified_marker_id_txt, 'w') as pwd_qualified_marker_id_txt_handle:
            pwd_qualified_marker_id_txt_handle.write(''.join('%s\n' % m for m in qualified_marker_list))

    # write out iTOL file
    marker_id_list_sorted = sorted(marker_id_list)
    with open(itol_binary_txt, 'w') as itol_binary_txt_handle:
        itol_binary_txt_handle.write('DATASET_BINARY\n\nSEPARATOR TAB\nDATASET_LABEL\tlabel1\nCOLOR\t#85C1E9\n')
        itol_binary_txt_handle.write('SHOW_LABELS\t1\nLABEL_ROTATION\t45\nLABEL_SHIFT\t5\n')
        itol_binary_txt_handle.write('FIELD_LABELS\t%s\n' % '\t'.join(marker_id_list_sorted))
        itol_binary_txt_handle.write('FIELD_SHAPES\t%s\n' % '\t'.join(['1']*len(marker_id_list)))
        itol_binary_txt_handle.write('\nDATA\n')
        for each_g in gnm_to_identified_marker_dict:
            g_identified_marker_set = gnm_to_identified_marker_dict[each_g]
            pa_list = ['1' if each_m in g_identified_marker_set else '-1' for each_m in marker_id_list_sorted]
            itol_binary_txt_handle.write('%s\t%s\n' % (each_g, '\t'.join(pa_list)))

    print('Assessment results exported to:\n%s\n%s' % (assess_summary_1_txt, assess_summary_2_txt))
301
+
302
+
303
def MarkerRef2Tree(args):
    """Identify markers in genomes by blastp, then align, trim, assess and concatenate them.

    Args:
        args: dict of options with the keys
            'm'/'mx': marker sequence directory and file extension,
            'aa'/'aax': genome protein (faa) directory and file extension,
            'g': genome-to-group file, 'o': output directory,
            'e': value passed verbatim to ``blastp -evalue``,
            't': number of threads, 'f': force overwrite of the output directory,
            'jst': cpus requested in the generated job scripts,
            'pac': dash-separated presence cutoffs (e.g. '0-50-75-100'),
            'skip_align_trim': skip extraction, alignment and trimming,
            'qsub': submit the generated iqtree job scripts,
            'mmn': minimal number of qualified markers needed to concatenate.

    Steps: run blastp for every genome/marker pair, take the hit with the
    highest bit score as the marker copy of that genome, align (mafft-einsi)
    and trim (trimal) each marker, assess the markers by presence/absence,
    concatenate the qualified markers per cutoff, and write job scripts for
    iqtree and DeltaLL.

    Raises:
        SystemExit: if a dependency or the genome-to-group file is missing.
    """
    import shutil

    def _recreate_dir(dir_path):
        # start a step from an empty directory
        if os.path.isdir(dir_path) is True:
            shutil.rmtree(dir_path)
        os.makedirs(dir_path)

    marker_seq_dir        = args['m']
    marker_seq_ext        = args['mx']
    faa_file_dir          = args['aa']
    faa_file_ext          = args['aax']
    gnm_group_txt         = args['g']
    op_dir                = args['o']
    e_value               = args['e']
    num_of_threads        = args['t']
    force_overwrite       = args['f']
    js_cpu_num            = args['jst']
    pa_cutoff_str         = args['pac']
    skip_align_trim       = args['skip_align_trim']
    submit_job            = args['qsub']
    minimal_marker_number = args['mmn']

    # check dependencies
    check_dependencies(['blastp', 'mafft-einsi', 'trimal', 'iqtree'])

    # check input files
    if os.path.isfile(gnm_group_txt) is False:
        print('%s not found, program exited!' % gnm_group_txt)
        raise SystemExit(1)

    # get marker id list (sorted, so that commands are generated in a stable order)
    marker_seq_re = '%s/*.%s' % (marker_seq_dir, marker_seq_ext)
    marker_id_set = set()
    for each_marker_seq_file in glob.glob(marker_seq_re):
        _, marker_seq_basename, _ = sep_path_basename_ext(each_marker_seq_file)
        marker_id_set.add(marker_seq_basename)
    marker_id_list_sorted = sorted(marker_id_set)

    # get gnm id list
    faa_file_re = '%s/*.%s' % (faa_file_dir, faa_file_ext)
    gnm_set = set()
    for each_faa_file in glob.glob(faa_file_re):
        _, faa_basename, _ = sep_path_basename_ext(each_faa_file)
        gnm_set.add(faa_basename)
    gnm_id_list_sorted = sorted(gnm_set)

    # define output dir
    blastp_cmd_txt                      = '%s/blastp_cmds_%s.txt'                   % (op_dir, (len(gnm_id_list_sorted)*len(marker_id_set)))
    pwd_combined_protein                = '%s/combined.faa'                         % op_dir
    blast_op_dir                        = '%s/s01_blast_op'                         % op_dir
    best_hit_id_by_marker_dir           = '%s/s02_identified_marker_id'             % op_dir
    best_hit_seq_by_marker_dir          = '%s/s03_identified_marker_seq'            % op_dir
    best_hit_seq_by_marker_dir_renamed  = '%s/s04_identified_marker_seq_renamed'    % op_dir
    best_hit_aln_by_marker_dir          = '%s/s05_identified_marker_aln'            % op_dir
    best_hit_aln_by_marker_dir_trimmed  = '%s/s06_identified_marker_aln_trimmed'    % op_dir
    assess_marker_pa_dir                = '%s/s07_assess_marker_PA'                 % op_dir
    trimmed_msa_PA_concatenated_dir     = '%s/s08_marker_after_PA_concatenated'     % op_dir
    iqtree_dir                          = '%s/s09_iqtree_for_deltaLL'               % op_dir
    deltall_dir                         = '%s/s10_assess_marker_deltaLL'            % op_dir

    # create folder; an existing output folder is only removed with force overwrite
    if (force_overwrite is True) and (os.path.isdir(op_dir) is True):
        shutil.rmtree(op_dir)
    os.makedirs(blast_op_dir, exist_ok=True)

    # combine the protein files of all genomes
    with open(pwd_combined_protein, 'wb') as combined_protein_handle:
        for each_faa in sorted(glob.glob(faa_file_re)):
            # never feed the combined file into itself
            if os.path.abspath(each_faa) == os.path.abspath(pwd_combined_protein):
                continue
            with open(each_faa, 'rb') as each_faa_handle:
                shutil.copyfileobj(each_faa_handle, combined_protein_handle)

    # get blastp command
    blast_op_to_cmd_dict = dict()
    with open(blastp_cmd_txt, 'w') as blastp_cmd_txt_handle:
        for gnm_id in gnm_id_list_sorted:
            for each_cog in marker_id_list_sorted:
                pwd_blast_op = '%s/%s_vs_%s_blastp.txt' % (blast_op_dir, gnm_id, each_cog)
                # the query file uses the extension given by -aax, not a fixed '.faa'
                blastp_cmd = 'blastp -subject %s/%s.%s -evalue %s -outfmt 6 -query %s/%s.%s -out %s' % (marker_seq_dir, each_cog, marker_seq_ext, e_value, faa_file_dir, gnm_id, faa_file_ext, pwd_blast_op)
                blast_op_to_cmd_dict[pwd_blast_op] = blastp_cmd
                blastp_cmd_txt_handle.write(blastp_cmd + '\n')
    blast_cmd_list = list(blast_op_to_cmd_dict.values())

    # run blastp; without force overwrite only the missing outputs are (re)computed
    if force_overwrite is True:
        exe_cmds(blast_cmd_list, num_of_threads)
    else:
        cmds_to_rerun = []
        num_of_good_ones = 0
        for each_blast_op in blast_op_to_cmd_dict:
            if os.path.isfile(each_blast_op) is True:
                num_of_good_ones += 1
            else:
                cmds_to_rerun.append(blast_op_to_cmd_dict[each_blast_op])

        print('Detected blastp outputs: %s' % num_of_good_ones)
        exe_cmds(cmds_to_rerun, num_of_threads)

    # get best_hit_dict_by_marker
    best_hit_to_gnm_dict = dict()
    best_hit_dict_by_marker = dict()
    for gnm_id in gnm_id_list_sorted:
        for each_cog in marker_id_list_sorted:
            current_blastp_op = '%s/%s_vs_%s_blastp.txt' % (blast_op_dir, gnm_id, each_cog)
            if os.path.isfile(current_blastp_op) is False:
                continue

            # get best hit: the query (genome protein) with the highest bit score
            best_hit_gene = ''
            best_hit_score = 0
            with open(current_blastp_op) as current_blastp_op_handle:
                for each_line in current_blastp_op_handle:
                    each_line_split = each_line.strip().split('\t')
                    # outfmt 6 has 12 columns; skip blank or truncated lines
                    if len(each_line_split) < 12:
                        continue
                    query_id = each_line_split[0]
                    bit_score = float(each_line_split[11])
                    if bit_score > best_hit_score:
                        best_hit_score = bit_score
                        best_hit_gene = query_id

            # only genomes with a hit contribute a sequence to the marker
            if best_hit_gene != '':
                best_hit_to_gnm_dict[best_hit_gene] = gnm_id
                best_hit_dict_by_marker.setdefault(each_cog, []).append(best_hit_gene)

    # create output dir
    for each_dir in [best_hit_id_by_marker_dir, best_hit_seq_by_marker_dir, best_hit_seq_by_marker_dir_renamed,
                     best_hit_aln_by_marker_dir, best_hit_aln_by_marker_dir_trimmed]:
        os.makedirs(each_dir, exist_ok=True)

    # write out best hits and extract sequences
    if skip_align_trim is True:
        print('Skipping the extraction, alignment and trimming of markers')
    else:
        processing_index = 1
        for each_marker in best_hit_dict_by_marker:
            print('Processing (extract, align and trim) marker %s/%s: %s' % (processing_index, len(best_hit_dict_by_marker), each_marker))
            processing_index += 1

            current_m_hit_list      = best_hit_dict_by_marker[each_marker]
            marker_hits_txt         = ('%s/%s.txt' % (best_hit_id_by_marker_dir, each_marker)).replace(':', '')
            marker_hits_seq         = ('%s/%s.fa'  % (best_hit_seq_by_marker_dir, each_marker)).replace(':', '')
            marker_hits_seq_renamed = ('%s/%s.fa'  % (best_hit_seq_by_marker_dir_renamed, each_marker)).replace(':', '')
            marker_hits_aln         = ('%s/%s.aln' % (best_hit_aln_by_marker_dir, each_marker)).replace(':', '')
            marker_hits_aln_trimmed = ('%s/%s.aln' % (best_hit_aln_by_marker_dir_trimmed, each_marker)).replace(':', '')

            with open(marker_hits_txt, 'w') as marker_hits_txt_handle:
                marker_hits_txt_handle.write('\n'.join(current_m_hit_list))

            # extract sequences
            select_seq(pwd_combined_protein, marker_hits_txt, 1, marker_hits_seq, True, False)

            # rename sequences: label each hit with the genome it came from
            with open(marker_hits_seq_renamed, 'w') as marker_hits_seq_renamed_handle:
                for each_seq in SeqIO.parse(marker_hits_seq, 'fasta'):
                    seq_gnm = best_hit_to_gnm_dict[each_seq.id]
                    marker_hits_seq_renamed_handle.write('>%s\n' % seq_gnm)
                    marker_hits_seq_renamed_handle.write('%s\n' % str(each_seq.seq))

            # run mafft-einsi
            mafft_cmd = 'mafft-einsi --thread %s --quiet %s > %s' % (num_of_threads, marker_hits_seq_renamed, marker_hits_aln)
            os.system(mafft_cmd)

            # trim msa
            trimal_cmd = 'trimal -in %s -out %s -automated1' % (marker_hits_aln, marker_hits_aln_trimmed)
            os.system(trimal_cmd)

    ########## Assess marker by PA ##########

    present_pct_cutoff_list = [int(i) for i in pa_cutoff_str.split('-')]

    _recreate_dir(assess_marker_pa_dir)
    AssessMarkerPA(best_hit_aln_by_marker_dir_trimmed, gnm_set, gnm_group_txt, present_pct_cutoff_list, assess_marker_pa_dir)

    ########## concatenate marker set ##########

    _recreate_dir(trimmed_msa_PA_concatenated_dir)

    qualified_cutoff_list = []
    for each_cutoff in present_pct_cutoff_list:
        current_cutoff_trimmed_msa_dir = '%s/qualified_marker_PA_%s' % (assess_marker_pa_dir, each_cutoff)
        trimmed_msa_re = '%s/*.aln' % current_cutoff_trimmed_msa_dir
        trimmed_msa_list = [os.path.basename(file_name) for file_name in glob.glob(trimmed_msa_re)]

        if len(trimmed_msa_list) < minimal_marker_number:
            print('The number of qualified marker under PA cutoff %s: %s, skipped!' % (each_cutoff, len(trimmed_msa_list)))
        else:
            qualified_cutoff_list.append(each_cutoff)
            pwd_concatenated_marker_phy       = '%s/qualified_marker_PA_%s_concatenated.phy'           % (trimmed_msa_PA_concatenated_dir, each_cutoff)
            pwd_concatenated_marker_partition = '%s/qualified_marker_PA_%s_concatenated_partition.txt' % (trimmed_msa_PA_concatenated_dir, each_cutoff)
            catfasta2phy(current_cutoff_trimmed_msa_dir, 'aln', pwd_concatenated_marker_phy, pwd_concatenated_marker_partition)

    ########## get guide tree and C60+PMSF tree for each set of marker set ##########

    _recreate_dir(iqtree_dir)

    # job scripts need absolute paths; os.path.abspath also copes with an
    # absolute output directory, which prefixing os.getcwd() did not
    current_dir                         = os.getcwd()
    iqtree_dir_abs                      = os.path.abspath(iqtree_dir)
    deltall_dir_abs                     = os.path.abspath(deltall_dir)
    assess_marker_pa_dir_abs            = os.path.abspath(assess_marker_pa_dir)
    trimmed_msa_PA_concatenated_dir_abs = os.path.abspath(trimmed_msa_PA_concatenated_dir)

    for each_qualified_cutoff in qualified_cutoff_list:

        # create output dir
        get_guide_tree_wd    = '%s/PA_%s_guide_tree'    % (iqtree_dir_abs, each_qualified_cutoff)
        get_c60_pmsf_tree_wd = '%s/PA_%s_C60_PMSF_tree' % (iqtree_dir_abs, each_qualified_cutoff)
        os.makedirs(get_guide_tree_wd, exist_ok=True)
        os.makedirs(get_c60_pmsf_tree_wd, exist_ok=True)

        # define file name
        pwd_concatenated_marker_phy = '%s/qualified_marker_PA_%s_concatenated.phy' % (trimmed_msa_PA_concatenated_dir_abs, each_qualified_cutoff)
        pwd_guide_tree              = '%s/guide_tree.treefile' % get_guide_tree_wd
        get_guide_tree_cmd          = 'iqtree -s %s --prefix %s/guide_tree --seqtype AA -m LG -T %s -B 1000 --alrt 1000 --quiet' % (pwd_concatenated_marker_phy, get_guide_tree_wd, js_cpu_num)
        get_c60_tree_cmd            = 'iqtree -s %s --prefix %s/concatenated --seqtype AA -m LG+G+F+C60 -T %s -B 1000 --alrt 1000 --quiet -ft %s' % (pwd_concatenated_marker_phy, get_c60_pmsf_tree_wd, js_cpu_num, pwd_guide_tree)

        # write out job script
        js_iqtree = '%s/js_iqtree_%s.sh' % (iqtree_dir, each_qualified_cutoff)
        with open(js_iqtree, 'w') as js_iqtree_handle:
            js_iqtree_handle.write('#!/bin/bash\n#SBATCH --ntasks 1\n#SBATCH --cpus-per-task %s\n' % js_cpu_num)
            js_iqtree_handle.write('cd %s\n' % get_guide_tree_wd)
            js_iqtree_handle.write('%s\n' % get_guide_tree_cmd)
            js_iqtree_handle.write('cd %s\n' % get_c60_pmsf_tree_wd)
            js_iqtree_handle.write('%s\n' % get_c60_tree_cmd)

        # submit job script
        print('Commands for running iqtree exported to: %s' % js_iqtree)
        if submit_job is True:
            os.chdir(iqtree_dir)
            try:
                os.system('qsub js_iqtree_%s.sh' % each_qualified_cutoff)
            finally:
                # always return to the starting directory, relative paths depend on it
                os.chdir(current_dir)

    ########## provide commands for running DeltaLL ##########

    _recreate_dir(deltall_dir)

    print('Suggested commands for running DeltaLL')
    for each_qualified_cutoff in qualified_cutoff_list:
        pwd_js         = '%s/js_deltall_PA%s.sh' % (deltall_dir_abs, each_qualified_cutoff)
        deltaLL_wd     = '%s/PA_%s' % (deltall_dir_abs, each_qualified_cutoff)
        msa_dir        = '%s/qualified_marker_PA_%s' % (assess_marker_pa_dir_abs, each_qualified_cutoff)
        pwd_tree_file  = '%s/PA_%s_C60_PMSF_tree/concatenated.treefile' % (iqtree_dir_abs, each_qualified_cutoff)
        deltaLL_stdout = 'PA_%s_stdout.txt' % (each_qualified_cutoff)
        deltall_cmd    = 'ruby %s --force --cpu %s -T %s --outdir %s --indir %s -t %s --outgrp_file %s --taxa %s > %s' % ('$deltaLL_rb', js_cpu_num, js_cpu_num, deltaLL_wd, msa_dir, pwd_tree_file, '$outgrp_file', '$taxa_list_txt', deltaLL_stdout)

        # NOTE(review): the ruby library and script locations written below are
        # site-specific absolute paths; the generated script is a template that
        # needs editing on other systems.
        with open(pwd_js, 'w') as pwd_js_handle:
            pwd_js_handle.write('#!/bin/bash\n#SBATCH --ntasks 1\n#SBATCH --cpus-per-task %s\n\n' % js_cpu_num)
            pwd_js_handle.write('source activate ruby\n')
            pwd_js_handle.write('RUBYLIB=$RUBYLIB:/home-user/wzsong/Software/ruby_lib_sswang/\n')
            pwd_js_handle.write('export RUBYLIB\n')
            pwd_js_handle.write('cd %s\n' % deltall_dir_abs)
            pwd_js_handle.write('outgrp_file="deltaLL_outgroup.txt"\n')
            pwd_js_handle.write('taxa_list_txt="deltaLL_eu_taxa_list.txt"\n')
            pwd_js_handle.write('deltaLL_rb="/home-user/wzsong/Scripts/deltaLL.rb"\n')
            pwd_js_handle.write(deltall_cmd + '\n')

    ##########################################################

    print('Done!')
591
+
592
+
593
if __name__ == '__main__':

    # command-line options, listed in the order they appear in the help text
    option_specs = [
        ('-m',               dict(required=True,                                help='marker seq dir')),
        ('-mx',              dict(required=True,                                help='marker seq ext')),
        ('-aa',              dict(required=True,                                help='faa file dir')),
        ('-aax',             dict(required=True,                                help='faa file ext')),
        ('-g',               dict(required=True,                                help='genome group')),
        ('-pac',             dict(required=False, default='0-50-75-100',        help='cutoffs, default: 0-50-75-100')),
        ('-o',               dict(required=True,                                help='output dir')),
        ('-e',               dict(required=False, default='1e-30',              help='e-value cutoff, default: 1e-30')),
        ('-t',               dict(required=True,  type=int,                     help='num of threads')),
        ('-skip_align_trim', dict(required=False, action="store_true",          help='skip extracting, aligning and trimming markers')),
        ('-mmn',             dict(required=False, default=10, type=int,         help='minimal marker number, default: 10')),
        ('-jst',             dict(required=False, default='6',                  help='threads to request in job script, for running iqtree')),
        ('-qsub',            dict(required=False, action="store_true",          help='submit job scripts')),
        ('-f',               dict(required=False, action="store_true",          help='force overwrite')),
    ]

    # build the parser from the table above
    cli_parser = argparse.ArgumentParser()
    for option_flag, option_kwargs in option_specs:
        cli_parser.add_argument(option_flag, **option_kwargs)

    # run the pipeline with the parsed options as a plain dict
    MarkerRef2Tree(vars(cli_parser.parse_args()))
613
+
614
+
615
+ '''
616
+
617
+ conda activate mypy3env
618
+ cd /home-user/wzsong/DateArTree
619
+ python3 MarkerRef2Tree.py -m Marker_set_2_Betts_2018_29_arCOG -mx fa -aa /home-user/wzsong/DateArTree/01_genome_selection_Prokka/d__Archaea_o_rs_133_gnms_plus_27_mito_faa_files -aax faa -o Marker_set_2_Betts_2018_29_arCOG_Marker2Tree_e30_demo -e 30 -t 24 -pl /home-user/wzsong/Scripts/catfasta2phyml.pl
620
+ submitHPC.sh --cmd "python3 MarkerRef2Tree.py -m Marker_set_2_Betts_2018_29_arCOG -mx fa -aa /home-user/wzsong/DateArTree/01_genome_selection_Prokka/d__Archaea_o_rs_133_gnms_plus_27_mito_faa_files -aax faa -g /home-user/wzsong/DateArTree/gnm_group.txt -o Marker_set_2_Betts_2018_29_arCOG_Marker2Tree_e30_demo -e 30 -t 24 -pl /home-user/wzsong/Scripts/catfasta2phyml.pl" -n 24 -c Marker_set_2_Betts_2018_29_arCOG_Marker2Tree_e30_demo
621
+
622
+ cd /home-user/wzsong/DateArTree
623
+ python3 MarkerRef2Tree.py -m Marker_set_2_Betts_2018_29_arCOG -mx fa -aa /home-user/wzsong/DateArTree/01_genome_selection_Prokka/d__Archaea_o_rs_133_gnms_plus_27_mito_faa_files -aax faa -g /home-user/wzsong/DateArTree/gnm_group.txt -o Marker_set_2_Betts_2018_29_arCOG_Marker2Tree_e30_demo -e 30 -t 12 -pl /home-user/wzsong/Scripts/catfasta2phyml.pl -g gnm_group.txt -skip_align_trim -jst 6 -qsub
624
+
625
+ cd /Users/songweizhi/Desktop/demo
626
+ python3 /Users/songweizhi/PycharmProjects/TreeSAK/TreeSAK/MarkerRef2Tree.py -m Marker_set_2_Betts_2018_29_arCOG -mx fa -aa d__Archaea_o_rs_133_gnms_plus_27_mito_faa_files -aax faa -g gnm_group.txt -o Marker_set_2_Betts_2018_29_arCOG_Marker2Tree_e30_demo -e 30 -t 10 -pl /Users/songweizhi/Scripts/catfasta2phyml.pl -g gnm_group.txt -skip_align_trim -jst 6 -qsub
627
+
628
+ '''