treesak 1.53.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. TreeSAK/ALE.py +63 -0
  2. TreeSAK/ALE1.py +268 -0
  3. TreeSAK/ALE2.py +168 -0
  4. TreeSAK/ALE2RTC.py +30 -0
  5. TreeSAK/ALE3.py +205 -0
  6. TreeSAK/ALE4.py +636 -0
  7. TreeSAK/ALE5.py +210 -0
  8. TreeSAK/ALE6.py +401 -0
  9. TreeSAK/ALE7.py +126 -0
  10. TreeSAK/ALE_backup.py +1081 -0
  11. TreeSAK/AssessCVG.py +128 -0
  12. TreeSAK/AssessMarker.py +306 -0
  13. TreeSAK/AssessMarkerDeltaLL.py +257 -0
  14. TreeSAK/AssessMarkerPA.py +317 -0
  15. TreeSAK/AssessPB.py +113 -0
  16. TreeSAK/BMGE.jar +0 -0
  17. TreeSAK/BMGE.py +49 -0
  18. TreeSAK/C60SR4.nex +127 -0
  19. TreeSAK/CompareMCMC.py +138 -0
  20. TreeSAK/ConcateMSA.py +111 -0
  21. TreeSAK/ConvertMSA.py +135 -0
  22. TreeSAK/Dir.rb +82 -0
  23. TreeSAK/ExtractMarkerSeq.py +263 -0
  24. TreeSAK/FastRoot.py +1175 -0
  25. TreeSAK/FastRoot_backup.py +1122 -0
  26. TreeSAK/FigTree.py +34 -0
  27. TreeSAK/GTDB_tree.py +76 -0
  28. TreeSAK/GeneTree.py +142 -0
  29. TreeSAK/KEGG_Luo17.py +807 -0
  30. TreeSAK/LcaToLeaves.py +66 -0
  31. TreeSAK/MarkerRef2Tree.py +616 -0
  32. TreeSAK/MarkerRef2Tree_backup.py +628 -0
  33. TreeSAK/MarkerSeq2Tree.py +299 -0
  34. TreeSAK/MarkerSeq2Tree_backup.py +259 -0
  35. TreeSAK/ModifyTopo.py +116 -0
  36. TreeSAK/Newick_tree_plotter.py +79 -0
  37. TreeSAK/OMA.py +170 -0
  38. TreeSAK/OMA2.py +212 -0
  39. TreeSAK/OneLineAln.py +50 -0
  40. TreeSAK/PB.py +155 -0
  41. TreeSAK/PMSF.py +115 -0
  42. TreeSAK/PhyloBiAssoc.R +84 -0
  43. TreeSAK/PhyloBiAssoc.py +167 -0
  44. TreeSAK/PlotMCMC.py +41 -0
  45. TreeSAK/PlotMcmcNode.py +152 -0
  46. TreeSAK/PlotMcmcNode_old.py +252 -0
  47. TreeSAK/RootTree.py +101 -0
  48. TreeSAK/RootTreeGTDB.py +371 -0
  49. TreeSAK/RootTreeGTDB214.py +288 -0
  50. TreeSAK/RootTreeGTDB220.py +300 -0
  51. TreeSAK/SequentialDating.py +16 -0
  52. TreeSAK/SingleAleHGT.py +157 -0
  53. TreeSAK/SingleLinePhy.py +50 -0
  54. TreeSAK/SliceMSA.py +142 -0
  55. TreeSAK/SplitScore.py +21 -0
  56. TreeSAK/SplitScore1.py +177 -0
  57. TreeSAK/SplitScore1OMA.py +148 -0
  58. TreeSAK/SplitScore2.py +608 -0
  59. TreeSAK/TaxaCountStats.R +256 -0
  60. TreeSAK/TaxonTree.py +47 -0
  61. TreeSAK/TreeSAK_config.py +32 -0
  62. TreeSAK/VERSION +164 -0
  63. TreeSAK/VisHPD95.R +45 -0
  64. TreeSAK/VisHPD95.py +200 -0
  65. TreeSAK/__init__.py +0 -0
  66. TreeSAK/ale_parser.py +74 -0
  67. TreeSAK/ale_splitter.py +63 -0
  68. TreeSAK/alignment_pruner.pl +1471 -0
  69. TreeSAK/assessOG.py +45 -0
  70. TreeSAK/batch_itol.py +171 -0
  71. TreeSAK/catfasta2phy.py +140 -0
  72. TreeSAK/cogTree.py +185 -0
  73. TreeSAK/compare_trees.R +30 -0
  74. TreeSAK/compare_trees.py +255 -0
  75. TreeSAK/dating.py +264 -0
  76. TreeSAK/dating_ss.py +361 -0
  77. TreeSAK/deltall.py +82 -0
  78. TreeSAK/do_rrtc.rb +464 -0
  79. TreeSAK/fa2phy.py +42 -0
  80. TreeSAK/filter_rename_ar53.py +118 -0
  81. TreeSAK/format_leaf_name.py +70 -0
  82. TreeSAK/gap_stats.py +38 -0
  83. TreeSAK/get_SCG_tree.py +742 -0
  84. TreeSAK/get_arCOG_seq.py +97 -0
  85. TreeSAK/global_functions.py +222 -0
  86. TreeSAK/gnm_leaves.py +43 -0
  87. TreeSAK/iTOL.py +791 -0
  88. TreeSAK/iTOL_gene_tree.py +80 -0
  89. TreeSAK/itol_msa_stats.py +56 -0
  90. TreeSAK/keep_highest_rrtc.py +37 -0
  91. TreeSAK/koTree.py +194 -0
  92. TreeSAK/label_gene_tree_by_gnm.py +34 -0
  93. TreeSAK/label_tree.R +75 -0
  94. TreeSAK/label_tree.py +121 -0
  95. TreeSAK/mad.py +708 -0
  96. TreeSAK/mcmc2tree.py +58 -0
  97. TreeSAK/mcmcTC copy.py +92 -0
  98. TreeSAK/mcmcTC.py +104 -0
  99. TreeSAK/mcmctree_vs_reltime.R +44 -0
  100. TreeSAK/mcmctree_vs_reltime.py +252 -0
  101. TreeSAK/merge_pdf.py +32 -0
  102. TreeSAK/pRTC.py +56 -0
  103. TreeSAK/parse_mcmctree.py +198 -0
  104. TreeSAK/parse_reltime.py +141 -0
  105. TreeSAK/phy2fa.py +37 -0
  106. TreeSAK/plot_distruibution_th.py +165 -0
  107. TreeSAK/prep_mcmctree_ctl.py +92 -0
  108. TreeSAK/print_leaves.py +32 -0
  109. TreeSAK/pruneMSA.py +63 -0
  110. TreeSAK/recode.py +73 -0
  111. TreeSAK/remove_bias.R +112 -0
  112. TreeSAK/rename_leaves.py +78 -0
  113. TreeSAK/replace_clade.py +55 -0
  114. TreeSAK/root_with_out_group.py +84 -0
  115. TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
  116. TreeSAK/subsample_drep_gnms.py +74 -0
  117. TreeSAK/subset.py +69 -0
  118. TreeSAK/subset_tree_stupid_old_way.py +193 -0
  119. TreeSAK/supertree.py +330 -0
  120. TreeSAK/tmp_1.py +19 -0
  121. TreeSAK/tmp_2.py +19 -0
  122. TreeSAK/tmp_3.py +120 -0
  123. TreeSAK/tmp_4.py +43 -0
  124. TreeSAK/tmp_5.py +12 -0
  125. TreeSAK/weighted_rand.rb +23 -0
  126. treesak-1.53.3.data/scripts/TreeSAK +955 -0
  127. treesak-1.53.3.dist-info/LICENSE +674 -0
  128. treesak-1.53.3.dist-info/METADATA +27 -0
  129. treesak-1.53.3.dist-info/RECORD +131 -0
  130. treesak-1.53.3.dist-info/WHEEL +5 -0
  131. treesak-1.53.3.dist-info/top_level.txt +1 -0
@@ -0,0 +1,742 @@
1
+ #!/usr/bin/env python
2
+ from __future__ import division
3
+ import os
4
+ import re
5
+ import glob
6
+ import shutil
7
+ import argparse
8
+ import warnings
9
+ from datetime import datetime
10
+ from Bio import SeqIO, AlignIO, Align
11
+ from Bio.Seq import Seq
12
+ from Bio.Alphabet import IUPAC, generic_dna
13
+ from Bio import SeqFeature as SF
14
+ from Bio.SeqRecord import SeqRecord
15
+ from Bio.SeqFeature import SeqFeature, FeatureLocation
16
+ import multiprocessing as mp
17
+ from BioSAK.BioSAK_config import config_dict
18
+
19
# Help text for the get_SCG_tree sub-command: two example invocations (with and
# without -nonmeta) followed by the list of software modules to load.
get_SCG_tree_usage = '''
===================================== get SCG tree example commands =====================================

# for completed genome
BioSAK get_SCG_tree -i genomes -p NorthSea -x fasta -t 4 -nonmeta

# for metagenome-assembled genomes (MAGs)
BioSAK get_SCG_tree -i genomes -p NorthSea -x fasta -t 4

Software dependencies:
module load hmmer/3.2.1
module load mafft/7.407
module load fasttree/2.1.10
module load R/3.5.3
module load blast+/2.9.0
module load prodigal/2.6.3

=========================================================================================================
'''
38
+
39
+
40
def report_and_log(message_for_report, log_file, keep_quiet):
    """Append a timestamped message to a log file and optionally print it.

    Args:
        message_for_report: text to record.
        log_file: path of the log file; opened in append mode.
        keep_quiet: the message is printed to stdout only when this is
            exactly ``False``.
    """
    # Format the line once so the log file and the console always carry the
    # same timestamp, even if the call straddles a second boundary.
    time_format = '[%Y-%m-%d %H:%M:%S]'
    report_line = '%s %s' % (datetime.now().strftime(time_format), message_for_report)

    with open(log_file, 'a') as log_handle:
        log_handle.write('%s\n' % report_line)

    if keep_quiet is False:
        print(report_line)
48
+
49
+
50
def force_create_folder(folder_to_create):
    """Create an empty folder, deleting any existing folder of that name first.

    Removal is attempted up to four times because ``shutil.rmtree`` is called
    with ``ignore_errors=True`` and may therefore leave the folder behind.
    Missing parent directories are created as well.

    Args:
        folder_to_create: path of the folder to (re)create.

    Raises:
        OSError: if the path still exists after the removal attempts.
    """
    max_removal_attempts = 4
    for _ in range(max_removal_attempts):
        if not os.path.isdir(folder_to_create):
            break
        shutil.rmtree(folder_to_create, ignore_errors=True)

    os.makedirs(folder_to_create)
60
+
61
+
62
def remove_empty_element(list_in):
    """Return a new list with every item of list_in that is not the empty string, in order."""
    return [item for item in list_in if item != '']
70
+
71
+
72
def get_program_path_dict(pwd_cfg_file):
    """Read ``name = path`` lines from a config file into a dict.

    Each line is split at its first ``=`` so that paths or options which
    themselves contain ``=`` are kept intact. Whitespace around the name and
    the path is removed. Lines without ``=`` (including blank lines) are
    skipped.

    Args:
        pwd_cfg_file: path of the config file.

    Returns:
        dict mapping program name to program path.
    """
    program_path_dict = {}
    with open(pwd_cfg_file) as cfg_handle:
        for each_line in cfg_handle:
            program_name, separator, program_path = each_line.strip().partition('=')
            if separator == '':
                continue
            program_path_dict[program_name.strip()] = program_path.strip()

    return program_path_dict
87
+
88
+
89
def export_dna_record(gene_seq, gene_id, gene_description, output_handle):
    """Write one nucleotide sequence to output_handle as a FASTA record.

    Args:
        gene_seq: nucleotide sequence string.
        gene_id: record id.
        gene_description: record description.
        output_handle: open handle the record is written to.
    """
    dna_record = SeqRecord(Seq(gene_seq, IUPAC.unambiguous_dna))
    dna_record.id, dna_record.description = gene_id, gene_description
    SeqIO.write(dna_record, output_handle, 'fasta')
95
+
96
+
97
def export_aa_record(gene_seq, gene_id, gene_description, output_handle):
    """Write one protein sequence to output_handle as a FASTA record.

    Args:
        gene_seq: amino acid sequence string.
        gene_id: record id.
        gene_description: record description.
        output_handle: open handle the record is written to.
    """
    aa_record = SeqRecord(Seq(gene_seq, IUPAC.protein))
    aa_record.id, aa_record.description = gene_id, gene_description
    SeqIO.write(aa_record, output_handle, 'fasta')
103
+
104
+
105
def remove_low_cov_and_consensus_columns(alignment_file_in, minimal_cov, min_consensus, alignment_file_out):
    """Filter the columns of a FASTA alignment by coverage and by consensus.

    Two filters are applied one after the other:

    1. coverage: a column is dropped when fewer than ``minimal_cov`` percent
       of the sequences are non-gap at that position, i.e. when its gap
       percentage exceeds ``100 - minimal_cov``. The previous code compared
       the gap percentage with ``minimal_cov`` directly, which gives the
       same result only for a cutoff of 50.
    2. consensus: of the remaining columns, a column is dropped when its most
       frequent character is present in fewer than ``min_consensus`` percent
       of the sequences. The gap character '-' takes part in this count like
       any other character, as it did before.

    The surviving columns are written as one-line-per-sequence FASTA. If no
    column survives, every record is written with an empty sequence instead
    of raising.

    Args:
        alignment_file_in: path of the input alignment (FASTA).
        minimal_cov: minimal percentage (0-100) of non-gap sequences per column.
        min_consensus: minimal percentage (0-100) of the most frequent
            character per column.
        alignment_file_out: path of the filtered alignment to write.
    """

    def gap_percent(column):
        # percentage of sequences showing '-' in this column
        return (column.count('-') / len(column)) * 100

    def top_character_percent(column):
        # percentage of sequences carrying the most frequent character
        highest_count = max(column.count(each_char) for each_char in set(column))
        return (highest_count / len(column)) * 100

    # read in alignment
    alignment = AlignIO.read(alignment_file_in, "fasta")
    sequence_ids = [str(each_seq.id) for each_seq in alignment]

    # transpose the rows so that each item is one alignment column
    columns = list(zip(*[str(each_seq.seq) for each_seq in alignment]))

    # remove low coverage columns
    max_gap_percent = 100 - minimal_cov
    columns = [column for column in columns if gap_percent(column) <= max_gap_percent]

    # remove low consensus columns
    columns = [column for column in columns if top_character_percent(column) >= min_consensus]

    # write filtered alignment
    with open(alignment_file_out, 'w') as alignment_file_out_handle:
        for row_index, sequence_id in enumerate(sequence_ids):
            filtered_seq = ''.join(column[row_index] for column in columns)
            alignment_file_out_handle.write('>%s\n' % sequence_id)
            alignment_file_out_handle.write('%s\n' % filtered_seq)
248
+
249
+
250
def prodigal_parser(seq_file, sco_file, prefix, output_folder):
    """Turn Prodigal's .sco coordinates into gene (.ffn) and protein (.faa) FASTA files.

    Genes are numbered consecutively across all contigs, in the order the
    contigs appear in ``seq_file``, and named ``<prefix>_<5-digit index>``.
    Output goes to ``<output_folder>/<prefix>.ffn`` and
    ``<output_folder>/<prefix>.faa``.

    Args:
        seq_file: FASTA file with the sequences Prodigal was run on.
        sco_file: Prodigal output in ``sco`` format for ``seq_file``.
        prefix: basename of the output files and prefix of the gene ids.
        output_folder: folder the two output files are written to.
    """
    pwd_bin_ffn_file = '%s/%s.ffn' % (output_folder, prefix)
    pwd_bin_faa_file = '%s/%s.faa' % (output_folder, prefix)

    # get sequence id list (input order is kept so gene numbering is stable)
    id_to_sequence_dict = {}
    sequence_id_list = []
    for each_seq in SeqIO.parse(seq_file, 'fasta'):
        id_to_sequence_dict[each_seq.id] = str(each_seq.seq)
        sequence_id_list.append(each_seq.id)

    # get sequence to cds dict and sequence to transl_table dict
    seq_to_cds_dict = {}
    seq_to_transl_table_dict = {}
    current_seq_id = ''
    with open(sco_file) as sco_handle:
        for each_line in sco_handle:
            each_line = each_line.strip()
            if each_line == '':
                continue

            if each_line.startswith('# Sequence Data'):
                # a new sequence starts: its id is the first word of the quoted seqhdr value
                current_seq_id = each_line.split(';seqhdr=')[1][1:-1].split(' ')[0]
                seq_to_cds_dict[current_seq_id] = []
                seq_to_transl_table_dict[current_seq_id] = ''

            elif each_line.startswith('# Model Data'):
                seq_to_transl_table_dict[current_seq_id] = each_line.split(';')[-2].split('=')[-1]

            elif current_seq_id != '':
                # drop the leading field, keep "<start>_<end>_<strand>"
                seq_to_cds_dict[current_seq_id].append('_'.join(each_line.split('_')[1:]))

    gene_index = 1
    with open(pwd_bin_ffn_file, 'w') as bin_ffn_file_handle, open(pwd_bin_faa_file, 'w') as bin_faa_file_handle:
        for seq_id in sequence_id_list:

            current_sequence = id_to_sequence_dict[seq_id]
            transl_table = seq_to_transl_table_dict.get(seq_id, '')

            # sequences absent from the sco file simply contribute no genes
            for cds in seq_to_cds_dict.get(seq_id, []):

                # define locus_tag id
                locus_tag_id = '%s_%s' % (prefix, "{:0>5}".format(gene_index))

                # coordinates are 1-based and inclusive
                cds_split = cds.split('_')
                cds_start = int(cds_split[0])
                cds_end = int(cds_split[1])
                cds_strand = cds_split[2]

                # get nc sequence
                sequence_nc = ''
                if cds_strand == '+':
                    sequence_nc = current_sequence[cds_start - 1:cds_end]
                elif cds_strand == '-':
                    sequence_nc = str(Seq(current_sequence[cds_start - 1:cds_end]).reverse_complement())

                # translate to aa sequence
                sequence_aa = str(Seq(sequence_nc).translate(table=transl_table))

                # remove the stop symbol at the end, but only when there is one
                if sequence_aa.endswith('*'):
                    sequence_aa = sequence_aa[:-1]

                # export nc and aa sequences
                export_dna_record(sequence_nc, locus_tag_id, '', bin_ffn_file_handle)
                export_aa_record(sequence_aa, locus_tag_id, '', bin_faa_file_handle)

                gene_index += 1
358
+
359
+
360
def sep_combined_hmm(combined_hmm_file, hmm_profile_sep_folder, hmmfetch_exe, pwd_hmmstat_exe):
    """Split a combined HMM file into one ``<id>.hmm`` file per profile.

    hmmstat is run on the combined file and its report is saved as
    ``phylo.hmm.stat.txt`` in ``hmm_profile_sep_folder``. The third
    whitespace-separated field of every non-comment report line is used as
    the profile id (presumably the accession column; confirm against the
    hmmstat output). hmmfetch is then called once per id.

    Args:
        combined_hmm_file: path of the HMM file holding all profiles.
        hmm_profile_sep_folder: existing folder receiving the report and the
            individual ``.hmm`` files.
        hmmfetch_exe: hmmfetch executable.
        pwd_hmmstat_exe: hmmstat executable.
    """
    # extract hmm profile id from phylo.hmm
    pwd_phylo_hmm_stat_txt = '%s/phylo.hmm.stat.txt' % hmm_profile_sep_folder
    hmmstat_cmd = '%s %s > %s' % (pwd_hmmstat_exe, combined_hmm_file, pwd_phylo_hmm_stat_txt)
    os.system(hmmstat_cmd)

    # get hmm profile ids; blank or too-short lines are skipped
    hmm_id_list = []
    with open(pwd_phylo_hmm_stat_txt) as stat_handle:
        for each_profile in stat_handle:
            if each_profile.startswith('#'):
                continue
            each_profile_split = each_profile.split()
            if len(each_profile_split) > 2:
                hmm_id_list.append(each_profile_split[2])

    for each_hmm_id in hmm_id_list:
        hmmfetch_cmd = '%s %s %s > %s/%s.hmm' % (hmmfetch_exe, combined_hmm_file, each_hmm_id, hmm_profile_sep_folder, each_hmm_id)
        os.system(hmmfetch_cmd)
382
+
383
+
384
def prodigal_worker(argument_list):
    """Run Prodigal on one genome and derive its .ffn/.faa files.

    Args:
        argument_list: ``[genome file name, genome folder, prodigal
            executable, nonmeta flag, prodigal output folder]``. Packed in a
            single list because the function is called through ``Pool.map``.
    """
    genome_file = argument_list[0]
    genome_folder = argument_list[1]
    prodigal_exe = argument_list[2]
    run_nonmeta = argument_list[3]
    output_folder = argument_list[4]

    genome_stem = os.path.splitext(genome_file)[0]
    genome_path = '%s/%s' % (genome_folder, genome_file)
    sco_path = '%s/%s.sco' % (output_folder, genome_stem)

    # command options follow Prokka; "-p meta" is dropped only when the flag is exactly True
    if run_nonmeta is True:
        cmd_template = '%s -f sco -q -c -m -g 11 -i %s -o %s'
    else:
        cmd_template = '%s -f sco -q -c -m -g 11 -p meta -i %s -o %s'
    os.system(cmd_template % (prodigal_exe, genome_path, sco_path))

    # build the ffn and faa files from the coordinates Prodigal reported
    prodigal_parser(genome_path, sco_path, genome_stem, output_folder)
411
+
412
+
413
def hmmsearch_worker(argument_list):
    """Run hmmsearch for one genome and export its best hit per HMM profile.

    The ``--domtblout`` table is read top to bottom. Consecutive rows with the
    same value in field 5 (index 4, used as the profile id) form one group.
    For each group the row with the highest score in field 14 (index 13) wins,
    and the aligned stretch of that protein (fields 18 and 19, 1-based
    inclusive) is appended to ``<working dir>/<profile id>.fasta`` under the
    genome's basename. A genome without any hit writes nothing.

    Args:
        argument_list: ``[faa basename, working dir, hmmsearch executable,
            hmm file, faa folder]``. Packed in a single list because the
            function is called through ``Pool.map``.
    """
    faa_file_basename = argument_list[0]
    pwd_SCG_tree_wd = argument_list[1]
    pwd_hmmsearch_exe = argument_list[2]
    path_to_hmm = argument_list[3]
    pwd_faa_folder = argument_list[4]

    # run hmmsearch
    pwd_faa_file = '%s/%s.faa' % (pwd_faa_folder, faa_file_basename)
    pwd_hmmout_tbl = '%s/%s_hmmout.tbl' % (pwd_SCG_tree_wd, faa_file_basename)
    os.system('%s -o /dev/null --domtblout %s %s %s' % (pwd_hmmsearch_exe, pwd_hmmout_tbl, path_to_hmm, pwd_faa_file))

    # reading the protein file in a dictionary
    proteinSequence = {}
    for seq_record in SeqIO.parse(pwd_faa_file, 'fasta'):
        proteinSequence[seq_record.id] = str(seq_record.seq)

    def export_hit(hit):
        # Header and sequence go out in a single write call because several
        # worker processes append to the same per-profile file.
        hmm_id, hmm_name, hmm_pos1, hmm_pos2 = hit[:4]
        with open('%s/%s.fasta' % (pwd_SCG_tree_wd, hmm_id), 'a') as file_out:
            file_out.write('>%s\n%s\n' % (faa_file_basename, proteinSequence[hmm_name][hmm_pos1:hmm_pos2]))

    # best_hit is (profile id, protein id, 0-based start, end, score) of the current group
    best_hit = None
    with open(pwd_hmmout_tbl, 'r') as tbl:
        for line in tbl:
            if line.startswith('#'):
                continue
            splitLine = line.split()
            if len(splitLine) < 19:
                # blank or truncated row
                continue

            current_hit = (splitLine[4], splitLine[0], int(splitLine[17]) - 1, int(splitLine[18]), float(splitLine[13]))

            if best_hit is None:
                best_hit = current_hit
            elif current_hit[0] == best_hit[0]:
                if current_hit[4] > best_hit[4]:
                    best_hit = current_hit
            else:
                # a new profile starts: flush the finished group
                export_hit(best_hit)
                best_hit = current_hit

    # flush the last group, if the table had any hit at all
    if best_hit is not None:
        export_hit(best_hit)
476
+
477
+
478
def convert_hmmalign_output(align_in, align_out):
    """Convert a block-wise ``<id> <sequence>`` alignment into FASTA.

    Every input line is expected to hold a sequence id followed by a piece of
    that sequence, separated by whitespace. Pieces belonging to the same id
    are joined in the order they appear. Ids are written in order of first
    appearance, one sequence per line. Lines with fewer than two fields
    (blank lines included) are ignored.

    Args:
        align_in: path of the alignment to read.
        align_out: path of the FASTA file to write.
    """
    # read in alignment
    sequence_id_list = []
    sequence_pieces_dict = {}
    with open(align_in) as align_in_handle:
        for aligned_seq in align_in_handle:
            aligned_seq_split = aligned_seq.split()
            if len(aligned_seq_split) < 2:
                continue

            aligned_seq_id = aligned_seq_split[0]
            aligned_seq_seq = aligned_seq_split[1]

            if aligned_seq_id not in sequence_pieces_dict:
                sequence_id_list.append(aligned_seq_id)
                sequence_pieces_dict[aligned_seq_id] = []
            sequence_pieces_dict[aligned_seq_id].append(aligned_seq_seq)

    # write out
    with open(align_out, 'w') as align_out_handle:
        for sequence_id in sequence_id_list:
            align_out_handle.write('>%s\n' % sequence_id)
            align_out_handle.write('%s\n' % ''.join(sequence_pieces_dict[sequence_id]))
508
+
509
+
510
def hmmalign_worker(argument_list):
    """Align the sequences of one marker with hmmalign and save them as FASTA.

    Reads ``<working dir>/<basename>.fasta``, aligns it against
    ``<hmm folder>/<basename>.hmm`` and writes
    ``<working dir>/<basename>_aligned.fasta``. The unaligned input file and
    the intermediate PSIBLAST-format file are deleted afterwards.

    Args:
        argument_list: ``[marker basename, working dir, hmm profile folder,
            hmmalign executable]``. Packed in a single list because the
            function is called through ``Pool.map``.
    """
    fastaFile_basename = argument_list[0]
    pwd_SCG_tree_wd = argument_list[1]
    pwd_hmm_profile_folder = argument_list[2]
    pwd_hmmalign_exe = argument_list[3]

    pwd_hmm_file = '%s/%s.hmm' % (pwd_hmm_profile_folder, fastaFile_basename)
    pwd_seq_in = '%s/%s.fasta' % (pwd_SCG_tree_wd, fastaFile_basename)
    pwd_aln_out_tmp = '%s/%s_aligned_tmp.fasta' % (pwd_SCG_tree_wd, fastaFile_basename)
    pwd_aln_out = '%s/%s_aligned.fasta' % (pwd_SCG_tree_wd, fastaFile_basename)

    hmmalign_cmd = '%s --trim --outformat PSIBLAST %s %s > %s' % (pwd_hmmalign_exe, pwd_hmm_file, pwd_seq_in, pwd_aln_out_tmp)
    os.system(hmmalign_cmd)

    # remove the unaligned input so that only aligned files keep the .fasta suffix in this folder
    if os.path.isfile(pwd_seq_in):
        os.remove(pwd_seq_in)

    # convert alignment format
    convert_hmmalign_output(pwd_aln_out_tmp, pwd_aln_out)

    # remove tmp alignment
    if os.path.isfile(pwd_aln_out_tmp):
        os.remove(pwd_aln_out_tmp)
529
+
530
+
531
def get_SCG_tree(args, config_dict):
    """Build a single-copy-gene (SCG) tree for all genomes in a folder.

    Steps: Prodigal gene prediction, hmmsearch against the marker profiles,
    hmmalign per marker, concatenation of the marker alignments, column
    filtering (coverage 50, consensus 25) and FastTree. Everything is written
    to ``<prefix>_get_SCG_tree_wd``, which is deleted first if it exists.

    Args:
        args: dict with the keys ``'i'`` (genome folder), ``'p'`` (output
            prefix), ``'x'`` (genome file extension), ``'t'`` (number of
            worker processes) and ``'nonmeta'`` (bool).
        config_dict: dict with the keys ``'path_to_hmm'``, ``'prodigal'``,
            ``'hmmsearch'``, ``'hmmfetch'``, ``'hmmalign'``, ``'hmmstat'``
            and ``'fasttree'``.

    Raises:
        SystemExit: if no genome file matches ``<folder>/*.<extension>``.
    """

    # read in arguments
    input_genome_folder = args['i']
    output_prefix = args['p']
    file_extension = args['x']
    num_threads = args['t']
    nonmeta_mode = args['nonmeta']

    # read in config file
    path_to_hmm = config_dict['path_to_hmm']
    pwd_prodigal_exe = config_dict['prodigal']
    pwd_hmmsearch_exe = config_dict['hmmsearch']
    pwd_hmmfetch_exe = config_dict['hmmfetch']
    pwd_hmmalign_exe = config_dict['hmmalign']
    pwd_hmmstat_exe = config_dict['hmmstat']
    pwd_fasttree_exe = config_dict['fasttree']

    warnings.filterwarnings("ignore")
    minimal_cov_in_msa = 50
    min_consensus_in_msa = 25
    keep_quiet = False

    def run_in_pool(worker, list_for_multiple_arguments):
        # run worker over all argument lists; the pool is always closed and joined
        pool = mp.Pool(processes=num_threads)
        try:
            pool.map(worker, list_for_multiple_arguments)
        finally:
            pool.close()
            pool.join()

    #################################################### check input ###################################################

    # check whether input genome exist
    input_genome_file_re = '%s/*.%s' % (input_genome_folder, file_extension)
    input_genome_file_name_list = [os.path.basename(file_name) for file_name in glob.glob(input_genome_file_re)]
    if not input_genome_file_name_list:
        print('No input genome detected, program exited!')
        # same effect as the site-provided exit(), which is not guaranteed to exist
        raise SystemExit

    ############################################# define file/folder names #############################################

    get_SCG_tree_wd = '%s_get_SCG_tree_wd' % (output_prefix)
    prodigal_output_folder = '%s_1_prodigal_output' % (output_prefix)
    extract_and_align_SCG_wd = '%s_2_extract_and_align_SCGs' % (output_prefix)
    combined_alignment_file_tmp = '%s_SCG_tree.aln' % (output_prefix)
    combined_alignment_file = '%s_SCG_tree_cov%s_css%s.aln' % (output_prefix, minimal_cov_in_msa, min_consensus_in_msa)
    newick_tree_file = '%s_SCG_tree.newick' % (output_prefix)
    hmm_profile_sep_folder = '%s_hmm_profile_fetched' % (output_prefix)

    pwd_log_file = '%s/%s_get_SCG_tree.log' % (get_SCG_tree_wd, output_prefix)
    pwd_prodigal_output_folder = '%s/%s' % (get_SCG_tree_wd, prodigal_output_folder)
    pwd_extract_and_align_SCG_wd = '%s/%s' % (get_SCG_tree_wd, extract_and_align_SCG_wd)
    pwd_combined_alignment_file_tmp = '%s/%s' % (get_SCG_tree_wd, combined_alignment_file_tmp)
    pwd_combined_alignment_file = '%s/%s' % (get_SCG_tree_wd, combined_alignment_file)
    pwd_hmm_profile_sep_folder = '%s/%s/%s' % (get_SCG_tree_wd, extract_and_align_SCG_wd, hmm_profile_sep_folder)
    pwd_newick_tree_file = '%s/%s' % (get_SCG_tree_wd, newick_tree_file)

    # create wd
    force_create_folder(get_SCG_tree_wd)

    ######################################## run prodigal with multiprocessing #########################################

    report_and_log(('Running Prodigal with %s cores for input genomes' % num_threads), pwd_log_file, keep_quiet)

    # create prodigal output folder
    force_create_folder(pwd_prodigal_output_folder)

    # prepare arguments for prodigal_worker
    list_for_multiple_arguments_Prodigal = [
        [input_genome, input_genome_folder, pwd_prodigal_exe, nonmeta_mode, pwd_prodigal_output_folder]
        for input_genome in input_genome_file_name_list]

    run_in_pool(prodigal_worker, list_for_multiple_arguments_Prodigal)

    ########################################### get species tree (hmmsearch) ###########################################

    # create wd
    force_create_folder(pwd_extract_and_align_SCG_wd)

    report_and_log(('Running Hmmsearch with %s cores' % num_threads), pwd_log_file, keep_quiet)

    faa_file_re = '%s/*.faa' % pwd_prodigal_output_folder
    faa_file_list = sorted(os.path.basename(file_name) for file_name in glob.glob(faa_file_re))
    faa_file_basename_list = [os.path.splitext(faa_file)[0] for faa_file in faa_file_list]

    # prepare arguments for hmmsearch_worker
    list_for_multiple_arguments_hmmsearch = [
        [faa_file_basename, pwd_extract_and_align_SCG_wd, pwd_hmmsearch_exe, path_to_hmm, pwd_prodigal_output_folder]
        for faa_file_basename in faa_file_basename_list]

    run_in_pool(hmmsearch_worker, list_for_multiple_arguments_hmmsearch)

    ############################################# get species tree (hmmalign) #############################################

    report_and_log(('Running Hmmalign with %s cores' % num_threads), pwd_log_file, keep_quiet)

    # fetch combined hmm profiles
    force_create_folder(pwd_hmm_profile_sep_folder)
    sep_combined_hmm(path_to_hmm, pwd_hmm_profile_sep_folder, pwd_hmmfetch_exe, pwd_hmmstat_exe)

    # one .fasta file per marker was written by the hmmsearch step
    fastaFiles = [i for i in os.listdir(pwd_extract_and_align_SCG_wd) if i.endswith('.fasta')]

    # prepare arguments for hmmalign_worker
    list_for_multiple_arguments_hmmalign = []
    for fastaFile in fastaFiles:
        fastaFiles_basename = '.'.join(fastaFile.split('.')[:-1])
        list_for_multiple_arguments_hmmalign.append([fastaFiles_basename, pwd_extract_and_align_SCG_wd, pwd_hmm_profile_sep_folder, pwd_hmmalign_exe])

    run_in_pool(hmmalign_worker, list_for_multiple_arguments_hmmalign)

    ################################### get species tree (Concatenating alignments) ####################################

    report_and_log('Concatenating alignments', pwd_log_file, keep_quiet)

    # per genome, the list of aligned marker pieces that will be joined at the end
    concatAlignment = {}
    for element in faa_file_basename_list:
        concatAlignment[element] = []

    # Read every single-marker alignment and append it to the concatenation.
    # Files are sorted so that the column order is the same on every run.
    alignedFiles = sorted(i for i in os.listdir(pwd_extract_and_align_SCG_wd) if i.endswith('.fasta'))
    for alignedFile in alignedFiles:
        pwd_alignedFile = pwd_extract_and_align_SCG_wd + '/' + alignedFile
        proteinSequence = {}
        alignmentLength = 0
        for seq_record_2 in SeqIO.parse(pwd_alignedFile, 'fasta'):
            proteinSequence[seq_record_2.id] = str(seq_record_2.seq)
            alignmentLength = len(proteinSequence[seq_record_2.id])

        # genomes lacking this marker are padded with gaps
        for element in faa_file_basename_list:
            if element in proteinSequence:
                concatAlignment[element].append(proteinSequence[element])
            else:
                concatAlignment[element].append('-' * alignmentLength)

    # writing alignment to file
    with open(pwd_combined_alignment_file_tmp, 'w') as file_out:
        for element in faa_file_basename_list:
            file_out.write('>' + element + '\n' + ''.join(concatAlignment[element]) + '\n')

    # remove columns with low coverage and low consensus
    report_and_log(('Removing columns from concatenated alignment represented by <%s%s of genomes and with an amino acid consensus <%s%s' % (minimal_cov_in_msa, '%', min_consensus_in_msa, '%')), pwd_log_file, keep_quiet)
    remove_low_cov_and_consensus_columns(pwd_combined_alignment_file_tmp, minimal_cov_in_msa, min_consensus_in_msa, pwd_combined_alignment_file)

    ########################################### get species tree (fasttree) ############################################

    report_and_log('Running FastTree', pwd_log_file, keep_quiet)

    # calling fasttree for tree calculation
    fasttree_cmd = '%s -quiet %s > %s' % (pwd_fasttree_exe, pwd_combined_alignment_file, pwd_newick_tree_file)
    os.system(fasttree_cmd)

    report_and_log(('SCG tree exported to: %s' % newick_tree_file), pwd_log_file, keep_quiet)

    ############################################## remove temporary files ##############################################

    report_and_log(('Deleting temporary files'), pwd_log_file, keep_quiet)

    if os.path.isfile(pwd_combined_alignment_file_tmp):
        os.remove(pwd_combined_alignment_file_tmp)
726
+
727
+
728
if __name__ == '__main__':

    # command-line entry point: collect the options, then run the pipeline
    cli_parser = argparse.ArgumentParser()

    cli_parser.add_argument('-i', required=True, help='input genome folder')
    cli_parser.add_argument('-p', required=True, help='output prefix')
    cli_parser.add_argument('-x', required=False, default='fasta', help='file extension')
    cli_parser.add_argument('-nonmeta', required=False, action="store_true", help='annotate Non-metagenome-assembled genomes (Non-MAGs)')
    cli_parser.add_argument('-t', required=False, type=int, default=1, help='number of threads, default: 1')

    # get_SCG_tree expects the options as a plain dict
    get_SCG_tree(vars(cli_parser.parse_args()), config_dict)