treesak-1.53.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- TreeSAK/ALE.py +63 -0
- TreeSAK/ALE1.py +268 -0
- TreeSAK/ALE2.py +168 -0
- TreeSAK/ALE2RTC.py +30 -0
- TreeSAK/ALE3.py +205 -0
- TreeSAK/ALE4.py +636 -0
- TreeSAK/ALE5.py +210 -0
- TreeSAK/ALE6.py +401 -0
- TreeSAK/ALE7.py +126 -0
- TreeSAK/ALE_backup.py +1081 -0
- TreeSAK/AssessCVG.py +128 -0
- TreeSAK/AssessMarker.py +306 -0
- TreeSAK/AssessMarkerDeltaLL.py +257 -0
- TreeSAK/AssessMarkerPA.py +317 -0
- TreeSAK/AssessPB.py +113 -0
- TreeSAK/BMGE.jar +0 -0
- TreeSAK/BMGE.py +49 -0
- TreeSAK/C60SR4.nex +127 -0
- TreeSAK/CompareMCMC.py +138 -0
- TreeSAK/ConcateMSA.py +111 -0
- TreeSAK/ConvertMSA.py +135 -0
- TreeSAK/Dir.rb +82 -0
- TreeSAK/ExtractMarkerSeq.py +263 -0
- TreeSAK/FastRoot.py +1175 -0
- TreeSAK/FastRoot_backup.py +1122 -0
- TreeSAK/FigTree.py +34 -0
- TreeSAK/GTDB_tree.py +76 -0
- TreeSAK/GeneTree.py +142 -0
- TreeSAK/KEGG_Luo17.py +807 -0
- TreeSAK/LcaToLeaves.py +66 -0
- TreeSAK/MarkerRef2Tree.py +616 -0
- TreeSAK/MarkerRef2Tree_backup.py +628 -0
- TreeSAK/MarkerSeq2Tree.py +299 -0
- TreeSAK/MarkerSeq2Tree_backup.py +259 -0
- TreeSAK/ModifyTopo.py +116 -0
- TreeSAK/Newick_tree_plotter.py +79 -0
- TreeSAK/OMA.py +170 -0
- TreeSAK/OMA2.py +212 -0
- TreeSAK/OneLineAln.py +50 -0
- TreeSAK/PB.py +155 -0
- TreeSAK/PMSF.py +115 -0
- TreeSAK/PhyloBiAssoc.R +84 -0
- TreeSAK/PhyloBiAssoc.py +167 -0
- TreeSAK/PlotMCMC.py +41 -0
- TreeSAK/PlotMcmcNode.py +152 -0
- TreeSAK/PlotMcmcNode_old.py +252 -0
- TreeSAK/RootTree.py +101 -0
- TreeSAK/RootTreeGTDB.py +371 -0
- TreeSAK/RootTreeGTDB214.py +288 -0
- TreeSAK/RootTreeGTDB220.py +300 -0
- TreeSAK/SequentialDating.py +16 -0
- TreeSAK/SingleAleHGT.py +157 -0
- TreeSAK/SingleLinePhy.py +50 -0
- TreeSAK/SliceMSA.py +142 -0
- TreeSAK/SplitScore.py +21 -0
- TreeSAK/SplitScore1.py +177 -0
- TreeSAK/SplitScore1OMA.py +148 -0
- TreeSAK/SplitScore2.py +608 -0
- TreeSAK/TaxaCountStats.R +256 -0
- TreeSAK/TaxonTree.py +47 -0
- TreeSAK/TreeSAK_config.py +32 -0
- TreeSAK/VERSION +164 -0
- TreeSAK/VisHPD95.R +45 -0
- TreeSAK/VisHPD95.py +200 -0
- TreeSAK/__init__.py +0 -0
- TreeSAK/ale_parser.py +74 -0
- TreeSAK/ale_splitter.py +63 -0
- TreeSAK/alignment_pruner.pl +1471 -0
- TreeSAK/assessOG.py +45 -0
- TreeSAK/batch_itol.py +171 -0
- TreeSAK/catfasta2phy.py +140 -0
- TreeSAK/cogTree.py +185 -0
- TreeSAK/compare_trees.R +30 -0
- TreeSAK/compare_trees.py +255 -0
- TreeSAK/dating.py +264 -0
- TreeSAK/dating_ss.py +361 -0
- TreeSAK/deltall.py +82 -0
- TreeSAK/do_rrtc.rb +464 -0
- TreeSAK/fa2phy.py +42 -0
- TreeSAK/filter_rename_ar53.py +118 -0
- TreeSAK/format_leaf_name.py +70 -0
- TreeSAK/gap_stats.py +38 -0
- TreeSAK/get_SCG_tree.py +742 -0
- TreeSAK/get_arCOG_seq.py +97 -0
- TreeSAK/global_functions.py +222 -0
- TreeSAK/gnm_leaves.py +43 -0
- TreeSAK/iTOL.py +791 -0
- TreeSAK/iTOL_gene_tree.py +80 -0
- TreeSAK/itol_msa_stats.py +56 -0
- TreeSAK/keep_highest_rrtc.py +37 -0
- TreeSAK/koTree.py +194 -0
- TreeSAK/label_gene_tree_by_gnm.py +34 -0
- TreeSAK/label_tree.R +75 -0
- TreeSAK/label_tree.py +121 -0
- TreeSAK/mad.py +708 -0
- TreeSAK/mcmc2tree.py +58 -0
- TreeSAK/mcmcTC copy.py +92 -0
- TreeSAK/mcmcTC.py +104 -0
- TreeSAK/mcmctree_vs_reltime.R +44 -0
- TreeSAK/mcmctree_vs_reltime.py +252 -0
- TreeSAK/merge_pdf.py +32 -0
- TreeSAK/pRTC.py +56 -0
- TreeSAK/parse_mcmctree.py +198 -0
- TreeSAK/parse_reltime.py +141 -0
- TreeSAK/phy2fa.py +37 -0
- TreeSAK/plot_distruibution_th.py +165 -0
- TreeSAK/prep_mcmctree_ctl.py +92 -0
- TreeSAK/print_leaves.py +32 -0
- TreeSAK/pruneMSA.py +63 -0
- TreeSAK/recode.py +73 -0
- TreeSAK/remove_bias.R +112 -0
- TreeSAK/rename_leaves.py +78 -0
- TreeSAK/replace_clade.py +55 -0
- TreeSAK/root_with_out_group.py +84 -0
- TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
- TreeSAK/subsample_drep_gnms.py +74 -0
- TreeSAK/subset.py +69 -0
- TreeSAK/subset_tree_stupid_old_way.py +193 -0
- TreeSAK/supertree.py +330 -0
- TreeSAK/tmp_1.py +19 -0
- TreeSAK/tmp_2.py +19 -0
- TreeSAK/tmp_3.py +120 -0
- TreeSAK/tmp_4.py +43 -0
- TreeSAK/tmp_5.py +12 -0
- TreeSAK/weighted_rand.rb +23 -0
- treesak-1.53.3.data/scripts/TreeSAK +955 -0
- treesak-1.53.3.dist-info/LICENSE +674 -0
- treesak-1.53.3.dist-info/METADATA +27 -0
- treesak-1.53.3.dist-info/RECORD +131 -0
- treesak-1.53.3.dist-info/WHEEL +5 -0
- treesak-1.53.3.dist-info/top_level.txt +1 -0
TreeSAK/get_SCG_tree.py
ADDED
@@ -0,0 +1,742 @@
#!/usr/bin/env python
from __future__ import division
import os
import re
import glob
import shutil
import argparse
import warnings
from datetime import datetime
from Bio import SeqIO, AlignIO, Align
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC, generic_dna
from Bio import SeqFeature as SF
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
import multiprocessing as mp
from BioSAK.BioSAK_config import config_dict


get_SCG_tree_usage = '''
===================================== get SCG tree example commands =====================================

# for completed genome
BioSAK get_SCG_tree -i genomes -p NorthSea -x fasta -t 4 -nonmeta

# for metagenome-assembled genomes (MAGs)
BioSAK get_SCG_tree -i genomes -p NorthSea -x fasta -t 4

Software dependencies:
module load hmmer/3.2.1
module load mafft/7.407
module load fasttree/2.1.10
module load R/3.5.3
module load blast+/2.9.0
module load prodigal/2.6.3

=========================================================================================================
'''


def report_and_log(message_for_report, log_file, keep_quiet):

    time_format = '[%Y-%m-%d %H:%M:%S]'
    with open(log_file, 'a') as log_handle:
        log_handle.write('%s %s\n' % ((datetime.now().strftime(time_format)), message_for_report))

    if keep_quiet is False:
        print('%s %s' % ((datetime.now().strftime(time_format)), message_for_report))


def force_create_folder(folder_to_create):
    if os.path.isdir(folder_to_create):
        shutil.rmtree(folder_to_create, ignore_errors=True)
        if os.path.isdir(folder_to_create):
            shutil.rmtree(folder_to_create, ignore_errors=True)
            if os.path.isdir(folder_to_create):
                shutil.rmtree(folder_to_create, ignore_errors=True)
                if os.path.isdir(folder_to_create):
                    shutil.rmtree(folder_to_create, ignore_errors=True)
    os.mkdir(folder_to_create)


def remove_empty_element(list_in):

    list_out = []
    for each_element in list_in:
        if each_element != '':
            list_out.append(each_element)

    return list_out


def get_program_path_dict(pwd_cfg_file):
    program_path_dict = {}
    for each in open(pwd_cfg_file):
        each_split = each.strip().split('=')
        program_name = each_split[0]
        program_path = each_split[1]

        # remove space if there are
        if program_name[-1] == ' ':
            program_name = program_name[:-1]
        if program_path[0] == ' ':
            program_path = program_path[1:]
        program_path_dict[program_name] = program_path

    return program_path_dict


def export_dna_record(gene_seq, gene_id, gene_description, output_handle):
    seq_object = Seq(gene_seq, IUPAC.unambiguous_dna)
    seq_record = SeqRecord(seq_object)
    seq_record.id = gene_id
    seq_record.description = gene_description
    SeqIO.write(seq_record, output_handle, 'fasta')


def export_aa_record(gene_seq, gene_id, gene_description, output_handle):
    seq_object = Seq(gene_seq, IUPAC.protein)
    seq_record = SeqRecord(seq_object)
    seq_record.id = gene_id
    seq_record.description = gene_description
    SeqIO.write(seq_record, output_handle, 'fasta')


def remove_low_cov_and_consensus_columns(alignment_file_in, minimal_cov, min_consensus, alignment_file_out):

    def list_to_segments(list_in):

        segments_out = []
        current_element = None
        current_segment = [None, None]
        for each_element in list_in:

            # for the first element
            if current_element == None:
                current_element = each_element
                current_segment = [each_element, each_element]

            elif each_element == current_element + 1:
                current_segment[1] = each_element
                current_element = each_element

            elif each_element != current_element + 1:

                # add segment to list
                segments_out.append(current_segment)

                # resetting segment
                current_segment = [each_element, each_element]
                current_element = each_element

        # add segment to list
        segments_out.append(current_segment)

        return segments_out

    def remove_columns_from_msa(alignment_in, cols_to_remove):

        # get 0 based index of all wanted columns
        cols_to_remove_0_base = [(i - 1) for i in cols_to_remove]
        aln_cols_index_all = list(range(alignment_in.get_alignment_length()))
        aln_cols_index_wanted = []
        for i in aln_cols_index_all:
            if i not in cols_to_remove_0_base:
                aln_cols_index_wanted.append(i)

        # get wanted alignment segments
        wanted_segments = list_to_segments(aln_cols_index_wanted)

        # create an empty Alignment object
        alignment_new = Align.MultipleSeqAlignment([])
        for sequence in alignment_in:
            new_seq_object = Seq('')
            new_seq_record = SeqRecord(new_seq_object)
            new_seq_record.id = sequence.id
            new_seq_record.description = sequence.description
            alignment_new.append(new_seq_record)

        # add wanted columns to empty Alignment object
        for segment in wanted_segments:

            # for single column segment
            if segment[0] == segment[1]:
                segment_value = alignment_in[:, segment[0]]

                m = 0
                for each_seq in alignment_new:
                    each_seq.seq = Seq(str(each_seq.seq) + segment_value[m])
                    m += 1

            # for multiple columns segment
            else:
                segment_value = alignment_in[:, (segment[0]):(segment[1] + 1)]
                alignment_new += segment_value

        return alignment_new

    def remove_low_cov_columns(alignment_in, min_cov_cutoff):

        # get columns with low coverage
        sequence_number = len(alignment_in)
        total_col_num = alignment_in.get_alignment_length()
        low_cov_columns = []
        n = 0
        while n < total_col_num:
            current_column = alignment_in[:, n]
            dash_number = current_column.count('-')
            gap_percent = (dash_number / sequence_number) * 100

            if gap_percent > min_cov_cutoff:
                low_cov_columns.append(n + 1)

            n += 1

        # remove identified columns
        alignment_new = remove_columns_from_msa(alignment_in, low_cov_columns)

        return alignment_new

    def remove_low_consensus_columns(alignment_in, min_css_cutoff):

        # get columns with low consensus
        sequence_number = len(alignment_in)
        total_col_num = alignment_in.get_alignment_length()
        low_css_columns = []
        n = 0
        while n < total_col_num:
            current_column = alignment_in[:, n]

            # get all aa in current column
            aa_list = set()
            for aa in current_column:
                aa_list.add(aa)

            # get maximum aa percent
            most_abundant_aa_percent = 0
            for each_aa in aa_list:
                each_aa_percent = (current_column.count(each_aa) / sequence_number) * 100
                if each_aa_percent > most_abundant_aa_percent:
                    most_abundant_aa_percent = each_aa_percent

            # if maximum percent lower than provided cutoff, add current column to low consensus column list
            if most_abundant_aa_percent < min_css_cutoff:
                low_css_columns.append(n + 1)

            n += 1

        # remove identified columns
        alignment_new = remove_columns_from_msa(alignment_in, low_css_columns)

        return alignment_new

    # read in alignment
    alignment = AlignIO.read(alignment_file_in, "fasta")

    # remove_low_cov_columns
    alignment_cov = remove_low_cov_columns(alignment, minimal_cov)

    # remove_low_consensus_columns
    alignment_cov_css = remove_low_consensus_columns(alignment_cov, min_consensus)

    # write filtered alignment
    alignment_file_out_handle = open(alignment_file_out, 'w')
    for each_seq in alignment_cov_css:
        alignment_file_out_handle.write('>%s\n' % str(each_seq.id))
        alignment_file_out_handle.write('%s\n' % str(each_seq.seq))
    alignment_file_out_handle.close()


def prodigal_parser(seq_file, sco_file, prefix, output_folder):

    bin_ffn_file = '%s.ffn' % prefix
    bin_faa_file = '%s.faa' % prefix
    pwd_bin_ffn_file = '%s/%s' % (output_folder, bin_ffn_file)
    pwd_bin_faa_file = '%s/%s' % (output_folder, bin_faa_file)

    # get sequence id list
    id_to_sequence_dict = {}
    sequence_id_list = []
    for each_seq in SeqIO.parse(seq_file, 'fasta'):
        id_to_sequence_dict[each_seq.id] = str(each_seq.seq)
        sequence_id_list.append(each_seq.id)

    # get sequence to cds dict and sequence to transl_table dict
    current_seq_id = ''
    current_transl_table = ''
    current_seq_csd_list = []
    seq_to_cds_dict = {}
    seq_to_transl_table_dict = {}
    for each_cds in open(sco_file):
        if each_cds.startswith('# Sequence Data'):

            # add to dict
            if current_seq_id != '':
                seq_to_cds_dict[current_seq_id] = current_seq_csd_list
                seq_to_transl_table_dict[current_seq_id] = current_transl_table

            # reset value
            current_seq_id = each_cds.strip().split(';seqhdr=')[1][1:-1].split(' ')[0]
            current_transl_table = ''
            current_seq_csd_list = []

        elif each_cds.startswith('# Model Data'):
            current_transl_table = each_cds.strip().split(';')[-2].split('=')[-1]

        else:
            current_seq_csd_list.append('_'.join(each_cds.strip().split('_')[1:]))

    seq_to_cds_dict[current_seq_id] = current_seq_csd_list
    seq_to_transl_table_dict[current_seq_id] = current_transl_table

    bin_ffn_file_handle = open(pwd_bin_ffn_file, 'w')
    bin_faa_file_handle = open(pwd_bin_faa_file, 'w')
    gene_index = 1
    for seq_id in sequence_id_list:

        # create SeqRecord
        current_sequence = Seq(id_to_sequence_dict[seq_id])
        current_SeqRecord = SeqRecord(current_sequence, id=seq_id)
        current_SeqRecord.seq.alphabet = generic_dna
        transl_table = seq_to_transl_table_dict[seq_id]

        # add SeqFeature to SeqRecord
        for cds in seq_to_cds_dict[seq_id]:

            # define locus_tag id
            locus_tag_id = '%s_%s' % (prefix, "{:0>5}".format(gene_index))

            # define FeatureLocation
            cds_split = cds.split('_')
            cds_start = SF.ExactPosition(int(cds_split[0]))
            cds_end = SF.ExactPosition(int(cds_split[1]))
            cds_strand = cds_split[2]
            current_strand = None
            if cds_strand == '+':
                current_strand = 1
            if cds_strand == '-':
                current_strand = -1
            current_feature_location = FeatureLocation(cds_start, cds_end, strand=current_strand)

            # get nc sequence
            sequence_nc = ''
            if cds_strand == '+':
                sequence_nc = id_to_sequence_dict[seq_id][cds_start-1:cds_end]
            if cds_strand == '-':
                sequence_nc = str(Seq(id_to_sequence_dict[seq_id][cds_start-1:cds_end], generic_dna).reverse_complement())

            # translate to aa sequence
            sequence_aa = str(SeqRecord(Seq(sequence_nc)).seq.translate(table=transl_table))

            # remove * at the end
            sequence_aa = sequence_aa[:-1]

            # export nc and aa sequences
            export_dna_record(sequence_nc, locus_tag_id, '', bin_ffn_file_handle)
            export_aa_record(sequence_aa, locus_tag_id, '', bin_faa_file_handle)

            # Define feature type
            current_feature_type = 'CDS'

            # Define feature qualifiers
            current_qualifiers_dict = {}
            current_qualifiers_dict['locus_tag'] = locus_tag_id
            current_qualifiers_dict['transl_table'] = transl_table
            current_qualifiers_dict['translation'] = sequence_aa

            # Create a SeqFeature
            current_feature = SeqFeature(current_feature_location, type=current_feature_type, qualifiers=current_qualifiers_dict)

            # Append Feature to SeqRecord
            current_SeqRecord.features.append(current_feature)
            gene_index += 1

    bin_ffn_file_handle.close()
    bin_faa_file_handle.close()


def sep_combined_hmm(combined_hmm_file, hmm_profile_sep_folder, hmmfetch_exe, pwd_hmmstat_exe):

    # extract hmm profile id from phylo.hmm
    pwd_phylo_hmm_stat_txt = '%s/phylo.hmm.stat.txt' % hmm_profile_sep_folder
    hmmstat_cmd = '%s %s > %s' % (pwd_hmmstat_exe, combined_hmm_file, pwd_phylo_hmm_stat_txt)
    os.system(hmmstat_cmd)

    # get hmm profile id file
    hmm_id_list = []
    for each_profile in open(pwd_phylo_hmm_stat_txt):
        if not each_profile.startswith('#'):
            each_profile_split = each_profile.strip().split(' ')
            if each_profile_split != ['']:
                each_profile_split_no_space = []
                for each_element in each_profile_split:
                    if each_element != '':
                        each_profile_split_no_space.append(each_element)
                hmm_id_list.append(each_profile_split_no_space[2])

    for each_hmm_id in hmm_id_list:
        hmmfetch_cmd = '%s %s %s > %s/%s.hmm' % (hmmfetch_exe, combined_hmm_file, each_hmm_id, hmm_profile_sep_folder, each_hmm_id)
        os.system(hmmfetch_cmd)


def prodigal_worker(argument_list):

    input_genome = argument_list[0]
    input_genome_folder = argument_list[1]
    pwd_prodigal_exe = argument_list[2]
    nonmeta_mode = argument_list[3]
    pwd_prodigal_output_folder = argument_list[4]

    # prepare command (according to Prokka)
    input_genome_basename, input_genome_ext = os.path.splitext(input_genome)
    pwd_input_genome = '%s/%s' % (input_genome_folder, input_genome)
    pwd_output_sco = '%s/%s.sco' % (pwd_prodigal_output_folder, input_genome_basename)

    prodigal_cmd_meta = '%s -f sco -q -c -m -g 11 -p meta -i %s -o %s' % (
        pwd_prodigal_exe, pwd_input_genome, pwd_output_sco)
    prodigal_cmd_nonmeta = '%s -f sco -q -c -m -g 11 -i %s -o %s' % (
        pwd_prodigal_exe, pwd_input_genome, pwd_output_sco)

    if nonmeta_mode is True:
        prodigal_cmd = prodigal_cmd_nonmeta
    else:
        prodigal_cmd = prodigal_cmd_meta

    os.system(prodigal_cmd)

    # prepare ffn, faa and gbk files from prodigal output
    prodigal_parser(pwd_input_genome, pwd_output_sco, input_genome_basename, pwd_prodigal_output_folder)


def hmmsearch_worker(argument_list):

    faa_file_basename = argument_list[0]
    pwd_SCG_tree_wd = argument_list[1]
    pwd_hmmsearch_exe = argument_list[2]
    path_to_hmm = argument_list[3]
    pwd_faa_folder = argument_list[4]

    # run hmmsearch
    pwd_faa_file = '%s/%s.faa' % (pwd_faa_folder, faa_file_basename)
    os.system('%s -o /dev/null --domtblout %s/%s_hmmout.tbl %s %s' % (pwd_hmmsearch_exe, pwd_SCG_tree_wd, faa_file_basename, path_to_hmm, pwd_faa_file))

    # Reading the protein file in a dictionary
    proteinSequence = {}
    for seq_record in SeqIO.parse(pwd_faa_file, 'fasta'):
        proteinSequence[seq_record.id] = str(seq_record.seq)

    # Reading the hmmsearch table/extracting the protein part found by hmmsearch out of the protein/Writing
    # each protein sequence that was extracted to a fasta file (one for each hmm in phylo.hmm)
    hmm_id = ''
    hmm_name = ''
    hmm_pos1 = 0
    hmm_pos2 = 0
    hmm_score = 0
    pwd_hmmout_tbl = pwd_SCG_tree_wd + '/' + faa_file_basename + '_hmmout.tbl'
    with open(pwd_hmmout_tbl, 'r') as tbl:
        for line in tbl:
            if line[0] == "#": continue
            line = re.sub('\s+', ' ', line)
            splitLine = line.split(' ')

            if (hmm_id == ''):
                hmm_id = splitLine[4]
                hmm_name = splitLine[0]
                hmm_pos1 = int(splitLine[17]) - 1
                hmm_pos2 = int(splitLine[18])
                hmm_score = float(splitLine[13])
            elif (hmm_id == splitLine[4]):
                if (float(splitLine[13]) > hmm_score):
                    hmm_name = splitLine[0]
                    hmm_pos1 = int(splitLine[17]) - 1
                    hmm_pos2 = int(splitLine[18])
                    hmm_score = float(splitLine[13])
            else:
                file_out = open(pwd_SCG_tree_wd + '/' + hmm_id + '.fasta', 'a+')
                file_out.write('>' + faa_file_basename + '\n')
                if hmm_name != '':
                    seq = str(proteinSequence[hmm_name][hmm_pos1:hmm_pos2])
                file_out.write(str(seq) + '\n')
                file_out.close()
                hmm_id = splitLine[4]
                hmm_name = splitLine[0]
                hmm_pos1 = int(splitLine[17]) - 1
                hmm_pos2 = int(splitLine[18])
                hmm_score = float(splitLine[13])

        # for/else: runs once the table has been read, writing out the last accumulated hit
        else:
            file_out = open(pwd_SCG_tree_wd + '/' + hmm_id + '.fasta', 'a+')
            file_out.write('>' + faa_file_basename + '\n')
            if hmm_name != '':
                seq = str(proteinSequence[hmm_name][hmm_pos1:hmm_pos2])
            file_out.write(str(seq) + '\n')
            file_out.close()


def convert_hmmalign_output(align_in, align_out):

    # read in alignment
    sequence_id_list = []
    sequence_seq_dict = {}
    for aligned_seq in open(align_in):
        aligned_seq_split = aligned_seq.strip().split(' ')
        aligned_seq_split = remove_empty_element(aligned_seq_split)

        if aligned_seq_split != []:
            aligned_seq_id = aligned_seq_split[0]
            aligned_seq_seq = aligned_seq_split[1]

            # add id to sequence id list
            if aligned_seq_id not in sequence_id_list:
                sequence_id_list.append(aligned_seq_id)

            # add seq to sequence seq dict
            if aligned_seq_id not in sequence_seq_dict:
                sequence_seq_dict[aligned_seq_id] = aligned_seq_seq
            else:
                sequence_seq_dict[aligned_seq_id] += aligned_seq_seq

    # write out
    align_out_handle = open(align_out, 'w')
    for sequence_id in sequence_id_list:
        sequence_seq = sequence_seq_dict[sequence_id]
        align_out_handle.write('>%s\n' % sequence_id)
        align_out_handle.write('%s\n' % sequence_seq)
    align_out_handle.close()


def hmmalign_worker(argument_list):
    fastaFile_basename = argument_list[0]
    pwd_SCG_tree_wd = argument_list[1]
    pwd_hmm_profile_folder = argument_list[2]
    pwd_hmmalign_exe = argument_list[3]

    pwd_hmm_file = '%s/%s.hmm' % (pwd_hmm_profile_folder, fastaFile_basename)
    pwd_seq_in = '%s/%s.fasta' % (pwd_SCG_tree_wd, fastaFile_basename)
    pwd_aln_out_tmp = '%s/%s_aligned_tmp.fasta' % (pwd_SCG_tree_wd, fastaFile_basename)
    pwd_aln_out = '%s/%s_aligned.fasta' % (pwd_SCG_tree_wd, fastaFile_basename)

    hmmalign_cmd = '%s --trim --outformat PSIBLAST %s %s > %s ; rm %s' % (pwd_hmmalign_exe, pwd_hmm_file, pwd_seq_in, pwd_aln_out_tmp, pwd_seq_in)
    os.system(hmmalign_cmd)

    # convert alignment format
    convert_hmmalign_output(pwd_aln_out_tmp, pwd_aln_out)

    # remove tmp alignment
    os.system('rm %s' % pwd_aln_out_tmp)


def get_SCG_tree(args, config_dict):

    # read in arguments
    input_genome_folder = args['i']
    output_prefix = args['p']
    file_extension = args['x']
    num_threads = args['t']
    nonmeta_mode = args['nonmeta']

    # read in config file
    path_to_hmm = config_dict['path_to_hmm']
    pwd_prodigal_exe = config_dict['prodigal']
    pwd_hmmsearch_exe = config_dict['hmmsearch']
    pwd_hmmfetch_exe = config_dict['hmmfetch']
    pwd_hmmalign_exe = config_dict['hmmalign']
    pwd_hmmstat_exe = config_dict['hmmstat']
    pwd_fasttree_exe = config_dict['fasttree']

    warnings.filterwarnings("ignore")
    minimal_cov_in_msa = 50
    min_consensus_in_msa = 25
    keep_quiet = False


    #################################################### check input ###################################################

    # check whether input genome exist
    input_genome_file_re = '%s/*.%s' % (input_genome_folder, file_extension)
    input_genome_file_name_list = [os.path.basename(file_name) for file_name in glob.glob(input_genome_file_re)]
    if input_genome_file_name_list == []:
        print('No input genome detected, program exited!')
        exit()


    ############################################# define file/folder names #############################################

    get_SCG_tree_wd = '%s_get_SCG_tree_wd' % (output_prefix)
    prodigal_output_folder = '%s_1_prodigal_output' % (output_prefix)
    extract_and_align_SCG_wd = '%s_2_extract_and_align_SCGs' % (output_prefix)
    combined_alignment_file_tmp = '%s_SCG_tree.aln' % (output_prefix)
    combined_alignment_file = '%s_SCG_tree_cov%s_css%s.aln' % (output_prefix, minimal_cov_in_msa, min_consensus_in_msa)
    newick_tree_file = '%s_SCG_tree.newick' % (output_prefix)
    hmm_profile_sep_folder = '%s_hmm_profile_fetched' % (output_prefix)

    pwd_log_file = '%s/%s_get_SCG_tree.log' % (get_SCG_tree_wd, output_prefix)
    pwd_prodigal_output_folder = '%s/%s' % (get_SCG_tree_wd, prodigal_output_folder)
    pwd_extract_and_align_SCG_wd = '%s/%s' % (get_SCG_tree_wd, extract_and_align_SCG_wd)
    pwd_combined_alignment_file_tmp = '%s/%s' % (get_SCG_tree_wd, combined_alignment_file_tmp)
    pwd_combined_alignment_file = '%s/%s' % (get_SCG_tree_wd, combined_alignment_file)
    pwd_hmm_profile_sep_folder = '%s/%s/%s' % (get_SCG_tree_wd, extract_and_align_SCG_wd, hmm_profile_sep_folder)
    pwd_newick_tree_file = '%s/%s' % (get_SCG_tree_wd, newick_tree_file)


    # create wd
    force_create_folder(get_SCG_tree_wd)


    ######################################## run prodigal with multiprocessing #########################################

    # for report and log
    report_and_log(('Running Prodigal with %s cores for input genomes' % num_threads), pwd_log_file, keep_quiet)

    # create prodigal output folder
    force_create_folder(pwd_prodigal_output_folder)

    # get input genome list
    input_genome_file_re = '%s/*.%s' % (input_genome_folder, file_extension)
    input_genome_file_name_list = [os.path.basename(file_name) for file_name in glob.glob(input_genome_file_re)]

    # prepare arguments for prodigal_worker
    list_for_multiple_arguments_Prodigal = []
    for input_genome in input_genome_file_name_list:
        list_for_multiple_arguments_Prodigal.append([input_genome, input_genome_folder, pwd_prodigal_exe, nonmeta_mode, pwd_prodigal_output_folder])

    # run prodigal with multiprocessing
    pool = mp.Pool(processes=num_threads)
    pool.map(prodigal_worker, list_for_multiple_arguments_Prodigal)
    pool.close()
    pool.join()


    ########################################### get species tree (hmmsearch) ###########################################

    # create wd
    force_create_folder(pwd_extract_and_align_SCG_wd)

    # for report and log
    report_and_log(('Running Hmmsearch with %s cores' % num_threads), pwd_log_file, keep_quiet)

    faa_file_re = '%s/*.faa' % pwd_prodigal_output_folder
    faa_file_list = [os.path.basename(file_name) for file_name in glob.glob(faa_file_re)]
    faa_file_list = sorted(faa_file_list)

    faa_file_basename_list = []
    for faa_file in faa_file_list:
        faa_file_basename, faa_file_extension = os.path.splitext(faa_file)
        faa_file_basename_list.append(faa_file_basename)

    # prepare arguments for hmmsearch_worker
    list_for_multiple_arguments_hmmsearch = []
    for faa_file_basename in faa_file_basename_list:
        list_for_multiple_arguments_hmmsearch.append([faa_file_basename, pwd_extract_and_align_SCG_wd, pwd_hmmsearch_exe, path_to_hmm, pwd_prodigal_output_folder])

    # run hmmsearch with multiprocessing
    pool = mp.Pool(processes=num_threads)
    pool.map(hmmsearch_worker, list_for_multiple_arguments_hmmsearch)
    pool.close()
    pool.join()


    ############################################# get species tree (hmmalign) #############################################

    # for report and log
    report_and_log(('Running Hmmalign with %s cores' % num_threads), pwd_log_file, keep_quiet)

    # fetch combined hmm profiles
    force_create_folder(pwd_hmm_profile_sep_folder)
    sep_combined_hmm(path_to_hmm, pwd_hmm_profile_sep_folder, pwd_hmmfetch_exe, pwd_hmmstat_exe)

    # Call hmmalign to align all single fasta files with hmms
    files = os.listdir(pwd_extract_and_align_SCG_wd)
    fastaFiles = [i for i in files if i.endswith('.fasta')]

    # prepare arguments for hmmalign_worker
    list_for_multiple_arguments_hmmalign = []
    for fastaFile in fastaFiles:

        fastaFiles_basename = '.'.join(fastaFile.split('.')[:-1])
        list_for_multiple_arguments_hmmalign.append([fastaFiles_basename, pwd_extract_and_align_SCG_wd, pwd_hmm_profile_sep_folder, pwd_hmmalign_exe])

    # run hmmalign with multiprocessing
    pool = mp.Pool(processes=num_threads)
    pool.map(hmmalign_worker, list_for_multiple_arguments_hmmalign)
    pool.close()
    pool.join()


    ################################### get species tree (Concatenating alignments) ####################################

    # for report and log
    report_and_log('Concatenating alignments', pwd_log_file, keep_quiet)

    # concatenating the single alignments
    concatAlignment = {}
    for element in faa_file_basename_list:
        concatAlignment[element] = ''

    # Reading all single alignment files and appending them to the concatenated alignment
    files = os.listdir(pwd_extract_and_align_SCG_wd)
    fastaFiles = [i for i in files if i.endswith('.fasta')]
    for faa_file_basename in fastaFiles:
        fastaFile = pwd_extract_and_align_SCG_wd + '/' + faa_file_basename
        proteinSequence = {}
        alignmentLength = 0
        for seq_record_2 in SeqIO.parse(fastaFile, 'fasta'):
            proteinName = seq_record_2.id
            proteinSequence[proteinName] = str(seq_record_2.seq)
            alignmentLength = len(proteinSequence[proteinName])

        for element in faa_file_basename_list:
            if element in proteinSequence.keys():
                concatAlignment[element] += proteinSequence[element]
            else:
                concatAlignment[element] += '-' * alignmentLength

    # writing alignment to file
    file_out = open(pwd_combined_alignment_file_tmp, 'w')
    for element in faa_file_basename_list:
        file_out.write('>' + element + '\n' + concatAlignment[element] + '\n')
    file_out.close()

    # remove columns with low coverage and low consensus
    report_and_log(('Removing columns from concatenated alignment represented by <%s%s of genomes and with an amino acid consensus <%s%s' % (minimal_cov_in_msa, '%', min_consensus_in_msa, '%')), pwd_log_file, keep_quiet)
    remove_low_cov_and_consensus_columns(pwd_combined_alignment_file_tmp, minimal_cov_in_msa, min_consensus_in_msa, pwd_combined_alignment_file)


    ########################################### get species tree (fasttree) ############################################

    # for report and log
    report_and_log('Running FastTree', pwd_log_file, keep_quiet)

    # calling fasttree for tree calculation
    fasttree_cmd = '%s -quiet %s > %s' % (pwd_fasttree_exe, pwd_combined_alignment_file, pwd_newick_tree_file)
    os.system(fasttree_cmd)

    # for report and log
    report_and_log(('SCG tree exported to: %s' % newick_tree_file), pwd_log_file, keep_quiet)


    ############################################## remove temporary files ##############################################

    # remove temporary files
    report_and_log(('Deleting temporary files'), pwd_log_file, keep_quiet)

    os.system('rm -r %s' % pwd_combined_alignment_file_tmp)


if __name__ == '__main__':

    # initialize the options parser
    parser = argparse.ArgumentParser()

    # arguments for get_SCG_tree
    parser.add_argument('-i', required=True, help='input genome folder')
    parser.add_argument('-p', required=True, help='output prefix')
    parser.add_argument('-x', required=False, default='fasta', help='file extension')
    parser.add_argument('-nonmeta', required=False, action="store_true", help='annotate Non-metagenome-assembled genomes (Non-MAGs)')
    parser.add_argument('-t', required=False, type=int, default=1, help='number of threads, default: 1')

    args = vars(parser.parse_args())

    get_SCG_tree(args, config_dict)
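
The function above can also be driven programmatically rather than through the command-line entry point. Below is a minimal sketch (not part of the packaged file) of such a call: the args keys ('i', 'p', 'x', 't', 'nonmeta') and the config_dict keys mirror exactly what get_SCG_tree() reads, while the HMM database path and the genome folder/prefix values are placeholders taken from the usage text; it assumes BioSAK is installed, since the module imports BioSAK's config_dict at load time, and that prodigal, the HMMER tools, and fasttree are on PATH.

# Hypothetical usage sketch; values marked as placeholders must be adapted.
from TreeSAK.get_SCG_tree import get_SCG_tree

example_config = {
    'path_to_hmm': '/path/to/phylo.hmm',  # combined single-copy-gene HMM profiles (placeholder path)
    'prodigal':    'prodigal',
    'hmmsearch':   'hmmsearch',
    'hmmfetch':    'hmmfetch',
    'hmmalign':    'hmmalign',
    'hmmstat':     'hmmstat',
    'fasttree':    'fasttree',
}

# mirrors "get_SCG_tree -i genomes -p NorthSea -x fasta -t 4" from the usage text
example_args = {'i': 'genomes', 'p': 'NorthSea', 'x': 'fasta', 't': 4, 'nonmeta': False}

get_SCG_tree(example_args, example_config)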