treesak 1.53.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- TreeSAK/ALE.py +63 -0
- TreeSAK/ALE1.py +268 -0
- TreeSAK/ALE2.py +168 -0
- TreeSAK/ALE2RTC.py +30 -0
- TreeSAK/ALE3.py +205 -0
- TreeSAK/ALE4.py +636 -0
- TreeSAK/ALE5.py +210 -0
- TreeSAK/ALE6.py +401 -0
- TreeSAK/ALE7.py +126 -0
- TreeSAK/ALE_backup.py +1081 -0
- TreeSAK/AssessCVG.py +128 -0
- TreeSAK/AssessMarker.py +306 -0
- TreeSAK/AssessMarkerDeltaLL.py +257 -0
- TreeSAK/AssessMarkerPA.py +317 -0
- TreeSAK/AssessPB.py +113 -0
- TreeSAK/BMGE.jar +0 -0
- TreeSAK/BMGE.py +49 -0
- TreeSAK/C60SR4.nex +127 -0
- TreeSAK/CompareMCMC.py +138 -0
- TreeSAK/ConcateMSA.py +111 -0
- TreeSAK/ConvertMSA.py +135 -0
- TreeSAK/Dir.rb +82 -0
- TreeSAK/ExtractMarkerSeq.py +263 -0
- TreeSAK/FastRoot.py +1175 -0
- TreeSAK/FastRoot_backup.py +1122 -0
- TreeSAK/FigTree.py +34 -0
- TreeSAK/GTDB_tree.py +76 -0
- TreeSAK/GeneTree.py +142 -0
- TreeSAK/KEGG_Luo17.py +807 -0
- TreeSAK/LcaToLeaves.py +66 -0
- TreeSAK/MarkerRef2Tree.py +616 -0
- TreeSAK/MarkerRef2Tree_backup.py +628 -0
- TreeSAK/MarkerSeq2Tree.py +299 -0
- TreeSAK/MarkerSeq2Tree_backup.py +259 -0
- TreeSAK/ModifyTopo.py +116 -0
- TreeSAK/Newick_tree_plotter.py +79 -0
- TreeSAK/OMA.py +170 -0
- TreeSAK/OMA2.py +212 -0
- TreeSAK/OneLineAln.py +50 -0
- TreeSAK/PB.py +155 -0
- TreeSAK/PMSF.py +115 -0
- TreeSAK/PhyloBiAssoc.R +84 -0
- TreeSAK/PhyloBiAssoc.py +167 -0
- TreeSAK/PlotMCMC.py +41 -0
- TreeSAK/PlotMcmcNode.py +152 -0
- TreeSAK/PlotMcmcNode_old.py +252 -0
- TreeSAK/RootTree.py +101 -0
- TreeSAK/RootTreeGTDB.py +371 -0
- TreeSAK/RootTreeGTDB214.py +288 -0
- TreeSAK/RootTreeGTDB220.py +300 -0
- TreeSAK/SequentialDating.py +16 -0
- TreeSAK/SingleAleHGT.py +157 -0
- TreeSAK/SingleLinePhy.py +50 -0
- TreeSAK/SliceMSA.py +142 -0
- TreeSAK/SplitScore.py +21 -0
- TreeSAK/SplitScore1.py +177 -0
- TreeSAK/SplitScore1OMA.py +148 -0
- TreeSAK/SplitScore2.py +608 -0
- TreeSAK/TaxaCountStats.R +256 -0
- TreeSAK/TaxonTree.py +47 -0
- TreeSAK/TreeSAK_config.py +32 -0
- TreeSAK/VERSION +164 -0
- TreeSAK/VisHPD95.R +45 -0
- TreeSAK/VisHPD95.py +200 -0
- TreeSAK/__init__.py +0 -0
- TreeSAK/ale_parser.py +74 -0
- TreeSAK/ale_splitter.py +63 -0
- TreeSAK/alignment_pruner.pl +1471 -0
- TreeSAK/assessOG.py +45 -0
- TreeSAK/batch_itol.py +171 -0
- TreeSAK/catfasta2phy.py +140 -0
- TreeSAK/cogTree.py +185 -0
- TreeSAK/compare_trees.R +30 -0
- TreeSAK/compare_trees.py +255 -0
- TreeSAK/dating.py +264 -0
- TreeSAK/dating_ss.py +361 -0
- TreeSAK/deltall.py +82 -0
- TreeSAK/do_rrtc.rb +464 -0
- TreeSAK/fa2phy.py +42 -0
- TreeSAK/filter_rename_ar53.py +118 -0
- TreeSAK/format_leaf_name.py +70 -0
- TreeSAK/gap_stats.py +38 -0
- TreeSAK/get_SCG_tree.py +742 -0
- TreeSAK/get_arCOG_seq.py +97 -0
- TreeSAK/global_functions.py +222 -0
- TreeSAK/gnm_leaves.py +43 -0
- TreeSAK/iTOL.py +791 -0
- TreeSAK/iTOL_gene_tree.py +80 -0
- TreeSAK/itol_msa_stats.py +56 -0
- TreeSAK/keep_highest_rrtc.py +37 -0
- TreeSAK/koTree.py +194 -0
- TreeSAK/label_gene_tree_by_gnm.py +34 -0
- TreeSAK/label_tree.R +75 -0
- TreeSAK/label_tree.py +121 -0
- TreeSAK/mad.py +708 -0
- TreeSAK/mcmc2tree.py +58 -0
- TreeSAK/mcmcTC copy.py +92 -0
- TreeSAK/mcmcTC.py +104 -0
- TreeSAK/mcmctree_vs_reltime.R +44 -0
- TreeSAK/mcmctree_vs_reltime.py +252 -0
- TreeSAK/merge_pdf.py +32 -0
- TreeSAK/pRTC.py +56 -0
- TreeSAK/parse_mcmctree.py +198 -0
- TreeSAK/parse_reltime.py +141 -0
- TreeSAK/phy2fa.py +37 -0
- TreeSAK/plot_distruibution_th.py +165 -0
- TreeSAK/prep_mcmctree_ctl.py +92 -0
- TreeSAK/print_leaves.py +32 -0
- TreeSAK/pruneMSA.py +63 -0
- TreeSAK/recode.py +73 -0
- TreeSAK/remove_bias.R +112 -0
- TreeSAK/rename_leaves.py +78 -0
- TreeSAK/replace_clade.py +55 -0
- TreeSAK/root_with_out_group.py +84 -0
- TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
- TreeSAK/subsample_drep_gnms.py +74 -0
- TreeSAK/subset.py +69 -0
- TreeSAK/subset_tree_stupid_old_way.py +193 -0
- TreeSAK/supertree.py +330 -0
- TreeSAK/tmp_1.py +19 -0
- TreeSAK/tmp_2.py +19 -0
- TreeSAK/tmp_3.py +120 -0
- TreeSAK/tmp_4.py +43 -0
- TreeSAK/tmp_5.py +12 -0
- TreeSAK/weighted_rand.rb +23 -0
- treesak-1.53.3.data/scripts/TreeSAK +955 -0
- treesak-1.53.3.dist-info/LICENSE +674 -0
- treesak-1.53.3.dist-info/METADATA +27 -0
- treesak-1.53.3.dist-info/RECORD +131 -0
- treesak-1.53.3.dist-info/WHEEL +5 -0
- treesak-1.53.3.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import glob
|
|
3
|
+
import argparse
|
|
4
|
+
from Bio import SeqIO
|
|
5
|
+
from Bio import AlignIO
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
AssessMarkerDeltaLL_usage = '''
|
|
9
|
+
============================= AssessMarkerDeltaLL example commands =============================
|
|
10
|
+
|
|
11
|
+
Dependencies: iqtree
|
|
12
|
+
|
|
13
|
+
# example commands
|
|
14
|
+
TreeSAK AssessMarkerDeltaLL -deltall DeltaLL_stdout.txt -o op_dir -c 25-50-75-100 -mmn 20 -aln trimmed_aln_dir -jst 6 -qsub -f
|
|
15
|
+
|
|
16
|
+
================================================================================================
|
|
17
|
+
'''
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def sep_path_basename_ext(file_in):
    """Split *file_in* into (directory, base name, extension).

    A path with no directory component yields '.' as the directory, so the
    result is always usable for re-joining.
    """

    # directory part; empty means the file sits in the current directory
    dir_name, full_name = os.path.split(file_in)
    if not dir_name:
        dir_name = '.'

    # base name and (dot-prefixed) extension
    base_name, extension = os.path.splitext(full_name)

    return dir_name, base_name, extension
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def catfasta2phy(msa_dir, msa_ext, concatenated_msa_phy, partition_file):
    """Concatenate per-marker fasta alignments into one phylip alignment.

    Writes three files: a concatenated fasta (phy name + '.fasta'), its
    relaxed-phylip conversion, and a partition file giving the 1-based
    coordinate range each input alignment occupies in the concatenation.
    Genomes absent from a marker are padded with gap characters.
    """

    concatenated_msa_fasta = '%s.fasta' % concatenated_msa_phy
    sorted_msa_files = sorted(
        os.path.basename(f) for f in glob.glob('%s/*.%s' % (msa_dir, msa_ext)))

    # union of sequence ids (genome ids) across all alignments
    all_gnm_ids = set()
    for msa_file in sorted_msa_files:
        for record in SeqIO.parse('%s/%s' % (msa_dir, msa_file), 'fasta'):
            all_gnm_ids.add(record.id)
    gnm_ids_sorted = sorted(all_gnm_ids)

    # build the concatenated sequence per genome
    concat_seq_dict = {gnm: '' for gnm in gnm_ids_sorted}
    aln_len_dict = {}
    for msa_file in sorted_msa_files:
        marker_id = msa_file.split('.' + msa_ext)[0]

        seq_dict = {}
        len_set = set()
        aln_len = 0
        for record in SeqIO.parse('%s/%s' % (msa_dir, msa_file), 'fasta'):
            all_gnm_ids.add(record.id)
            seq_dict[record.id] = str(record.seq)
            len_set.add(len(record.seq))
            aln_len = len(record.seq)

        # every sequence inside one alignment must share the same length
        if len(len_set) != 1:
            print('Sequences with different length were found in %s, program exited!' % msa_file)
            exit()

        aln_len_dict[marker_id] = aln_len

        # append this marker's sequence (or a gap run) for every genome
        for gnm in gnm_ids_sorted:
            concat_seq_dict[gnm] += seq_dict.get(gnm, aln_len * '-')

    # write out the concatenated fasta
    fasta_out = open(concatenated_msa_fasta, 'w')
    for gnm in gnm_ids_sorted:
        fasta_out.write('>%s\n' % gnm)
        fasta_out.write('%s\n' % concat_seq_dict[gnm])
    fasta_out.close()

    # write out the partition file (1-based, inclusive coordinates)
    partition_out = open(partition_file, 'w')
    pos = 0
    for msa_file in sorted_msa_files:
        marker_len = aln_len_dict[msa_file.split('.' + msa_ext)[0]]
        partition_out.write('%s = %s-%s\n' % (msa_file, (pos + 1), (pos + marker_len)))
        pos += marker_len
    partition_out.close()

    # convert the concatenated fasta to relaxed phylip
    AlignIO.convert(concatenated_msa_fasta, 'fasta', concatenated_msa_phy, 'phylip-relaxed')
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def submit_js(js):
    """Submit job script *js* with qsub, running qsub from its own folder."""
    original_wd = os.getcwd()
    js_dir, js_name, js_suffix = sep_path_basename_ext(js)
    # qsub from the script's directory so scheduler output lands next to it
    os.chdir(js_dir)
    os.system('qsub %s%s' % (js_name, js_suffix))
    os.chdir(original_wd)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def AssessMarkerDeltaLL(args):
    """Rank markers by their DeltaLL metrics and prepare iqtree runs per cutoff.

    Reads a two-column (marker, value) DeltaLL stdout file, rank-scores each
    marker on both metrics, writes a summary table, then for every requested
    keep-percentage copies the top-ranked alignments, concatenates them and
    writes (optionally submits) an iqtree job script.
    """

    deltall_stdout_txt   = args['deltall']   # DeltaLL stdout (tab separated)
    op_dir               = args['o']         # output directory
    deltall_keep_pct_str = args['c']         # dash-separated keep percentages, e.g. '25-50-75-100'
    min_marker_num       = args['mmn']       # minimal number of qualified markers per cutoff
    trimmed_aln_dir      = args['aln']       # directory holding trimmed alignments (*.aln)
    js_cpu_num           = args['jst']       # threads to request in the job script
    submit_job           = args['qsub']      # submit job scripts with qsub
    force_overwrite      = args['f']         # overwrite an existing output dir

    deltall_keep_pct_list = [int(i) for i in deltall_keep_pct_str.split('-')]

    # NOTE(review): only the basename is used below; path/ext are unused.
    deltall_stdout_path, deltall_stdout_basename, deltall_stdout_ext = sep_path_basename_ext(deltall_stdout_txt)

    # create dir (refuse to clobber an existing output folder unless -f)
    if os.path.isdir(op_dir) is True:
        if force_overwrite is True:
            os.system('rm -r %s' % op_dir)
        else:
            print('output folder detected, program exited!')
            exit()
    os.system('mkdir %s' % op_dir)

    # define file name
    deltall_stdout_summary_txt = '%s/%s_summary.txt' % (op_dir, deltall_stdout_basename)

    # read in deltall_stdout_txt, skipping WARNING:/awk: noise lines.
    # NOTE(review): the scoring below assumes each marker appears on exactly
    # two lines (metric 1 then metric 2) — confirm against the producer.
    deltall_op_dict = dict()
    for each_line in open(deltall_stdout_txt):
        if not ((each_line.startswith('WARNING:')) or (each_line.startswith('awk:'))):
            each_line_split = each_line.strip().split('\t')
            marker_id = each_line_split[0]
            value = float(each_line_split[1])
            if marker_id not in deltall_op_dict:
                deltall_op_dict[marker_id] = [value]
            else:
                deltall_op_dict[marker_id].append(value)

    # assigned score to marker: take the two metric values rounded to 2 dp
    metric_1_dict = dict()
    metric_2_dict = dict()
    for each_marker in deltall_op_dict:
        metric_1_value = float("{0:.2f}".format(deltall_op_dict[each_marker][0]))
        metric_2_value = float("{0:.2f}".format(deltall_op_dict[each_marker][1]))
        metric_1_dict[each_marker] = metric_1_value
        metric_2_dict[each_marker] = metric_2_value

    # metric 1 is ranked descending (higher is better), metric 2 ascending
    metric_1_dict_sorted = {k: v for k, v in sorted(metric_1_dict.items(), key=lambda item: item[1])[::-1]}
    metric_2_dict_sorted = {k: v for k, v in sorted(metric_2_dict.items(), key=lambda item: item[1])}

    # rank position becomes the score (1 = best); ties get distinct scores
    metric_1_score_dict = dict()
    metric_1_score = 1
    for each_marker_1 in metric_1_dict_sorted:
        metric_1_score_dict[each_marker_1] = metric_1_score
        metric_1_score += 1

    metric_2_score_dict = dict()
    metric_2_score = 1
    for each_marker_2 in metric_2_dict_sorted:
        metric_2_score_dict[each_marker_2] = metric_2_score
        metric_2_score += 1

    # overall score = sum of the two rank scores (lower is better)
    overall_score_dict = dict()
    for each_marker in deltall_op_dict:
        metric_score_1 = metric_1_score_dict[each_marker]
        metric_score_2 = metric_2_score_dict[each_marker]
        metric_score_overall = metric_score_1 + metric_score_2
        overall_score_dict[each_marker] = metric_score_overall

    overall_score_dict_sorted = {k: v for k, v in sorted(overall_score_dict.items(), key=lambda item: item[1])}
    marker_list_sorted_by_deltall = [k for k, v in sorted(overall_score_dict.items(), key=lambda item: item[1])]

    # write out summary_txt (one row per marker, best overall score first)
    summary_txt_handle = open(deltall_stdout_summary_txt, 'w')
    summary_txt_handle.write('Marker\tmetric1\tmetric1_score\tmetric2\tmetric2_score\toverall_score\n')
    for each_marker in overall_score_dict_sorted:
        metric_value_1 = metric_1_dict[each_marker]
        metric_value_2 = metric_2_dict[each_marker]
        metric_score_1 = metric_1_score_dict[each_marker]
        metric_score_2 = metric_2_score_dict[each_marker]
        metric_score_overall = overall_score_dict_sorted[each_marker]
        summary_txt_handle.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (each_marker, metric_value_1, metric_score_1, metric_value_2, metric_score_2, metric_score_overall))
    summary_txt_handle.close()

    # get qualified marker list for each keep-percentage cutoff
    for each_keep_pct in deltall_keep_pct_list:
        marker_num_to_keep = round(len(marker_list_sorted_by_deltall)*each_keep_pct/100)
        markers_to_keep_id_list = marker_list_sorted_by_deltall[:marker_num_to_keep]

        if marker_num_to_keep < min_marker_num:
            print('Ignored DeltaLL cutoff at %s , the number of qualified markers (%s) less than %s' % (each_keep_pct, marker_num_to_keep, min_marker_num))
        else:
            # per-cutoff output paths; all share the stdout basename prefix
            pwd_aln_dir                     = '%s/%s_DeltaLL_%s_trimmed_aln' % (op_dir, deltall_stdout_basename.split('_DeltaLL_stdout')[0], each_keep_pct)
            pwd_aln_concatenated            = '%s/%s_DeltaLL_%s_concatenated.phy' % (op_dir, deltall_stdout_basename.split('_DeltaLL_stdout')[0], each_keep_pct)
            pwd_aln_concatenated_partitions = '%s/%s_DeltaLL_%s_concatenated_partition.txt' % (op_dir, deltall_stdout_basename.split('_DeltaLL_stdout')[0], each_keep_pct)
            pwd_iqtree_guide_tree_wd        = '%s/%s_DeltaLL_%s_iqtree_guide_tree' % (op_dir, deltall_stdout_basename.split('_DeltaLL_stdout')[0], each_keep_pct)
            pwd_iqtree_c60_pmsf_wd          = '%s/%s_DeltaLL_%s_iqtree_C60_PMSF' % (op_dir, deltall_stdout_basename.split('_DeltaLL_stdout')[0], each_keep_pct)
            pwd_iqtree_js                   = '%s/js_%s_DeltaLL_%s_iqtree.sh' % (op_dir, deltall_stdout_basename.split('_DeltaLL_stdout')[0], each_keep_pct)

            # create dir
            os.mkdir(pwd_aln_dir)

            # copy msa of qualified markers
            for each_marker in markers_to_keep_id_list:
                pwd_marker_aln = '%s/%s.aln' % (trimmed_aln_dir, each_marker)
                cp_cmd = 'cp %s %s/' % (pwd_marker_aln, pwd_aln_dir)
                os.system(cp_cmd)

            # concatenate msa
            catfasta2phy(pwd_aln_dir, 'aln', pwd_aln_concatenated, pwd_aln_concatenated_partitions)

            # create dir
            os.mkdir(pwd_iqtree_guide_tree_wd)
            os.mkdir(pwd_iqtree_c60_pmsf_wd)

            # run iqtree: guide tree under LG first, then C60 PMSF from it
            get_guide_tree_cmd = 'iqtree -s ../%s_DeltaLL_%s_concatenated.phy --prefix guide_tree --seqtype AA -m LG -T %s -B 1000 --alrt 1000 --quiet' % (deltall_stdout_basename.split('_DeltaLL_stdout')[0], each_keep_pct, js_cpu_num)
            get_c60_tree_cmd = 'iqtree -s ../%s_DeltaLL_%s_concatenated.phy --prefix concatenated --seqtype AA -m LG+G+F+C60 -T %s -B 1000 --alrt 1000 --quiet -ft ../%s_DeltaLL_%s_iqtree_guide_tree/guide_tree.treefile' % (deltall_stdout_basename.split('_DeltaLL_stdout')[0], each_keep_pct, js_cpu_num, deltall_stdout_basename.split('_DeltaLL_stdout')[0], each_keep_pct)

            # write job script (SLURM header; submitted via qsub below)
            with open(pwd_iqtree_js, 'w') as pwd_iqtree_js_handle:
                pwd_iqtree_js_handle.write('#!/bin/bash\n#SBATCH --ntasks 1\n#SBATCH --cpus-per-task %s\n\n' % js_cpu_num)
                pwd_iqtree_js_handle.write('cd %s/%s\n' % (os.getcwd(), pwd_iqtree_guide_tree_wd))
                pwd_iqtree_js_handle.write(get_guide_tree_cmd + '\n\n')
                pwd_iqtree_js_handle.write('cd %s/%s\n' % (os.getcwd(), pwd_iqtree_c60_pmsf_wd))
                pwd_iqtree_js_handle.write(get_c60_tree_cmd + '\n')

            if submit_job is True:
                print(pwd_iqtree_js)
                submit_js(pwd_iqtree_js)
            else:
                print('Job script for running iqtree exported to %s' % pwd_iqtree_js)

    # prepare files for performing dating (not implemented beyond the message)
    print('Preparing files for performing dating')
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
if __name__ == '__main__':

    # initialize the options parser
    # NOTE(review): AssessMarkerDeltaLL_usage is defined above but never
    # passed to the parser (e.g. via usage=/description=) — confirm intended.
    parser = argparse.ArgumentParser()
    parser.add_argument('-deltall', required=True, help='DeltaLL stdout')
    parser.add_argument('-o', required=True, help='output dir')
    parser.add_argument('-c', required=False, default='25-50-75-100', help='cutoffs, default: 25-50-75-100')
    parser.add_argument('-mmn', required=False, default=20, type=int, help='minimal marker number, default: 20')
    parser.add_argument('-aln', required=True, help='faa file dir')
    parser.add_argument('-jst', required=False, default='6', help='threads to request in job script, for running iqtree')
    parser.add_argument('-qsub', required=False, action="store_true", help='submit job scripts')
    parser.add_argument('-f', required=False, action="store_true", help='force overwrite')
    # pass options as a plain dict, matching the AssessMarkerDeltaLL signature
    args = vars(parser.parse_args())
    AssessMarkerDeltaLL(args)
|
|
@@ -0,0 +1,317 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import glob
|
|
3
|
+
import argparse
|
|
4
|
+
from Bio import SeqIO
|
|
5
|
+
from Bio import AlignIO
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
AssessMarkerPA_usage = '''
|
|
9
|
+
=========================== AssessMarkerPA example commands ===========================
|
|
10
|
+
|
|
11
|
+
BioSAK AssessMarkerPA -ta trimmed_aln -tax aln -aa faa_files -aax faa -g gnm_group.txt -c 25-50-75-100 -o s10_assess_marker_PA
|
|
12
|
+
|
|
13
|
+
Note
|
|
14
|
+
1. Extra genomes in gnm_metadata.txt won't affect assessment results.
|
|
15
|
+
2. Genomes can not be found in gnm_metadata.txt will trigger an error.
|
|
16
|
+
3. Alignments in {trimmed_aln_dir} need to be trimmed before assessment
|
|
17
|
+
4. Sequences in MSAs need to be named by genome id.
|
|
18
|
+
|
|
19
|
+
=======================================================================================
|
|
20
|
+
'''
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def sep_path_basename_ext(file_in):
    """Return (directory, base name, extension) for *file_in*; '.' when no dir."""
    folder = os.path.dirname(file_in) or '.'
    stem, suffix = os.path.splitext(os.path.basename(file_in))
    return folder, stem, suffix
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def catfasta2phy(msa_dir, msa_ext, concatenated_msa_phy, partition_file):
    """Merge single-marker fasta alignments into one relaxed-phylip MSA.

    Side effects: writes '<phy>.fasta' (the concatenated fasta), the phylip
    file itself, and a partition file listing each marker's 1-based
    coordinate range in the concatenation. Genomes missing from a marker
    are padded with gaps.
    """

    fasta_out_name = '%s.fasta' % concatenated_msa_phy
    aln_files = sorted(os.path.basename(p) for p in glob.glob('%s/*.%s' % (msa_dir, msa_ext)))

    # union of genome ids over every alignment
    genome_ids = set()
    for aln in aln_files:
        for rec in SeqIO.parse('%s/%s' % (msa_dir, aln), 'fasta'):
            genome_ids.add(rec.id)
    genomes_sorted = sorted(genome_ids)

    concatenated = dict((g, '') for g in genomes_sorted)
    marker_lengths = dict()
    for aln in aln_files:
        marker = aln.split('.' + msa_ext)[0]

        seqs = dict()
        observed_lengths = set()
        length = 0
        for rec in SeqIO.parse('%s/%s' % (msa_dir, aln), 'fasta'):
            genome_ids.add(rec.id)
            seqs[rec.id] = str(rec.seq)
            observed_lengths.add(len(rec.seq))
            length = len(rec.seq)

        # every alignment must be rectangular (one shared sequence length)
        if len(observed_lengths) != 1:
            print('Sequences with different length were found in %s, program exited!' % aln)
            exit()

        marker_lengths[marker] = length

        # gap-pad genomes absent from this marker
        for g in genomes_sorted:
            concatenated[g] += seqs.get(g, length * '-')

    # concatenated fasta
    with open(fasta_out_name, 'w') as fh:
        for g in genomes_sorted:
            fh.write('>%s\n' % g)
            fh.write('%s\n' % concatenated[g])

    # partition file (1-based, inclusive coordinates)
    with open(partition_file, 'w') as fh:
        offset = 0
        for aln in aln_files:
            span = marker_lengths[aln.split('.' + msa_ext)[0]]
            fh.write('%s = %s-%s\n' % (aln, (offset + 1), (offset + span)))
            offset += span

    # relaxed phylip for downstream tools
    AlignIO.convert(fasta_out_name, 'fasta', concatenated_msa_phy, 'phylip-relaxed')
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def AssessMarkerPA(args):
    """Assess marker presence/absence (PA) across genome groups.

    For every trimmed alignment, computes the percentage of each genome
    group's members present in it, flags markers that pass each cutoff in
    all groups, concatenates the qualified alignments per cutoff, writes
    iqtree job scripts, and exports an iTOL binary-annotation file.
    """

    trimmed_aln_dir   = args['ta']    # dir of trimmed alignments
    trimmed_aln_ext   = args['tax']   # their extension, e.g. 'aln'
    faa_file_dir      = args['aa']    # dir of per-genome faa files
    faa_file_ext      = args['aax']   # faa extension
    gnm_group_txt     = args['g']     # genome -> group, tab separated
    cutoff_str        = args['c']     # dash-separated presence cutoffs
    op_dir            = args['o']     # output dir
    force_overwriting = args['f']     # overwrite existing output dir
    #catfasta2phyml_pl = args['pl']
    js_cpu_num        = args['jst']   # threads to request in job scripts

    # get gnm id list: genome ids are the basenames of the faa files
    faa_file_re = '%s/*.%s' % (faa_file_dir, faa_file_ext)
    faa_file_list = [os.path.basename(file_name) for file_name in glob.glob(faa_file_re)]
    gnm_set = set()
    for each_faa_file in faa_file_list:
        faa_path, faa_basename, faa_ext = sep_path_basename_ext(each_faa_file)
        gnm_set.add(faa_basename)

    # read in genome metadata; genomes not in gnm_set are silently skipped
    group_to_gnm_dict = dict()
    group_to_gnm_num_dict = dict()
    gnm_to_group_dict = dict()
    for each_gnm in open(gnm_group_txt):
        each_gnm_split = each_gnm.strip().split('\t')
        gnm_id = each_gnm_split[0]
        domain_name = each_gnm_split[1]

        if gnm_id in gnm_set:
            gnm_to_group_dict[gnm_id] = domain_name

            if domain_name not in group_to_gnm_num_dict:
                group_to_gnm_num_dict[domain_name] = 1
            else:
                group_to_gnm_num_dict[domain_name] += 1

            if domain_name not in group_to_gnm_dict:
                group_to_gnm_dict[domain_name] = {gnm_id}
            else:
                group_to_gnm_dict[domain_name].add(gnm_id)

    group_id_list_sorted = sorted(list(group_to_gnm_dict.keys()))

    # exit program if group information is missing for any genome
    gnms_without_group_info = set()
    for gnm in gnm_set:
        if gnm not in gnm_to_group_dict:
            gnms_without_group_info.add(gnm)

    if len(gnms_without_group_info) > 0:
        print('Group information for the following genomes are missing from %s, program exited!' % gnm_group_txt)
        print(','.join(gnms_without_group_info))
        print('Group information for the above genomes are missing from %s, program exited!' % gnm_group_txt)
        exit()

    # create folder (refuse to clobber unless -f)
    if force_overwriting is True:
        if os.path.isdir(op_dir) is True:
            os.system('rm -r %s' % op_dir)
    else:
        if os.path.isdir(op_dir) is True:
            print('Output folder already exist, program exited!')
            exit()
    os.system('mkdir %s' % op_dir)

    # read in provided cutoffs and define output file names
    present_pct_cutoff_list = [int(i) for i in cutoff_str.split('-')]
    assess_summary_1_txt = '%s/assessment_PA.txt' % op_dir
    assess_summary_2_txt = '%s/assessment_PA_summary.txt' % op_dir
    itol_binary_txt = '%s/assessment_PA_iTOL_binary.txt' % op_dir

    trimmed_aln_file_re = '%s/*.%s' % (trimmed_aln_dir, trimmed_aln_ext)
    trimmed_aln_file_list = [os.path.basename(file_name) for file_name in glob.glob(trimmed_aln_file_re)]

    # summary 1: per-group presence percentage; summary 2: pass/fail per cutoff
    assess_summary_1_txt_handle = open(assess_summary_1_txt, 'w')
    assess_summary_1_txt_handle.write('Marker\t%s\n' % '\t'.join([str(i) for i in group_id_list_sorted]))
    assess_summary_2_txt_handle = open(assess_summary_2_txt, 'w')
    assess_summary_2_txt_handle.write('Marker\t%s\n' % '\t'.join([str(i) for i in present_pct_cutoff_list]))
    cutoff_to_qualified_marker_dict = dict()
    gnm_to_identified_marker_dict = dict()
    marker_id_list = []
    for each_aln in trimmed_aln_file_list:

        marker_id = each_aln.split(('.%s' % trimmed_aln_ext))[0]
        marker_id_list.append(marker_id)
        pwd_aln = '%s/%s' % (trimmed_aln_dir, each_aln)

        # count, per group, how many member genomes carry this marker
        current_marker_num_by_group_dict = dict()
        for each_seq in SeqIO.parse(pwd_aln, 'fasta'):
            gnm_id = each_seq.id

            # get genome to marker dist
            if gnm_id not in gnm_to_identified_marker_dict:
                gnm_to_identified_marker_dict[gnm_id] = {marker_id}
            else:
                gnm_to_identified_marker_dict[gnm_id].add(marker_id)

            if gnm_id in gnm_to_group_dict:
                gnm_group = gnm_to_group_dict[gnm_id]
                if gnm_group not in current_marker_num_by_group_dict:
                    current_marker_num_by_group_dict[gnm_group] = 1
                else:
                    current_marker_num_by_group_dict[gnm_group] += 1
            else:
                print('Not all genomes used to generate the MSA being found in -aa, program exited!')
                exit()

        # write out assess_summary_1_txt (presence pct per group, 2 dp)
        pct_list = []
        for each_grp in group_id_list_sorted:
            grp_pct = current_marker_num_by_group_dict.get(each_grp, 0)*100/group_to_gnm_num_dict[each_grp]
            grp_pct = float("{0:.2f}".format(grp_pct))
            pct_list.append(grp_pct)
        assess_summary_1_txt_handle.write('%s\t%s\n' % (marker_id, '\t'.join([str(i) for i in pct_list])))

        # write out assess_summary_2_txt: a marker passes a cutoff only if
        # every group meets it
        assess_list = []
        for each_cutoff in present_pct_cutoff_list:

            good_marker = True
            for each_pct in pct_list:
                if each_pct < each_cutoff:
                    good_marker = False

            # NOTE(review): the first marker processed is added to the
            # qualified set for each cutoff even when good_marker is False —
            # initializing with set() instead of {marker_id} looks intended;
            # confirm before relying on the 'Total' row.
            if each_cutoff not in cutoff_to_qualified_marker_dict:
                cutoff_to_qualified_marker_dict[each_cutoff] = {marker_id}

            if good_marker is True:
                assess_list.append('1')
                cutoff_to_qualified_marker_dict[each_cutoff].add(marker_id)
            else:
                assess_list.append('0')
        assess_summary_2_txt_handle.write('%s\t%s\n' % (marker_id, '\t'.join(assess_list)))

    # write out total in assess_summary_2_txt
    total_stats_list = [str(len(cutoff_to_qualified_marker_dict[each_c])) for each_c in present_pct_cutoff_list]
    assess_summary_2_txt_handle.write('Total\t%s\n' % ('\t'.join(total_stats_list)))
    assess_summary_1_txt_handle.close()
    assess_summary_2_txt_handle.close()

    # copy alignments of qualified marker to corresponding folders
    for each_cutoff in cutoff_to_qualified_marker_dict:
        qualified_marker_set = cutoff_to_qualified_marker_dict[each_cutoff]

        qualified_marker_phy = 'qualified_marker_PA_%s_concatenated.phy' % each_cutoff
        pwd_qualified_marker_dir = '%s/qualified_marker_PA_%s' % (op_dir, each_cutoff)
        pwd_qualified_marker_id_txt = '%s/qualified_marker_PA_%s_id.txt' % (op_dir, each_cutoff)
        pwd_qualified_marker_phy = '%s/qualified_marker_PA_%s_concatenated.phy' % (op_dir, each_cutoff)
        pwd_qualified_marker_partition = '%s/qualified_marker_PA_%s_concatenated_partition.txt' % (op_dir, each_cutoff)
        pwd_js_iqtree = '%s/js_iqtree_PA_%s.sh' % (op_dir, each_cutoff)

        os.system('mkdir %s' % pwd_qualified_marker_dir)
        for each_marker in qualified_marker_set:
            pwd_marker_aln = '%s/%s.%s' % (trimmed_aln_dir, each_marker, trimmed_aln_ext)
            cp_cmd = 'cp %s %s/' % (pwd_marker_aln, pwd_qualified_marker_dir)
            os.system(cp_cmd)

        # write out id
        with open(pwd_qualified_marker_id_txt, 'w') as pwd_qualified_marker_id_txt_handle:
            pwd_qualified_marker_id_txt_handle.write('%s\n' % '\n'.join(qualified_marker_set))

        # concatenate qualified alignments (perl implementation kept for reference)
        #catfasta2phyml_cmd = 'perl %s --sequential --concatenate %s/*.aln > %s 2> %s' % (catfasta2phyml_pl, pwd_qualified_marker_dir, pwd_qualified_marker_phy, pwd_qualified_marker_partition)
        #print('running: ' + catfasta2phyml_cmd)
        #os.system(catfasta2phyml_cmd)
        catfasta2phy(pwd_qualified_marker_dir, 'aln', pwd_qualified_marker_phy, pwd_qualified_marker_partition)

        # write out iqtree js: guide tree under LG, then C60 PMSF from it
        guide_tree_dir = 'qualified_marker_PA_%s_guide_tree' % each_cutoff
        iqtree_C60_PMSF_dir = 'qualified_marker_PA_%s_iqtree_C60_PMSF' % each_cutoff
        pwd_guide_tree_dir = '%s/%s' % (op_dir, guide_tree_dir)
        pwd_iqtree_C60_PMSF_dir = '%s/%s' % (op_dir, iqtree_C60_PMSF_dir)

        with open(pwd_js_iqtree, 'w') as pwd_js_iqtree_handle:
            pwd_js_iqtree_handle.write('#!/bin/bash\n#SBATCH --ntasks 1\n#SBATCH --cpus-per-task %s\n' % js_cpu_num)
            pwd_js_iqtree_handle.write('mkdir %s/%s\n' % (os.getcwd(), pwd_guide_tree_dir))
            pwd_js_iqtree_handle.write('cd %s/%s\n' % (os.getcwd(), pwd_guide_tree_dir))
            pwd_js_iqtree_handle.write('iqtree -s ../%s --prefix guide_tree --seqtype AA -m LG -T %s -B 1000 --alrt 1000\n' % (qualified_marker_phy, js_cpu_num))
            pwd_js_iqtree_handle.write('mkdir %s/%s\n' % (os.getcwd(), pwd_iqtree_C60_PMSF_dir))
            pwd_js_iqtree_handle.write('cd %s/%s\n' % (os.getcwd(), pwd_iqtree_C60_PMSF_dir))
            pwd_js_iqtree_handle.write('iqtree -s ../%s --prefix concatenated --seqtype AA -m LG+G+F+C60 -T %s -B 1000 --alrt 1000 -ft ../%s/guide_tree.treefile\n' % (qualified_marker_phy, js_cpu_num, guide_tree_dir))

    # write out iTOL file: one binary field per marker, '1' present, '-1' absent
    itol_binary_txt_handle = open(itol_binary_txt, 'w')
    itol_binary_txt_handle.write('DATASET_BINARY\n\nSEPARATOR TAB\nDATASET_LABEL\tlabel1\nCOLOR\t#85C1E9\n')
    itol_binary_txt_handle.write('SHOW_LABELS\t1\nLABEL_ROTATION\t45\nLABEL_SHIFT\t5\n')
    itol_binary_txt_handle.write('FIELD_LABELS\t%s\n' % '\t'.join(sorted(marker_id_list)))
    itol_binary_txt_handle.write('FIELD_SHAPES\t%s\n' % '\t'.join(['1']*len(marker_id_list)))
    itol_binary_txt_handle.write('\nDATA\n')
    for each_g in gnm_to_identified_marker_dict:
        g_identified_marker_set = gnm_to_identified_marker_dict[each_g]

        pa_list = []
        for each_m in sorted(marker_id_list):
            if each_m in g_identified_marker_set:
                pa_list.append('1')
            else:
                pa_list.append('-1')
        itol_binary_txt_handle.write('%s\t%s\n' % (each_g, '\t'.join(pa_list)))
    itol_binary_txt_handle.close()

    print('Assessment results exported to:\n%s\n%s' % (assess_summary_1_txt, assess_summary_2_txt))
    print('Done!')
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
if __name__ == '__main__':

    # initialize the options parser
    # NOTE(review): AssessMarkerPA_usage is defined above but never passed to
    # the parser; also '-qsub' is parsed here but not read by AssessMarkerPA —
    # confirm both are intended.
    parser = argparse.ArgumentParser()
    parser.add_argument('-ta', required=True, help='trimmed alignments')
    parser.add_argument('-tax', required=True, help='extension of trimmed alignments')
    parser.add_argument('-aa', required=True, help='faa file dir')
    parser.add_argument('-aax', required=True, help='faa file ext')
    parser.add_argument('-g', required=True, help='genome group')
    parser.add_argument('-c', required=False, default='50-75-100', help='cutoffs, default: 50-75-100')
    parser.add_argument('-o', required=True, help='output dir')
    parser.add_argument('-f', required=False, action="store_true", help='force overwrite existing output folder')
    #parser.add_argument('-pl', required=True, help='path to catfasta2phyml.pl')
    parser.add_argument('-jst', required=False, default='3', help='threads to request in job script')
    parser.add_argument('-qsub', required=False, action="store_true", help='submit job script')
    # pass options as a plain dict, matching the AssessMarkerPA signature
    args = vars(parser.parse_args())
    AssessMarkerPA(args)
|