treesak 1.53.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- TreeSAK/ALE.py +63 -0
- TreeSAK/ALE1.py +268 -0
- TreeSAK/ALE2.py +168 -0
- TreeSAK/ALE2RTC.py +30 -0
- TreeSAK/ALE3.py +205 -0
- TreeSAK/ALE4.py +636 -0
- TreeSAK/ALE5.py +210 -0
- TreeSAK/ALE6.py +401 -0
- TreeSAK/ALE7.py +126 -0
- TreeSAK/ALE_backup.py +1081 -0
- TreeSAK/AssessCVG.py +128 -0
- TreeSAK/AssessMarker.py +306 -0
- TreeSAK/AssessMarkerDeltaLL.py +257 -0
- TreeSAK/AssessMarkerPA.py +317 -0
- TreeSAK/AssessPB.py +113 -0
- TreeSAK/BMGE.jar +0 -0
- TreeSAK/BMGE.py +49 -0
- TreeSAK/C60SR4.nex +127 -0
- TreeSAK/CompareMCMC.py +138 -0
- TreeSAK/ConcateMSA.py +111 -0
- TreeSAK/ConvertMSA.py +135 -0
- TreeSAK/Dir.rb +82 -0
- TreeSAK/ExtractMarkerSeq.py +263 -0
- TreeSAK/FastRoot.py +1175 -0
- TreeSAK/FastRoot_backup.py +1122 -0
- TreeSAK/FigTree.py +34 -0
- TreeSAK/GTDB_tree.py +76 -0
- TreeSAK/GeneTree.py +142 -0
- TreeSAK/KEGG_Luo17.py +807 -0
- TreeSAK/LcaToLeaves.py +66 -0
- TreeSAK/MarkerRef2Tree.py +616 -0
- TreeSAK/MarkerRef2Tree_backup.py +628 -0
- TreeSAK/MarkerSeq2Tree.py +299 -0
- TreeSAK/MarkerSeq2Tree_backup.py +259 -0
- TreeSAK/ModifyTopo.py +116 -0
- TreeSAK/Newick_tree_plotter.py +79 -0
- TreeSAK/OMA.py +170 -0
- TreeSAK/OMA2.py +212 -0
- TreeSAK/OneLineAln.py +50 -0
- TreeSAK/PB.py +155 -0
- TreeSAK/PMSF.py +115 -0
- TreeSAK/PhyloBiAssoc.R +84 -0
- TreeSAK/PhyloBiAssoc.py +167 -0
- TreeSAK/PlotMCMC.py +41 -0
- TreeSAK/PlotMcmcNode.py +152 -0
- TreeSAK/PlotMcmcNode_old.py +252 -0
- TreeSAK/RootTree.py +101 -0
- TreeSAK/RootTreeGTDB.py +371 -0
- TreeSAK/RootTreeGTDB214.py +288 -0
- TreeSAK/RootTreeGTDB220.py +300 -0
- TreeSAK/SequentialDating.py +16 -0
- TreeSAK/SingleAleHGT.py +157 -0
- TreeSAK/SingleLinePhy.py +50 -0
- TreeSAK/SliceMSA.py +142 -0
- TreeSAK/SplitScore.py +21 -0
- TreeSAK/SplitScore1.py +177 -0
- TreeSAK/SplitScore1OMA.py +148 -0
- TreeSAK/SplitScore2.py +608 -0
- TreeSAK/TaxaCountStats.R +256 -0
- TreeSAK/TaxonTree.py +47 -0
- TreeSAK/TreeSAK_config.py +32 -0
- TreeSAK/VERSION +164 -0
- TreeSAK/VisHPD95.R +45 -0
- TreeSAK/VisHPD95.py +200 -0
- TreeSAK/__init__.py +0 -0
- TreeSAK/ale_parser.py +74 -0
- TreeSAK/ale_splitter.py +63 -0
- TreeSAK/alignment_pruner.pl +1471 -0
- TreeSAK/assessOG.py +45 -0
- TreeSAK/batch_itol.py +171 -0
- TreeSAK/catfasta2phy.py +140 -0
- TreeSAK/cogTree.py +185 -0
- TreeSAK/compare_trees.R +30 -0
- TreeSAK/compare_trees.py +255 -0
- TreeSAK/dating.py +264 -0
- TreeSAK/dating_ss.py +361 -0
- TreeSAK/deltall.py +82 -0
- TreeSAK/do_rrtc.rb +464 -0
- TreeSAK/fa2phy.py +42 -0
- TreeSAK/filter_rename_ar53.py +118 -0
- TreeSAK/format_leaf_name.py +70 -0
- TreeSAK/gap_stats.py +38 -0
- TreeSAK/get_SCG_tree.py +742 -0
- TreeSAK/get_arCOG_seq.py +97 -0
- TreeSAK/global_functions.py +222 -0
- TreeSAK/gnm_leaves.py +43 -0
- TreeSAK/iTOL.py +791 -0
- TreeSAK/iTOL_gene_tree.py +80 -0
- TreeSAK/itol_msa_stats.py +56 -0
- TreeSAK/keep_highest_rrtc.py +37 -0
- TreeSAK/koTree.py +194 -0
- TreeSAK/label_gene_tree_by_gnm.py +34 -0
- TreeSAK/label_tree.R +75 -0
- TreeSAK/label_tree.py +121 -0
- TreeSAK/mad.py +708 -0
- TreeSAK/mcmc2tree.py +58 -0
- TreeSAK/mcmcTC copy.py +92 -0
- TreeSAK/mcmcTC.py +104 -0
- TreeSAK/mcmctree_vs_reltime.R +44 -0
- TreeSAK/mcmctree_vs_reltime.py +252 -0
- TreeSAK/merge_pdf.py +32 -0
- TreeSAK/pRTC.py +56 -0
- TreeSAK/parse_mcmctree.py +198 -0
- TreeSAK/parse_reltime.py +141 -0
- TreeSAK/phy2fa.py +37 -0
- TreeSAK/plot_distruibution_th.py +165 -0
- TreeSAK/prep_mcmctree_ctl.py +92 -0
- TreeSAK/print_leaves.py +32 -0
- TreeSAK/pruneMSA.py +63 -0
- TreeSAK/recode.py +73 -0
- TreeSAK/remove_bias.R +112 -0
- TreeSAK/rename_leaves.py +78 -0
- TreeSAK/replace_clade.py +55 -0
- TreeSAK/root_with_out_group.py +84 -0
- TreeSAK/run_TaxaCountStats_R_s1.py +455 -0
- TreeSAK/subsample_drep_gnms.py +74 -0
- TreeSAK/subset.py +69 -0
- TreeSAK/subset_tree_stupid_old_way.py +193 -0
- TreeSAK/supertree.py +330 -0
- TreeSAK/tmp_1.py +19 -0
- TreeSAK/tmp_2.py +19 -0
- TreeSAK/tmp_3.py +120 -0
- TreeSAK/tmp_4.py +43 -0
- TreeSAK/tmp_5.py +12 -0
- TreeSAK/weighted_rand.rb +23 -0
- treesak-1.53.3.data/scripts/TreeSAK +955 -0
- treesak-1.53.3.dist-info/LICENSE +674 -0
- treesak-1.53.3.dist-info/METADATA +27 -0
- treesak-1.53.3.dist-info/RECORD +131 -0
- treesak-1.53.3.dist-info/WHEEL +5 -0
- treesak-1.53.3.dist-info/top_level.txt +1 -0
TreeSAK/compare_trees.py
ADDED
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import glob
|
|
3
|
+
import argparse
|
|
4
|
+
from ete3 import Tree
|
|
5
|
+
import multiprocessing as mp
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
compare_trees_usage = '''
|
|
9
|
+
======================== compare_trees example command ========================
|
|
10
|
+
|
|
11
|
+
TreeSAK compare_trees -t1 tree_1.newick -t2 tree_2.newick -o op_dir
|
|
12
|
+
TreeSAK compare_trees -t1 tree_dir -t2 tree_dir -tx newick -dm -t 12 -o op_dir
|
|
13
|
+
|
|
14
|
+
===============================================================================
|
|
15
|
+
'''
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def sep_path_basename_ext(file_in):
    """Split a file path into its directory, basename and extension.

    Returns a 3-tuple ``(directory, basename, extension)``. The directory is
    reported as '.' when ``file_in`` carries no directory component, and the
    extension keeps its leading dot (it is '' when the name has none).
    """
    directory, leaf_name = os.path.split(file_in)
    stem, suffix = os.path.splitext(leaf_name)

    # a bare file name is taken to live in the current folder
    return ('.' if directory == '' else directory), stem, suffix
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def check_numeric(str_in):
    """Return True when float() accepts ``str_in``, otherwise False.

    Only ValueError is interpreted as "not numeric"; any other exception
    raised by float() propagates to the caller.
    """
    try:
        float(str_in)
    except ValueError:
        return False
    return True
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def parse_mantel_stats(mantel_stats_txt):
    """Extract the Mantel statistic r from a Mantel test report.

    The file is scanned for lines containing 'Mantel statistic r: ' and the
    text following that marker is returned as a string (not converted to a
    number). If several lines match, the last one wins. 'na' is returned when
    no line carries a value.

    mantel_stats_txt: path to the text file holding the test output.
    """
    marker = 'Mantel statistic r: '
    mantel_similarity = 'na'

    # the context manager guarantees the handle is closed, even on error
    with open(mantel_stats_txt) as stats_handle:
        for each_line in stats_handle:
            if marker in each_line:
                # split before stripping, so a marker line without any value
                # yields '' instead of raising IndexError
                value = each_line.split(marker)[1].strip()
                if value != '':
                    mantel_similarity = value

    return mantel_similarity
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def get_matrix(query_tree_list, subject_tree_list, mantel_stats_dir, write_out_dm, output_matrix, output_matrix_distance):
    """Collect pairwise Mantel statistics into tab-separated matrices.

    query_tree_list:        row labels (tree basenames).
    subject_tree_list:      column labels (tree basenames).
    mantel_stats_dir:       folder holding '<a>_vs_<b>_mantel_stats.txt' files.
    write_out_dm:           the distance matrix is written only if this is True.
    output_matrix:          path of the similarity matrix to write.
    output_matrix_distance: path of the distance matrix to write.

    Cells without a usable statistic are written as 'na'. Distances are
    1 - similarity, rounded to four decimals, with an exact zero written as '0'.
    """
    header_line_str = '\t' + '\t'.join(subject_tree_list) + '\n'

    similarity_rows = []
    distance_rows = []
    for each_qt in query_tree_list:

        similarity_row = [each_qt]
        distance_row = [each_qt]
        for each_st in subject_tree_list:

            # the result of a pair may have been stored under either name order
            qt_vs_st_mantel_stats = '%s/%s_vs_%s_mantel_stats.txt' % (mantel_stats_dir, each_qt, each_st)
            st_vs_qt_mantel_stats = '%s/%s_vs_%s_mantel_stats.txt' % (mantel_stats_dir, each_st, each_qt)

            tree_similarity = 'na'
            if os.path.isfile(qt_vs_st_mantel_stats):
                tree_similarity = parse_mantel_stats(qt_vs_st_mantel_stats)
            # when both files exist, the reversed-order file takes precedence
            if os.path.isfile(st_vs_qt_mantel_stats):
                tree_similarity = parse_mantel_stats(st_vs_qt_mantel_stats)
            similarity_row.append(tree_similarity)

            if check_numeric(tree_similarity):
                in_distance = float('{0:.4f}'.format(1 - float(tree_similarity)))
                distance_row.append('0' if in_distance == 0 else str(in_distance))
            else:
                distance_row.append('na')

        similarity_rows.append(similarity_row)
        distance_rows.append(distance_row)

    # context managers close the handles even if a write fails
    with open(output_matrix, 'w') as output_matrix_handle:
        output_matrix_handle.write(header_line_str)
        for each_row in similarity_rows:
            output_matrix_handle.write('\t'.join(each_row) + '\n')

    # write out distance matrix
    if write_out_dm is True:
        with open(output_matrix_distance, 'w') as output_matrix_distance_handle:
            output_matrix_distance_handle.write(header_line_str)
            for each_row in distance_rows:
                output_matrix_distance_handle.write('\t'.join(each_row) + '\n')
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def compare_trees_worker(arg_list):
    """Run the Mantel test for one pair of trees.

    arg_list holds, in order: the path of the compare_trees R script, the two
    tree files, the folder receiving the results and a flag telling whether
    intermediate files are kept.

    The R script's standard output is redirected to
    '<tmp_dir>/<tree1>_vs_<tree2>_mantel_stats.txt'. If the two trees do not
    have the same leaves, both are first subset to their shared leaves with
    'BioSAK subset_tree'. Pairs sharing no leaf are reported and skipped.
    """
    import shlex  # local import: only used to quote the shell arguments below

    compare_trees_R, tree_file_1, tree_file_2, tmp_dir, keep_tmp_file = arg_list[:5]

    _, tree1_basename, tree1_extension = sep_path_basename_ext(tree_file_1)
    _, tree2_basename, tree2_extension = sep_path_basename_ext(tree_file_2)

    op_stats = '%s/%s_vs_%s_mantel_stats.txt' % (tmp_dir, tree1_basename, tree2_basename)

    # iterating over a tree yields its leaves
    tree1_leaf_list = [leaf.name for leaf in Tree(tree_file_1, format=1)]
    tree2_leaf_list = [leaf.name for leaf in Tree(tree_file_2, format=1)]
    shared_leaves = set(tree1_leaf_list).intersection(tree2_leaf_list)

    def run_mantel_test(tree_a, tree_b):
        # paths are quoted so that spaces or shell metacharacters cannot break the command
        compare_trees_cmd = 'Rscript %s -a %s -b %s > %s' % tuple(
            shlex.quote(i) for i in (compare_trees_R, tree_a, tree_b, op_stats))
        os.system(compare_trees_cmd)

    if len(shared_leaves) == 0:
        print('No leaves shared between %s and %s, calculation skipped!' % (tree1_basename, tree2_basename))

    elif len(tree1_leaf_list) == len(tree2_leaf_list) == len(shared_leaves):
        # identical leaf sets: the trees can be compared as they are
        run_mantel_test(tree_file_1, tree_file_2)

    else:
        print('Performing Mantel test based on %s leaves shared by %s (%s) and %s (%s)' % (len(shared_leaves), tree1_basename, len(tree1_leaf_list), tree2_basename, len(tree2_leaf_list)))

        # write out shared leaves
        shared_leaves_txt = '%s/%s_vs_%s_shared_leaves.txt' % (tmp_dir, tree1_basename, tree2_basename)
        with open(shared_leaves_txt, 'w') as shared_leaves_txt_handle:
            for each_shared_leaf in shared_leaves:
                shared_leaves_txt_handle.write(each_shared_leaf + '\n')

        # subset both trees to the shared leaves
        t1_subset = '%s/%s_vs_%s_%s_subset%s' % (tmp_dir, tree1_basename, tree2_basename, tree1_basename, tree1_extension)
        t2_subset = '%s/%s_vs_%s_%s_subset%s' % (tmp_dir, tree1_basename, tree2_basename, tree2_basename, tree2_extension)
        for tree_in, tree_out in ((tree_file_1, t1_subset), (tree_file_2, t2_subset)):
            subset_cmd = 'BioSAK subset_tree -tree %s -taxon %s -out %s -q' % (
                shlex.quote(tree_in), shlex.quote(shared_leaves_txt), shlex.quote(tree_out))
            os.system(subset_cmd)

        run_mantel_test(t1_subset, t2_subset)

        if keep_tmp_file is False:
            # a subset file may be missing if BioSAK failed, hence the existence check
            for each_tmp_file in (shared_leaves_txt, t1_subset, t2_subset):
                if os.path.isfile(each_tmp_file):
                    os.remove(each_tmp_file)
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def compare_trees(args):
    """Compare every query tree with every subject tree and tabulate the results.

    args is a dict with the keys:
        o   output folder
        t1  query tree file, or folder of tree files
        t2  subject tree file, or folder of tree files
        tx  extension of the tree files looked up inside a folder
        dm  also export the distance matrix (1 - similarity)
        t   number of worker processes
        tmp keep intermediate files
        f   overwrite an existing output folder

    Each pair is handed to compare_trees_worker through a process pool; a pair
    that was already scheduled in either order is not scheduled again. The
    statistics are then assembled by get_matrix into
    '<o>/Matrix_similarity.txt' and, if requested, '<o>/Matrix_distance.txt'.
    The program exits if the output folder exists and 'f' is not set.
    """
    import shutil  # local import: needed only to remove an existing output folder

    op_dir = args['o']
    tree_file_1 = args['t1']
    tree_file_2 = args['t2']
    tree_file_ext = args['tx']
    export_dm = args['dm']
    num_threads = args['t']
    keep_tmp = args['tmp']
    force_create_op_dir = args['f']

    # the R script is shipped next to this module
    current_file_path = os.path.dirname(os.path.realpath(__file__))
    compare_trees_R = '%s/compare_trees.R' % current_file_path
    tmp_dir = '%s/tmp' % op_dir

    # an input is either one tree file or a folder of trees; anything else gives no tree
    tree_lists = []
    for each_input in (tree_file_1, tree_file_2):
        if os.path.isfile(each_input):
            tree_lists.append([each_input])
        elif os.path.isdir(each_input):
            tree_lists.append(glob.glob('%s/*.%s' % (each_input, tree_file_ext)))
        else:
            tree_lists.append([])
    query_tree_list, subject_tree_list = tree_lists

    # prepare arg list for compare_trees_worker
    to_be_calculated_set = set()
    list_for_compare_trees_worker = []
    for each_query_tree in query_tree_list:
        for each_subject_tree in subject_tree_list:

            tree_1_vs_2 = '%s_vs_%s' % (each_query_tree, each_subject_tree)
            tree_2_vs_1 = '%s_vs_%s' % (each_subject_tree, each_query_tree)

            if tree_1_vs_2 not in to_be_calculated_set:
                list_for_compare_trees_worker.append([compare_trees_R, each_query_tree, each_subject_tree, tmp_dir, keep_tmp])
                # register both orders so the mirrored pair is not computed twice
                to_be_calculated_set.add(tree_1_vs_2)
                to_be_calculated_set.add(tree_2_vs_1)

    print('Total pairs of trees to compare: %s' % len(list_for_compare_trees_worker))

    # create op_dir
    if os.path.isdir(op_dir) is True:
        if force_create_op_dir is True:
            # no shell involved, so a path with spaces cannot delete anything else
            shutil.rmtree(op_dir)
        else:
            print('Output folder detected, program exited!')
            raise SystemExit
    os.makedirs(tmp_dir)  # creates op_dir (and missing parents) as well

    # compare trees with multiprocessing
    pool = mp.Pool(processes=num_threads)
    pool.map(compare_trees_worker, list_for_compare_trees_worker)
    pool.close()
    pool.join()

    # get matrix
    output_matrix_similarity = '%s/Matrix_similarity.txt' % op_dir
    output_matrix_distance = '%s/Matrix_distance.txt' % op_dir
    query_tree_list_basename = [sep_path_basename_ext(i)[1] for i in query_tree_list]
    subject_tree_list_basename = [sep_path_basename_ext(i)[1] for i in subject_tree_list]

    get_matrix(sorted(query_tree_list_basename), sorted(subject_tree_list_basename), tmp_dir, export_dm, output_matrix_similarity, output_matrix_distance)

    # final report
    if export_dm is True:
        print('Data matrix exported to: %s and %s' % (output_matrix_similarity, output_matrix_distance))
    else:
        print('Data matrix exported to: %s' % output_matrix_similarity)

    print('Done!')
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
if __name__ == '__main__':

    # stand-alone entry point: the parsed options are passed to compare_trees() as a dict
    compare_trees_parser = argparse.ArgumentParser(usage=compare_trees_usage)
    for option_flag, option_settings in (
            ('-o', dict(required=True, help='output directory')),
            ('-t1', dict(required=True, help='tree (folder) 1')),
            ('-t2', dict(required=True, help='tree (folder) 2')),
            ('-tx', dict(required=False, default='newick', help='extention of tree files, default: newick')),
            ('-dm', dict(required=False, action='store_true', help='export distance-alike matrix, obtained by subtract the similarity value from 1')),
            ('-t', dict(required=False, type=int, default=1, help='number of threads')),
            ('-tmp', dict(required=False, action='store_true', help='keep tmp files')),
            ('-f', dict(required=False, action='store_true', help='force overwrite'))):
        compare_trees_parser.add_argument(option_flag, **option_settings)

    args = vars(compare_trees_parser.parse_args())
    compare_trees(args)
|
TreeSAK/dating.py
ADDED
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import argparse
|
|
3
|
+
import itertools
|
|
4
|
+
from distutils.spawn import find_executable
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
dating_usage = '''
|
|
8
|
+
============================ dating example commands ============================
|
|
9
|
+
|
|
10
|
+
# Requirement: PAML
|
|
11
|
+
|
|
12
|
+
TreeSAK dating -i gnm.tree -m msa.phy -p topo1 -o dating_wd -f -s parameter.txt
|
|
13
|
+
|
|
14
|
+
# parameter.txt file format (tab separated)
|
|
15
|
+
clock 2,3
|
|
16
|
+
nsample 50000
|
|
17
|
+
|
|
18
|
+
# assess dating results
|
|
19
|
+
ESS of at least 200 is commonly recommended, although ESS higher than 100 is
|
|
20
|
+
also often seen in literature.
|
|
21
|
+
|
|
22
|
+
=================================================================================
|
|
23
|
+
'''
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def check_dependencies(program_list):
    """Exit the program if any of the needed executables is not on PATH.

    program_list: names of the executables to look up.

    Returns None when every program is found. Otherwise the missing names are
    printed and SystemExit is raised.
    """
    # shutil.which replaces distutils.spawn.find_executable: distutils was
    # removed from the standard library in Python 3.12, and which() also
    # checks that the file is actually executable
    import shutil

    not_detected_programs = [i for i in program_list if shutil.which(i) is None]

    if not_detected_programs:
        print('%s not found, program exited!' % ','.join(not_detected_programs))
        raise SystemExit
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def sep_path_basename_ext(file_in):
    """Break a file path into (file name, folder, basename, extension).

    The folder is given as '.' when ``file_in`` has no directory part. The
    extension is returned without its leading dot ('' if there is none).
    """
    folder, f_name = os.path.split(file_in)
    f_base, dotted_ext = os.path.splitext(f_name)

    # [1:] drops the dot that splitext leaves in front of the extension
    return f_name, ('.' if folder == '' else folder), f_base, dotted_ext[1:]
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def prep_mcmctree_ctl(ctl_para_dict, mcmctree_ctl_file):
    """Write an MCMCTree control file.

    ctl_para_dict:     settings to write. 'seqfile', 'treefile', 'mcmcfile',
                       'outfile', 'seqtype' and 'usedata' are required; every
                       other variable falls back to the default given below.
    mcmctree_ctl_file: path of the control file to create.

    Raises KeyError if a required setting is missing. All lines are built
    before the file is opened, so in that case no file is created or
    truncated.
    """
    ctl_lines = [
        ' seed = %s\n' % ctl_para_dict.get('seed', '-1'),
        ' seqfile = %s\n' % ctl_para_dict['seqfile'],
        ' treefile = %s\n' % ctl_para_dict['treefile'],
        ' mcmcfile = %s\n' % ctl_para_dict['mcmcfile'],
        ' outfile = %s\n' % ctl_para_dict['outfile'],
        ' ndata = %s\n' % ctl_para_dict.get('ndata', 1),
        ' seqtype = %s * 0: nucleotides; 1:codons; 2:AAs\n' % ctl_para_dict['seqtype'],
        ' usedata = %s * 0: no data; 1:seq like; 2:normal approximation; 3:out.BV (in.BV)\n' % ctl_para_dict['usedata'],
        ' clock = %s * 1: global clock; 2: independent rates; 3: correlated rates\n' % ctl_para_dict.get('clock', 2),
        ' RootAge = %s * safe constraint on root age, used if no fossil for root.\n' % ctl_para_dict.get('RootAge', '<1.0'),
        ' model = %s * 0:JC69, 1:K80, 2:F81, 3:F84, 4:HKY85\n' % ctl_para_dict.get('model', 0),
        ' alpha = %s * alpha for gamma rates at sites\n' % ctl_para_dict.get('alpha', 0.5),
        ' ncatG = %s * No. categories in discrete gamma\n' % ctl_para_dict.get('ncatG', 4),
        ' cleandata = %s * remove sites with ambiguity data (1:yes, 0:no)?\n' % ctl_para_dict.get('cleandata', 0),
        ' BDparas = %s * birth, death, sampling\n' % ctl_para_dict.get('BDparas', '1 1 0.1'),
        ' kappa_gamma = %s * gamma prior for kappa\n' % ctl_para_dict.get('kappa_gamma', '6 2'),
        ' alpha_gamma = %s * gamma prior for alpha\n' % ctl_para_dict.get('alpha_gamma', '1 1'),
        ' rgene_gamma = %s * gammaDir prior for rate for genes\n' % ctl_para_dict.get('rgene_gamma', '1 50 1'),
        ' sigma2_gamma = %s * gammaDir prior for sigma^2 (for clock=2 or 3)\n' % ctl_para_dict.get('sigma2_gamma', '1 10 1'),
        ' finetune = %s * auto (0 or 1): times, musigma2, rates, mixing, paras, FossilErr\n' % ctl_para_dict.get('finetune', '1: .1 .1 .1 .1 .1 .1'),
        ' print = %s * 0: no mcmc sample; 1: everything except branch rates 2: everything\n' % ctl_para_dict.get('print', 1),
        ' burnin = %s\n' % ctl_para_dict.get('burnin', 50000),
        ' sampfreq = %s\n' % ctl_para_dict.get('sampfreq', 50),
        ' nsample = %s\n' % ctl_para_dict.get('nsample', 10000),
    ]

    # the context manager closes the handle even if writing fails
    with open(mcmctree_ctl_file, 'w') as ctl_file_handle:
        ctl_file_handle.writelines(ctl_lines)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def get_parameter_combinations(para_to_test_dict):
    """Enumerate every combination of the settings to be tested.

    para_to_test_dict maps a parameter name to the list of settings to try.

    Returns a dict keyed by a label of the combination, built by joining
    '<name><setting>' of each parameter with '_' (spaces replaced by '_'),
    parameters and settings both taken in sorted order. Each value is a dict
    mapping the parameter names to the settings of that combination.
    """
    para_names = sorted(para_to_test_dict)
    settings_per_para = [sorted(para_to_test_dict[name]) for name in para_names]

    para_dod = dict()
    for chosen_settings in itertools.product(*settings_per_para):
        label_parts = [('%s%s' % (name, setting)).replace(' ', '_')
                       for name, setting in zip(para_names, chosen_settings)]
        para_dod['_'.join(label_parts)] = dict(zip(para_names, chosen_settings))

    return para_dod
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def dating(args):
    """Prepare a two-step MCMCTree dating analysis.

    args is a dict with the keys:
        i    input tree file
        m    sequence alignment
        o    output folder
        p    prefix of the mcmc/out files named in the control files
        st   sequence type written to the control files
        s    file listing the settings to compare, one parameter per line:
             name, separator, comma-separated settings
        srun wrap the step-two commands with 'BioSAK srun'
        f    overwrite an existing output folder

    Step one copies the inputs to '<o>/get_bv_wd', writes a control file with
    usedata = 3 and runs mcmctree in that folder. Step two creates, for every
    combination of settings, two run folders ('<combination>_run1' and
    '<combination>_run2') containing the inputs, a control file with
    usedata = 2 and the out.BV of step one copied as in.BV. The step-two
    commands are not executed; they are written to '<o>/dating_cmds.txt'.

    The program exits if mcmctree is not found, or if the output folder
    exists and 'f' is not set.
    """
    import shutil  # local import: used for folder removal and file copies

    tree_file = args['i']
    msa_file = args['m']
    op_dir = args['o']
    op_prefix = args['p']
    seq_type = args['st']
    settings_to_compare = args['s']
    wrap_with_srun = args['srun']
    force_overwrite = args['f']

    check_dependencies(['mcmctree'])

    # read in the settings to compare
    para_to_test_dict = dict()
    with open(settings_to_compare) as settings_handle:
        for each_para in settings_handle:
            each_para = each_para.strip()
            if each_para == '':
                continue  # tolerate blank lines
            if '\t' in each_para:
                # tab separated: a setting may itself contain spaces (e.g. '1 1 0.1')
                each_para_split = [i.strip() for i in each_para.split('\t') if i.strip() != '']
            else:
                each_para_split = each_para.split()
            if len(each_para_split) < 2:
                print('Ignored line without settings in %s: %s' % (settings_to_compare, each_para))
                continue
            para_to_test_dict[each_para_split[0]] = each_para_split[1].split(',')

    ####################################################################################################################

    current_pwd = os.getcwd()

    # only the file names are needed: the inputs are copied into every working folder
    tree_f_name = sep_path_basename_ext(tree_file)[0]
    msa_f_name = sep_path_basename_ext(msa_file)[0]

    get_bv_wd = '%s/get_bv_wd' % op_dir
    mcmctree_ctl_bv = '%s/mcmctree.ctl' % get_bv_wd
    get_BV_cmd_txt = '%s/get_BV_cmd.txt' % get_bv_wd
    dating_cmds_txt = '%s/dating_cmds.txt' % op_dir
    out_bv_file = '%s/out.BV' % get_bv_wd

    # create output folder
    if os.path.isdir(op_dir) is True:
        if force_overwrite is True:
            # no shell involved, so a path with spaces cannot delete anything else
            shutil.rmtree(op_dir)
        else:
            print('Output folder exist, program exited!')
            raise SystemExit

    ############################################# write out step 1 command #############################################

    # prepare files for getting bv file
    os.makedirs(get_bv_wd)  # creates op_dir as well
    shutil.copy(tree_file, get_bv_wd)
    shutil.copy(msa_file, get_bv_wd)

    get_bv_para_dict = {'seqfile': msa_f_name,
                        'treefile': tree_f_name,
                        'mcmcfile': 'mcmc.txt',
                        'outfile': 'out.txt',
                        'seqtype': seq_type,
                        'usedata': '3'}
    prep_mcmctree_ctl(get_bv_para_dict, mcmctree_ctl_bv)

    # write out get bv command
    with open(get_BV_cmd_txt, 'w') as get_BV_cmd_txt_handle:
        get_BV_cmd_txt_handle.write('mcmctree\n')

    # run command to get bv file
    print('Running step one command to get the BV file.')
    os.chdir(get_bv_wd)
    try:
        os.system('mcmctree > log.txt')
    finally:
        # always return to the starting folder: the paths used below may be relative
        os.chdir(current_pwd)
    print('Step one finished.')

    bv_file_found = os.path.isfile(out_bv_file)
    if bv_file_found is False:
        print('Warning: %s not found, in.BV will be missing from the run folders.' % out_bv_file)

    ############################################# write out step 2 command #############################################

    print('Preparing files for dating estimation')

    para_comb_dict = get_parameter_combinations(para_to_test_dict)
    print('para_comb_dict')
    print(para_comb_dict)

    with open(dating_cmds_txt, 'w') as dating_cmds_txt_handle:
        for para_comb in sorted(para_comb_dict):
            # two independent runs per combination of settings
            for run_id in ('run1', 'run2'):

                # create dir
                current_dating_wd = '%s/%s_%s' % (op_dir, para_comb, run_id)
                os.mkdir(current_dating_wd)

                # copy tree and msa file
                shutil.copy(tree_file, current_dating_wd)
                shutil.copy(msa_file, current_dating_wd)

                # prepare mcmctree.ctl file
                current_para_dict = para_comb_dict[para_comb].copy()
                current_para_dict['seqfile'] = msa_f_name
                current_para_dict['treefile'] = tree_f_name
                current_para_dict['mcmcfile'] = '%s_%s_%s_mcmc.txt' % (op_prefix, para_comb, run_id)
                current_para_dict['outfile'] = '%s_%s_%s_out.txt' % (op_prefix, para_comb, run_id)
                current_para_dict['seqtype'] = seq_type
                current_para_dict['usedata'] = '2'
                prep_mcmctree_ctl(current_para_dict, '%s/mcmctree.ctl' % current_dating_wd)

                # copy BV file generated in step one
                if bv_file_found is True:
                    shutil.copy(out_bv_file, '%s/in.BV' % current_dating_wd)

                # write out command; abspath keeps it valid for relative and absolute output folders
                cmd_run = 'cd %s; mcmctree' % os.path.abspath(current_dating_wd)
                if wrap_with_srun is True:
                    cmd_run = 'BioSAK srun -c "%s"' % cmd_run
                dating_cmds_txt_handle.write(cmd_run + '\n')

    print('Job script for performing dating exported to: %s' % dating_cmds_txt)
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
if __name__ == '__main__':

    # stand-alone entry point: the parsed options are passed to dating() as a dict.
    # dating_usage is given to the parser so that the example commands and the
    # parameter file format are shown in the help and error output.
    dating_parser = argparse.ArgumentParser(usage=dating_usage)
    dating_parser.add_argument('-i', required=True, help='input tree file')
    dating_parser.add_argument('-m', required=True, help='sequence alignments')
    dating_parser.add_argument('-o', required=True, help='output directory')
    dating_parser.add_argument('-p', required=True, help='output prefix')
    dating_parser.add_argument('-s', required=True, help='settings to compare')
    dating_parser.add_argument('-st', required=False, default='2', help='sequence type, 0 for nucleotides, 1 for codons, 2 for AAs, default: 2')
    dating_parser.add_argument('-srun', required=False, action="store_true", help='wrap commands with BioSAK srun')
    dating_parser.add_argument('-f', required=False, action="store_true", help='force overwrite')
    args = vars(dating_parser.parse_args())
    dating(args)
|
|
264
|
+
|